feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/utils/body_action_decoder.py
+++ b/scripts/utils/body_action_decoder.py
@@ -0,0 +1,877 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Body Action Decoder - Extended pose action analysis with body keypoints
+
+Purpose:
+1. Decode face pose actions (existing)
+2. Decode body actions (future MediaPipe Holistic)
+3. Integrate face + body actions for comprehensive analysis
+
+Body Keypoints (MediaPipe Holistic):
+- Face: 468 points (eyes, mouth, nose, etc.)
+- Pose: 33 points (shoulders, elbows, hands, hips, knees, feet)
+- Hands: 21 points per hand
+
+Action Types:
+- Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head
+- Eyes: blink, close, wide_open, look_left, look_right
+- Mouth: open, close, smile, talk, yawn
+- Arms: raise_left, raise_right, cross_arms, wave
+- Hands: point, grab, clap, thumbs_up, fist
+- Legs: stand, sit, walk, run, jump, kick
+- Feet: tap, stomp, cross
+
+Architecture:
+┌─────────────────────────────────────────────────────────────────┐
+│                     Body Action Decoder                         │
+├─────────────────────────────────────────────────────────────────┤
+│                                                                 │
+│  ┌───────────────┐  ┌───────────────┐  ┌───────────────┐       │
+│  │ Face Actions  │  │ Body Actions  │  │ Hand Actions  │       │
+│  │ (InsightFace) │  │ (MediaPipe)   │  │ (MediaPipe)   │       │
+│  └───────────────┘  └───────────────┘  └───────────────┘       │
+│         │                  │                  │                │
+│         └──────────────────┼──────────────────┘                │
+│                            │                                    │
+│                    ┌───────▼───────┐                            │
+│                    │  Action Merger│                            │
+│                    └────────────────┘                            │
+│                            │                                    │
+│                    ┌───────▼───────┐                            │
+│                    │ Action Timeline│                            │
+│                    └────────────────┘                            │
+│                                                                 │
+└─────────────────────────────────────────────────────────────────┘
+"""
+
+import sys
+import json
+import argparse
+import numpy as np
+from typing import Dict, List, Optional
+from collections import defaultdict
+
+
+# =============================================================================
+# Face Action Definitions (Existing from pose_action_decoder.py)
+# =============================================================================
+
+FACE_TURN_ACTIONS = {  # (previous angle label, current angle label) -> turn action name
+    ("frontal", "three_quarter"): "turn_partial",
+    ("frontal", "profile_left"): "turn_left",
+    ("frontal", "profile_right"): "turn_right",
+    ("three_quarter", "frontal"): "return_frontal",
+    ("three_quarter", "profile_left"): "turn_left",
+    ("three_quarter", "profile_right"): "turn_right",
+    ("profile_left", "frontal"): "turn_to_frontal",
+    ("profile_left", "three_quarter"): "turn_to_three_quarter",
+    ("profile_left", "profile_right"): "turn_full",  # one profile straight to the other
+    ("profile_right", "frontal"): "turn_to_frontal",
+    ("profile_right", "three_quarter"): "turn_to_three_quarter",
+    ("profile_right", "profile_left"): "turn_full",
+}  # pairs not listed (e.g. an unchanged label) yield no face action in decode_body_actions
+
+FACE_PITCH_ACTIONS = {  # (previous pitch label, current pitch label) -> pitch action name
+    ("neutral", "tilted_up"): "look_up",
+    ("neutral", "tilted_down"): "look_down",
+    ("tilted_up", "neutral"): "return_neutral",
+    ("tilted_down", "neutral"): "return_neutral",
+}  # direct tilted_up <-> tilted_down changes are not listed, so they yield no action
+
+
+# =============================================================================
+# Eye Action Definitions
+# =============================================================================
+
+EYE_ACTIONS = {  # eye action name -> descriptive metadata; not read by any function in this file
+    "blink": {
+        "description": "眨眼",  # "blink"
+        "pattern": "eye_aspect_ratio drops < 0.2 for 1-3 frames",
+        "min_frames": 1,  # presumably duration bounds in frames - TODO confirm with the consumer
+        "max_frames": 3,
+    },
+    "close": {
+        "description": "闭眼",  # "eyes closed"
+        "pattern": "eye_aspect_ratio < 0.15 for > 10 frames",
+        "min_frames": 10,
+    },
+    "wide_open": {
+        "description": "睁大眼",  # "eyes wide open"
+        "pattern": "eye_aspect_ratio > 0.4",
+    },
+    "look_left": {
+        "description": "向左看",  # "look left"
+        "pattern": "iris_position_x < 0.3",
+    },
+    "look_right": {
+        "description": "向右看",  # "look right"
+        "pattern": "iris_position_x > 0.7",
+    },
+    "squint": {
+        "description": "眯眼",  # "squint"
+        "pattern": "eye_aspect_ratio 0.15-0.25",
+    },
+}
+
+
+# =============================================================================
+# Mouth Action Definitions
+# =============================================================================
+
+MOUTH_ACTIONS = {  # mouth action name -> descriptive metadata; not read by any function in this file
+    "open": {
+        "description": "张嘴",  # "open mouth"
+        "pattern": "mouth_aspect_ratio > 0.5",
+    },
+    "close": {
+        "description": "闭嘴",  # "close mouth"
+        "pattern": "mouth_aspect_ratio < 0.2",
+    },
+    "smile": {
+        "description": "微笑",  # "smile"
+        "pattern": "mouth_corner_distance > threshold",
+    },
+    "talk": {
+        "description": "说话",  # "talk"
+        "pattern": "mouth_aspect_ratio oscillating 0.3-0.6",
+        "min_frames": 10,  # presumably a duration bound in frames - TODO confirm with the consumer
+    },
+    "yawn": {
+        "description": "打哈欠",  # "yawn"
+        "pattern": "mouth_aspect_ratio > 0.7 for > 20 frames",
+        "min_frames": 20,  # NOTE(review): analyze_mouth_actions reports "yawn" from a single frame
+    },
+    "pout": {
+        "description": "嘟嘴",  # "pout"
+        "pattern": "lip_distance > threshold",
+    },
+}  # analyze_mouth_actions emits only "yawn", "open", "close" and "smile"
+
+
+# =============================================================================
+# Arm Action Definitions
+# =============================================================================
+
+ARM_ACTIONS = {  # arm action name -> descriptive metadata; not read by any function in this file
+    "raise_left": {
+        "description": "举起左手",  # "raise left hand"
+        "pattern": "left_shoulder_y > left_elbow_y > left_wrist_y",
+    },
+    "raise_right": {
+        "description": "举起右手",  # "raise right hand"
+        "pattern": "right_shoulder_y > right_elbow_y > right_wrist_y",
+    },
+    "raise_both": {  # NOTE(review): never emitted by analyze_arm_actions
+        "description": "双手举起",  # "raise both hands"
+        "pattern": "both arms raised",
+    },
+    "cross_arms": {
+        "description": "双手交叉",  # "arms crossed"
+        "pattern": "left_wrist_x > right_wrist_x AND right_wrist_x < left_wrist_x",  # NOTE(review): both clauses are the same inequality; analyze_arm_actions compares right_wrist_x with left_shoulder_x
+    },
+    "wave": {  # NOTE(review): never emitted by analyze_arm_actions
+        "description": "挥手",  # "wave"
+        "pattern": "wrist_y oscillating ±20px for 5-15 frames",
+        "min_frames": 5,  # presumably duration bounds in frames - TODO confirm with the consumer
+        "max_frames": 15,
+    },
+    "extend_left": {
+        "description": "伸展左臂",  # "extend left arm"
+        "pattern": "left_elbow_angle > 150°",
+    },
+    "extend_right": {
+        "description": "伸展右臂",  # "extend right arm"
+        "pattern": "right_elbow_angle > 150°",
+    },
+    "fold_left": {
+        "description": "弯曲左臂",  # "bend left arm"
+        "pattern": "left_elbow_angle < 90°",
+    },
+    "fold_right": {
+        "description": "弯曲右臂",  # "bend right arm"
+        "pattern": "right_elbow_angle < 90°",
+    },
+    "point": {  # analyze_hand_actions emits "point_<hand_type>"; nothing emits plain "point"
+        "description": "指向",  # "point at"
+        "pattern": "index_finger extended, other fingers folded",
+    },
+}
+
+
+# =============================================================================
+# Hand Action Definitions
+# =============================================================================
+
+HAND_ACTIONS = {  # hand action name -> descriptive metadata; not read by any function in this file
+    "grab": {
+        "description": "抓取",  # "grab"
+        "pattern": "fingers folded, thumb opposing",
+    },
+    "open": {
+        "description": "张开手",  # "open hand"
+        "pattern": "all fingers extended",
+    },
+    "clap": {
+        "description": "拍手",  # "clap"
+        "pattern": "hands together then apart (velocity pattern)",
+        "min_frames": 3,  # presumably duration bounds in frames - TODO confirm with the consumer
+        "max_frames": 10,
+    },
+    "thumbs_up": {
+        "description": "点赞",  # "thumbs up"
+        "pattern": "thumb extended upward, other fingers folded",
+    },
+    "fist": {
+        "description": "握拳",  # "make a fist"
+        "pattern": "all fingers folded into palm",
+    },
+    "peace": {
+        "description": "剪刀手",  # "peace sign"
+        "pattern": "index and middle fingers extended",
+    },
+    "ok": {
+        "description": "OK 手势",  # "OK gesture"
+        "pattern": "thumb and index finger touching",
+    },
+    "touch_face": {
+        "description": "摸脸",  # "touch face"
+        "pattern": "hand near face region",
+    },
+    "touch_hair": {
+        "description": "摸头发",  # "touch hair"
+        "pattern": "hand above head region",
+    },
+    "pocket_left": {
+        "description": "左手插兜",  # "left hand in pocket"
+        "pattern": "left_hand in hip region",
+    },
+    "pocket_right": {
+        "description": "右手插兜",  # "right hand in pocket"
+        "pattern": "right_hand in hip region",
+    },
+}  # analyze_hand_actions emits only open/fist/thumbs_up/peace/point, each suffixed with "_<hand_type>"
+
+
+# =============================================================================
+# Leg Action Definitions
+# =============================================================================
+
+LEG_ACTIONS = {  # leg action name -> descriptive metadata; not read by any function in this file
+    "stand": {
+        "description": "站立",  # "standing"
+        "pattern": "hip_y < knee_y < ankle_y, vertical alignment",
+    },
+    "sit": {
+        "description": "坐姿",  # "sitting posture"
+        "pattern": "hip_y ≈ knee_y, thigh horizontal",
+    },
+    "walk": {
+        "description": "行走",  # "walking"
+        "pattern": "hip-knee-ankle oscillating, stride pattern",
+        "min_frames": 10,  # presumably a duration bound in frames - TODO confirm with the consumer
+    },
+    "run": {
+        "description": "奔跑",  # "running"
+        "pattern": "fast oscillating, knee_bend > 60°",
+        "min_frames": 10,
+    },
+    "jump": {
+        "description": "跳跃",  # "jumping"
+        "pattern": "all keypoints moving upward then landing",
+        "min_frames": 5,
+        "max_frames": 20,
+    },
+    "kick": {
+        "description": "踢腿",  # "kick"
+        "pattern": "one leg extended forward rapidly",
+        "min_frames": 3,
+        "max_frames": 15,
+    },
+    "cross_left": {
+        "description": "左腿交叉",  # "left leg crossed"
+        "pattern": "left_ankle_x > right_ankle_x",
+    },
+    "cross_right": {
+        "description": "右腿交叉",  # "right leg crossed"
+        "pattern": "right_ankle_x > left_ankle_x",
+    },
+    "knee_bend": {
+        "description": "弯膝",  # "bend knee"
+        "pattern": "knee_angle < 120°",
+    },
+}  # analyze_leg_actions emits only "knee_bend_<side>", "stand_<side>" and "sit"
+
+
+# =============================================================================
+# Feet Action Definitions
+# =============================================================================
+
+FEET_ACTIONS = {  # feet action name -> descriptive metadata; not read by any function in this file
+    "tap": {
+        "description": "轻踏",  # "light tap"
+        "pattern": "ankle_y oscillating ±10px",
+        "min_frames": 3,  # presumably duration bounds in frames - TODO confirm with the consumer
+        "max_frames": 15,
+    },
+    "stomp": {
+        "description": "重踏",  # "stomp"
+        "pattern": "ankle_y large downward movement",
+        "min_frames": 3,
+    },
+    "cross": {
+        "description": "交叉脚",  # "crossed feet"
+        "pattern": "feet_x overlapping",
+    },
+    "point_left": {
+        "description": "左脚前伸",  # "left foot extended forward"
+        "pattern": "left_ankle_y < right_ankle_y",
+    },
+    "point_right": {
+        "description": "右脚前伸",  # "right foot extended forward"
+        "pattern": "right_ankle_y < left_ankle_y",
+    },
+}  # no function in this file detects feet actions; decode_body_actions leaves its "feet" list empty
+
+
+# =============================================================================
+# Combined Actions (Face + Body)
+# =============================================================================
+
+COMBINED_ACTIONS = {  # combined action name -> definition; decode_body_actions reports one when every "components" name was detected
+    "thinking": {
+        "description": "思考姿势",  # "thinking pose"
+        "components": ["touch_face", "look_down"],  # NOTE(review): "touch_face" is not emitted by any analyzer in this file
+        "pattern": "hand near chin + head tilted down",
+    },
+    "listening": {
+        "description": "倾听姿势",  # "listening pose"
+        "components": ["turn_partial", "open_mouth"],  # NOTE(review): analyze_mouth_actions emits "open", not "open_mouth"
+        "pattern": "slight turn + mouth slightly open",
+    },
+    "nodding_agreement": {
+        "description": "点头同意",  # "nod in agreement"
+        "components": ["nod_head", "smile"],  # NOTE(review): "nod_head" is not emitted by any analyzer in this file
+        "pattern": "head nod + smile",
+    },
+    "shaking_disagreement": {
+        "description": "摇头不同意",  # "shake head in disagreement"
+        "components": ["shake_head", "frown"],  # NOTE(review): neither name is emitted by any analyzer in this file
+        "pattern": "shake head + frown",
+    },
+    "waving_greeting": {
+        "description": "挥手打招呼",  # "wave hello"
+        "components": ["wave", "smile"],  # NOTE(review): "wave" is not emitted by any analyzer in this file
+        "pattern": "wave hand + smile",
+    },
+    "crossing_arms_defensive": {
+        "description": "双手交叉防御",  # "defensive crossed arms"
+        "components": ["cross_arms", "frontal_stable"],  # NOTE(review): "frontal_stable" is not emitted by any analyzer in this file
+        "pattern": "cross arms + frontal pose",
+    },
+    "pointing_explaining": {
+        "description": "指向解释",  # "pointing while explaining"
+        "components": ["point", "turn_partial"],  # NOTE(review): analyze_hand_actions emits "point_<hand_type>", not "point"
+        "pattern": "pointing + slight turn",
+    },
+    "stretching": {
+        "description": "伸展",  # "stretch"
+        "components": ["raise_both", "look_up"],  # NOTE(review): "raise_both" is not emitted by any analyzer in this file
+        "pattern": "raise arms + look up",
+    },
+    "sitting_relaxed": {
+        "description": "放松坐姿",  # "relaxed sitting"
+        "components": ["sit", "cross_arms"],  # both names are emitted (analyze_leg_actions / analyze_arm_actions)
+        "pattern": "sit + cross arms",
+    },
+}
+
+
+# =============================================================================
+# Analysis Functions
+# =============================================================================
+
+def analyze_eye_actions(eye_landmarks: List, prev_eye_landmarks: Optional[List] = None) -> List[Dict]:
+    """
+    Detect static eye actions for one frame from the eye aspect ratio (EAR).
+
+    Only the first six landmarks are read, and the result is reported as the
+    left eye. EAR = (|p2-p6| + |p3-p5|) / (2 * |p1-p4|), i.e. the two
+    p2-p6 / p3-p5 distances averaged and divided by the p1-p4 distance.
+
+    Args:
+        eye_landmarks: Sequence of points; entries 0-5 are used as p1..p6.
+        prev_eye_landmarks: Landmarks of the previous frame. Kept for API
+            compatibility; the current implementation does not read it.
+
+    Returns:
+        A list with at most one dict ("action", "description", "confidence"):
+        "close_left" when EAR < 0.15, "wide_open_left" when EAR > 0.4. Empty
+        when fewer than six points are given or when p1 and p4 coincide, in
+        which case the EAR is undefined.
+    """
+    if not eye_landmarks or len(eye_landmarks) < 6:
+        return []
+
+    p1, p2, p3, p4, p5, p6 = (np.asarray(pt, dtype=float) for pt in eye_landmarks[:6])
+    eye_width = np.linalg.norm(p1 - p4)
+    if eye_width <= 0:
+        # A zero-width eye has no defined EAR. Falling back to EAR = 0 would
+        # report degenerate landmarks as a fully confident closed eye.
+        return []
+
+    # Cast to a plain float so the confidence is a builtin number in the output.
+    left_ear = float((np.linalg.norm(p2 - p6) + np.linalg.norm(p3 - p5)) / (2 * eye_width))
+
+    if left_ear < 0.15:
+        return [{"action": "close_left", "description": "闭左眼", "confidence": 1.0 - left_ear}]
+    if left_ear > 0.4:
+        return [{"action": "wide_open_left", "description": "睁大左眼", "confidence": left_ear}]
+    return []
+
+
+def analyze_mouth_actions(mouth_landmarks: List) -> List[Dict]:
+    """
+    Detect a static mouth action for one frame from four mouth landmarks.
+
+    The mouth aspect ratio (MAR) is the upper-to-lower lip distance divided by
+    the corner-to-corner width. MAR > 0.7 is reported as "yawn", > 0.5 as
+    "open" and < 0.2 as "close"; in between, "smile" is reported when the
+    summed vertical offset of both corners from the upper lip exceeds 10
+    (in landmark coordinate units).
+
+    Args:
+        mouth_landmarks: Point sequence read as [upper_lip, lower_lip,
+            left_corner, right_corner]; extra entries are ignored.
+
+    Returns:
+        A list with at most one action dict. Empty when fewer than four points
+        are given, when the mouth width is zero (MAR undefined), or when no
+        rule matches. Measurements are plain floats so the result can be
+        passed to json.dump even when the landmarks are integers.
+    """
+    if not mouth_landmarks or len(mouth_landmarks) < 4:
+        return []
+
+    upper_lip, lower_lip, left_corner, right_corner = (
+        np.asarray(pt, dtype=float) for pt in mouth_landmarks[:4]
+    )
+    mouth_width = np.linalg.norm(left_corner - right_corner)
+    if mouth_width <= 0:
+        return []  # degenerate landmarks would otherwise be reported as "close"
+
+    mar = float(np.linalg.norm(upper_lip - lower_lip) / mouth_width)
+    if mar > 0.7:
+        return [{"action": "yawn", "description": "打哈欠", "mar": mar}]
+    if mar > 0.5:
+        return [{"action": "open", "description": "张嘴", "mar": mar}]
+    if mar < 0.2:
+        return [{"action": "close", "description": "闭嘴", "mar": mar}]
+
+    corner_distance = float(abs(left_corner[1] - upper_lip[1]) + abs(right_corner[1] - upper_lip[1]))
+    if corner_distance > 10:  # absolute threshold, not normalised by mouth size
+        return [{"action": "smile", "description": "微笑", "corner_distance": corner_distance}]
+    return []
+
+
+def _elbow_angle_deg(shoulder, elbow, wrist) -> Optional[float]:
+    """Interior elbow angle in degrees (180 = straight arm); None if a segment has zero length."""
+    upper_arm = np.asarray(shoulder, dtype=float) - np.asarray(elbow, dtype=float)
+    forearm = np.asarray(wrist, dtype=float) - np.asarray(elbow, dtype=float)
+    scale = np.linalg.norm(upper_arm) * np.linalg.norm(forearm)
+    if scale == 0:
+        return None
+    # Clip: rounding can push the cosine of collinear segments just outside [-1, 1].
+    cosine = np.clip(np.dot(upper_arm, forearm) / scale, -1.0, 1.0)
+    return float(np.degrees(np.arccos(cosine)))
+
+
+def analyze_arm_actions(pose_keypoints: Dict) -> List[Dict]:
+    """
+    Detect static arm actions for one frame from named pose keypoints.
+
+    Points are read as [x, y, ...] with y growing downward, so an arm counts
+    as raised when wrist_y < elbow_y < shoulder_y. The elbow angle is measured
+    at the joint, between the elbow->shoulder and elbow->wrist directions, so
+    a straight arm is 180 degrees: above 150 is reported as "extend_<side>",
+    below 90 as "fold_<side>". (Measuring between shoulder->elbow and
+    elbow->wrist instead yields the supplementary angle and swaps the two.)
+
+    "cross_arms" is reported when left_wrist_x > right_wrist_x and
+    right_wrist_x < left_shoulder_x.
+
+    Args:
+        pose_keypoints: Mapping that may hold "left_shoulder", "left_elbow",
+            "left_wrist" and the matching "right_*" entries. A side is
+            analysed only when all three of its points are present.
+
+    Returns:
+        Action dicts ordered left arm, right arm, then "cross_arms"; empty
+        when no rule matches. Raise, extend and fold entries carry the elbow
+        angle in degrees as "angle".
+    """
+    actions = []
+
+    for side, side_zh in (("left", "左"), ("right", "右")):
+        shoulder = pose_keypoints.get(f"{side}_shoulder")
+        elbow = pose_keypoints.get(f"{side}_elbow")
+        wrist = pose_keypoints.get(f"{side}_wrist")
+        if not (shoulder and elbow and wrist):
+            continue
+
+        angle = _elbow_angle_deg(shoulder, elbow, wrist)
+        if angle is None:
+            # Coincident joints give no direction to measure; none of the
+            # rules below can hold for such a side, so skip it.
+            continue
+
+        if wrist[1] < elbow[1] < shoulder[1]:  # raised (y decreases upward)
+            actions.append({"action": f"raise_{side}", "description": f"举起{side_zh}手", "angle": angle})
+
+        if angle > 150:
+            actions.append({"action": f"extend_{side}", "description": f"伸展{side_zh}臂", "angle": angle})
+        elif angle < 90:
+            actions.append({"action": f"fold_{side}", "description": f"弯曲{side_zh}臂", "angle": angle})
+
+    # Cross-arms check. The left shoulder is required as well: the rule reads
+    # its x coordinate, and wrists can be present without it.
+    left_shoulder = pose_keypoints.get("left_shoulder")
+    left_wrist = pose_keypoints.get("left_wrist")
+    right_wrist = pose_keypoints.get("right_wrist")
+    if left_shoulder and left_wrist and right_wrist:
+        # NOTE(review): this ordering test depends on whether the frame is
+        # mirrored; confirm against the pose producer before relying on it.
+        if left_wrist[0] > right_wrist[0] and right_wrist[0] < left_shoulder[0]:
+            actions.append({"action": "cross_arms", "description": "双手交叉"})
+
+    return actions
+
+
+def analyze_hand_actions(hand_keypoints: List, hand_type: str = "right") -> List[Dict]:
+    """
+    Classify the pose of one hand for a single frame.
+
+    Every finger gets an "extension": the distance between its tip and a
+    reference joint of the same finger. The rules compare those distances
+    with fixed thresholds, expressed in the units of the keypoint coordinates
+    (all comparisons are strict):
+
+        open_<hand_type>:      mean extension > 50
+        fist_<hand_type>:      mean extension < 30
+        thumbs_up_<hand_type>: thumb > 40 and mean of the other four < 30
+        peace_<hand_type>:     index > 40, middle > 40, mean of ring/pinky < 30
+        point_<hand_type>:     index > 40 and mean of the other four < 30
+
+    Args:
+        hand_keypoints: Sequence of at least 21 points. The indices used here
+            follow the MediaPipe Hand numbering: 0 wrist, 1-4 thumb (CMC, MCP,
+            IP, TIP), 5-8 index (MCP, PIP, DIP, TIP), 9-12 middle, 13-16 ring,
+            17-20 pinky.
+        hand_type: Label embedded verbatim in action names and descriptions,
+            e.g. "left" or "right".
+
+    Returns:
+        Action dicts with "action" and "description". "open_*" and "fist_*"
+        exclude each other; the remaining three rules are tested on their
+        own, so several entries can be returned for one hand. Empty when
+        fewer than 21 keypoints are supplied.
+    """
+    detected = []
+
+    if not hand_keypoints or len(hand_keypoints) < 21:
+        return detected
+
+    # Local helpers keep the rules below short.
+    def _point(idx):
+        return np.array(hand_keypoints[idx])
+
+    def _report(name, description):
+        detected.append({"action": name, "description": description})
+
+    # (tip, reference joint) per finger, thumb first. The thumb is measured
+    # from joint 2 (its MCP in the numbering above), the other fingers from
+    # their MCP joints 5/9/13/17.
+    tip_and_base = ((4, 2), (8, 5), (12, 9), (16, 13), (20, 17))
+    extensions = [np.linalg.norm(_point(tip) - _point(base)) for tip, base in tip_and_base]
+    thumb, index, middle, ring, pinky = extensions
+
+    # Whole-hand openness from the mean extension.
+    mean_extension = np.mean(extensions)
+    if mean_extension > 50:
+        _report(f"open_{hand_type}", f"张开{hand_type}手")
+    elif mean_extension < 30:
+        _report(f"fist_{hand_type}", f"握{hand_type}拳")
+
+    # Thumbs up: thumb out, the other four fingers folded on average.
+    if thumb > 40 and np.mean([index, middle, ring, pinky]) < 30:
+        _report(f"thumbs_up_{hand_type}", f"{hand_type}手点赞")
+
+    # Peace sign: index and middle out, ring and pinky folded on average.
+    if index > 40 and middle > 40 and np.mean([ring, pinky]) < 30:
+        _report(f"peace_{hand_type}", f"{hand_type}手剪刀手")
+
+    # Pointing: index out, the thumb and remaining fingers folded on average.
+    if index > 40 and np.mean([thumb, middle, ring, pinky]) < 30:
+        _report(f"point_{hand_type}", f"{hand_type}手指向")
+
+    return detected
+
+
+def _knee_angle_deg(hip, knee, ankle) -> Optional[float]:
+    """Interior knee angle in degrees (180 = straight leg); None if a segment has zero length."""
+    thigh = np.asarray(hip, dtype=float) - np.asarray(knee, dtype=float)
+    shank = np.asarray(ankle, dtype=float) - np.asarray(knee, dtype=float)
+    scale = np.linalg.norm(thigh) * np.linalg.norm(shank)
+    if scale == 0:
+        return None
+    # Clip: rounding can push the cosine of collinear segments just outside [-1, 1].
+    cosine = np.clip(np.dot(thigh, shank) / scale, -1.0, 1.0)
+    return float(np.degrees(np.arccos(cosine)))
+
+
+def analyze_leg_actions(pose_keypoints: Dict) -> List[Dict]:
+    """
+    Detect static leg actions for one frame from named pose keypoints.
+
+    Points are read as [x, y, ...] with y growing downward. Per leg:
+
+    * "knee_bend_<side>" when the knee angle is below 120 degrees. The angle
+      is measured at the joint, between the knee->hip and knee->ankle
+      directions, so a straight leg is 180 degrees. (Measuring between
+      hip->knee and knee->ankle instead yields the supplementary angle and
+      flags straight legs as bent.)
+    * "stand_<side>" when hip_y < knee_y < ankle_y.
+
+    "sit" is reported once when the mean hip height and the mean knee height
+    of both legs differ by less than 30 coordinate units.
+
+    Args:
+        pose_keypoints: Mapping that may hold "left_hip", "left_knee",
+            "left_ankle" and the matching "right_*" entries. A leg is
+            analysed only when all three of its points are present.
+
+    Returns:
+        Action dicts ordered left leg, right leg, then "sit"; empty when no
+        rule matches. Knee-bend entries carry the knee angle in degrees as
+        "angle".
+    """
+    actions = []
+
+    # Both legs follow the same rules; side_zh is the Chinese word for the side
+    # used in the user-facing descriptions.
+    for side, side_zh in (("left", "左"), ("right", "右")):
+        hip = pose_keypoints.get(f"{side}_hip")
+        knee = pose_keypoints.get(f"{side}_knee")
+        ankle = pose_keypoints.get(f"{side}_ankle")
+        if not (hip and knee and ankle):
+            continue
+
+        angle = _knee_angle_deg(hip, knee, ankle)
+        if angle is None:
+            # Coincident joints: neither rule below can hold, so skip the leg.
+            continue
+
+        if angle < 120:
+            actions.append({"action": f"knee_bend_{side}", "description": f"弯{side_zh}膝", "angle": angle})
+
+        if hip[1] < knee[1] < ankle[1]:  # hip above knee above ankle
+            actions.append({"action": f"stand_{side}", "description": f"{side_zh}腿站立"})
+
+    # Sit detection needs hips and knees of both legs (ankles are not used).
+    left_hip, right_hip = pose_keypoints.get("left_hip"), pose_keypoints.get("right_hip")
+    left_knee, right_knee = pose_keypoints.get("left_knee"), pose_keypoints.get("right_knee")
+    if left_hip and left_knee and right_hip and right_knee:
+        hip_avg_y = (left_hip[1] + right_hip[1]) / 2
+        knee_avg_y = (left_knee[1] + right_knee[1]) / 2
+        if abs(hip_avg_y - knee_avg_y) < 30:  # hips and knees at similar height
+            actions.append({"action": "sit", "description": "坐姿"})
+
+    return actions
+
+
+# =============================================================================
+# Main Decoder Function
+# =============================================================================
+
+def decode_body_actions(
+    pose_data: Optional[Dict],
+    face_data: Optional[Dict] = None,
+    hand_data: Optional[Dict] = None,
+) -> Dict:
+    """
+    Decode all body actions for one frame from the available data sources.
+
+    Args:
+        pose_data: Pose estimation data; its "keypoints" mapping feeds the arm
+            and leg analyzers. May be None or empty.
+        face_data: Face data. Reads "pose_angle" / "prev_pose_angle" (dicts
+            with "angle" and "pitch" labels), "eye_landmarks",
+            "prev_eye_landmarks" and "mouth_landmarks". May be None.
+        hand_data: Hand tracking data with optional "left_hand" and
+            "right_hand" keypoint lists. May be None.
+
+    Returns:
+        Dict with the keys "face", "eyes", "mouth", "arms", "hands", "legs",
+        "feet" and "combined", each holding a list of action dicts. "feet"
+        stays empty because this function calls no feet analyzer.
+    """
+    all_actions = {
+        "face": [],
+        "eyes": [],
+        "mouth": [],
+        "arms": [],
+        "hands": [],
+        "legs": [],
+        "feet": [],
+        "combined": [],
+    }
+
+    if face_data:
+        # A key that is present but null (JSON null) must behave like a missing
+        # key; otherwise the .get() calls below would fail on None.
+        pose_angle = face_data.get("pose_angle") or {}
+        prev_pose_angle = face_data.get("prev_pose_angle") or {}
+
+        # 1. Face turn: needs both the current and the previous pose.
+        if pose_angle and prev_pose_angle:
+            angle = pose_angle.get("angle", "unknown")
+            prev_angle = prev_pose_angle.get("angle", "unknown")
+            turn_action = FACE_TURN_ACTIONS.get((prev_angle, angle))
+            if turn_action is not None:
+                all_actions["face"].append({
+                    "action": turn_action,
+                    "description": f"Face: {prev_angle} → {angle}",
+                })
+
+        # Pitch: a label missing on either side is treated as "neutral".
+        pitch = pose_angle.get("pitch", "neutral")
+        prev_pitch = prev_pose_angle.get("pitch", "neutral")
+        pitch_action = FACE_PITCH_ACTIONS.get((prev_pitch, pitch))
+        if pitch_action is not None:
+            all_actions["face"].append({
+                "action": pitch_action,
+                "description": f"Pitch: {prev_pitch} → {pitch}",
+            })
+
+        # 2. Eye actions (only when eye landmarks are available).
+        if face_data.get("eye_landmarks"):
+            all_actions["eyes"] = analyze_eye_actions(
+                face_data["eye_landmarks"],
+                face_data.get("prev_eye_landmarks"),
+            )
+
+        # 3. Mouth actions (only when mouth landmarks are available).
+        if face_data.get("mouth_landmarks"):
+            all_actions["mouth"] = analyze_mouth_actions(face_data["mouth_landmarks"])
+
+    # 4. Arm and leg actions both read the pose keypoints.
+    keypoints = pose_data.get("keypoints") if pose_data else None
+    if keypoints:
+        all_actions["arms"] = analyze_arm_actions(keypoints)
+        all_actions["legs"] = analyze_leg_actions(keypoints)
+
+    # 5. Hand actions, left hand first.
+    if hand_data:
+        for hand_type in ("left", "right"):
+            hand_keypoints = hand_data.get(f"{hand_type}_hand")
+            if hand_keypoints:
+                all_actions["hands"].extend(analyze_hand_actions(hand_keypoints, hand_type))
+
+    # 6. Combined actions: every component must be among the detected names.
+    detected = {entry["action"] for entries in all_actions.values() for entry in entries}
+    for combined_name, combined_def in COMBINED_ACTIONS.items():
+        components = combined_def["components"]
+        if all(component in detected for component in components):
+            all_actions["combined"].append({
+                "action": combined_name,
+                "description": combined_def["description"],
+                "components": components,
+            })
+
+    return all_actions
+
+
+def print_body_action_report(action_data: Dict) -> None:
+    """
+    Write a human-readable summary of decoded actions to stdout.
+
+    Categories are printed in a fixed order; missing or empty ones are skipped.
+    Every entry must hold "action"; "description" falls back to that name.
+    """
+    rule = "=" * 70
+    print("\n" + rule)
+    print("Body Action Decoder Report")
+    print(rule)
+    for section in ("face", "eyes", "mouth", "arms", "hands", "legs", "feet", "combined"):
+        entries = action_data.get(section, [])
+        if not entries:
+            continue
+        print(f"\n{section.upper()} Actions ({len(entries)}):")
+        for entry in entries:
+            name = entry["action"]
+            print(f"  - {name}: {entry.get('description', name)}")
+    print("\n" + rule)
+
+
+# =============================================================================
+# Main Entry Point
+# =============================================================================
+
+def main():
+    """
+    Command-line entry point: load the optional JSON inputs, decode, report.
+
+    All files are opened as UTF-8 rather than with the platform default
+    encoding, so inputs containing non-ASCII text load identically everywhere.
+    """
+    parser = argparse.ArgumentParser(description="Decode body actions from pose data")
+    parser.add_argument("--pose-json", help="Path to pose.json (MediaPipe output)")
+    parser.add_argument("--face-json", help="Path to face.json (InsightFace output)")
+    parser.add_argument("--hand-json", help="Path to hand.json (MediaPipe Hand output)")
+    parser.add_argument("--output-json", help="Output action data JSON")
+    # NOTE(review): --frame is accepted but nothing below reads args.frame.
+    parser.add_argument("--frame", type=int, help="Analyze specific frame")
+    args = parser.parse_args()
+
+    print("=" * 70)
+    print("Body Action Decoder")
+    print("=" * 70)
+
+    def _load_json(path):
+        """Return the parsed JSON document at path, or None when no path was given."""
+        if not path:
+            return None
+        with open(path, encoding="utf-8") as f:
+            return json.load(f)
+
+    pose_data = _load_json(args.pose_json)
+    face_data = _load_json(args.face_json)
+    hand_data = _load_json(args.hand_json)
+
+    # Without any usable input, list the supported categories and stop.
+    if not (pose_data or face_data or hand_data):
+        print("\n⚠️ No input data provided")
+        print("\nAction Categories:")
+        print("  - Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head")
+        print("  - Eyes: blink, close, wide_open, look_left, look_right")
+        print("  - Mouth: open, close, smile, talk, yawn")
+        print("  - Arms: raise_left, raise_right, cross_arms, wave, point")
+        print("  - Hands: grab, open, clap, thumbs_up, fist, peace, ok")
+        print("  - Legs: stand, sit, walk, run, jump, kick")
+        print("  - Feet: tap, stomp, cross, point")
+        print("  - Combined: thinking, listening, nodding_agreement, waving_greeting")
+        return
+
+    action_data = decode_body_actions(
+        pose_data=pose_data,
+        face_data=face_data,
+        hand_data=hand_data,
+    )
+    print_body_action_report(action_data)
+
+    if args.output_json:
+        with open(args.output_json, "w", encoding="utf-8") as f:
+            json.dump(action_data, f, indent=2)
+        print(f"\n✅ Output saved to: {args.output_json}")
+
+if __name__ == "__main__":
+    main()
--- a/scripts/utils/face_trace_visualizer.py
+++ b/scripts/utils/face_trace_visualizer.py
@@ -0,0 +1,201 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Face Trace Visualizer - Visualize face tracking paths
+
+Output:
+1. Trace path visualization (matplotlib)
+2. Trace statistics CSV
+"""
+
+import sys
+import json
+import argparse
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
+from collections import defaultdict
+from typing import Dict
+
+
+def visualize_traces(face_data: Dict, output_path: str = None) -> None:
+    """
+    Visualize face trace paths
+
+    Draws a 2x2 figure: face-center X per frame, face-center Y per frame,
+    detection confidence per frame, and a bar chart of pose counts per trace.
+
+    Args:
+        face_data: Traced face data. Reads "frames" and "traces"; each trace
+            must provide a "path" list whose entries carry "frame", "bbox"
+            (x / y / width / height), "confidence" and "pose_angle".
+        output_path: File to save the figure to. When omitted, the figure is
+            shown with plt.show() instead.
+
+    Returns:
+        None. Prints a notice and returns early when "frames" or "traces"
+        is empty.
+    """
+    frames = face_data.get("frames", {})
+    traces = face_data.get("traces", {})
+
+    if not frames or not traces:
+        print("No frames or traces found")
+        return
+
+    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
+    ax_x, ax_y = axes[0, 0], axes[0, 1]
+    ax_conf, ax_pose = axes[1, 0], axes[1, 1]
+
+    colors = plt.cm.tab10(np.linspace(0, 1, len(traces)))
+
+    # Flatten every trace path into parallel per-sample series.
+    trace_data = {}
+    for trace_id_str, trace in traces.items():
+        path = trace.get("path", [])
+        trace_data[int(trace_id_str)] = {
+            "frames": [p["frame"] for p in path],
+            "x": [p["bbox"]["x"] + p["bbox"]["width"] / 2 for p in path],
+            "y": [p["bbox"]["y"] + p["bbox"]["height"] / 2 for p in path],
+            "confidence": [p["confidence"] for p in path],
+            "pose": [p["pose_angle"] for p in path],
+        }
+
+    ordered_ids = sorted(trace_data)
+
+    # The three line panels differ only in the plotted series and their labels.
+    line_panels = [
+        (ax_x, "x", "X Position (center)", "Face X Position Over Time"),
+        (ax_y, "y", "Y Position (center)", "Face Y Position Over Time"),
+        (ax_conf, "confidence", "Detection Confidence", "Face Detection Confidence Over Time"),
+    ]
+    for ax, series, ylabel, title in line_panels:
+        for trace_id, color in zip(ordered_ids, colors):
+            data = trace_data[trace_id]
+            ax.plot(data["frames"], data[series], color=color, label=f"Trace {trace_id}", linewidth=2)
+            ax.scatter(data["frames"], data[series], color=color, s=30)
+        ax.set_xlabel("Frame Number")
+        ax.set_ylabel(ylabel)
+        ax.set_title(title)
+        ax.legend()
+        ax.grid(True, alpha=0.3)
+
+    pose_colors = {
+        "frontal": "green",
+        "three_quarter": "blue",
+        "profile_left": "orange",
+        "profile_right": "red",
+        "unknown": "gray",
+    }
+
+    # One group of bars per trace, one bar per pose seen in that trace.
+    for trace_id in ordered_ids:
+        pose_counts = defaultdict(int)
+        for pose in trace_data[trace_id]["pose"]:
+            pose_counts[pose] += 1
+
+        poses = list(pose_counts)
+        ax_pose.bar(
+            [f"Trace {trace_id}\n{pose}" for pose in poses],
+            [pose_counts[pose] for pose in poses],
+            color=[pose_colors.get(pose, "gray") for pose in poses],
+            alpha=0.7,
+            label=f"Trace {trace_id}",
+        )
+
+    ax_pose.set_xlabel("Trace / Pose")
+    ax_pose.set_ylabel("Count")
+    ax_pose.set_title("Pose Distribution by Trace")
+    ax_pose.tick_params(axis='x', rotation=45)
+
+    plt.tight_layout()
+
+    if output_path:
+        fig.savefig(output_path, dpi=150, bbox_inches="tight")
+        print(f"\n✅ Visualization saved to: {output_path}")
+        # Release the figure so repeated calls do not accumulate open figures.
+        plt.close(fig)
+    else:
+        plt.show()
+
+
+def export_trace_csv(face_data: Dict, output_path: str) -> None:
+    """
+    Export trace statistics to CSV
+
+    Writes a header row followed by one row per entry of
+    face_data["traces"], ordered by the integer value of the trace key.
+    Each row holds the trace's summary fields plus how many times each of
+    the four named poses occurs in its "pose_angles" list.
+
+    Args:
+        face_data: Traced face data; only "traces" is read.
+        output_path: Destination CSV path (overwritten if it exists).
+    """
+    import csv
+
+    summary_fields = (
+        "trace_id",
+        "start_frame",
+        "end_frame",
+        "duration_frames",
+        "duration_seconds",
+        "total_appearances",
+        "avg_confidence",
+    )
+    pose_names = ("three_quarter", "profile_right", "profile_left", "frontal")
+    header = list(summary_fields) + [f"pose_{name}" for name in pose_names]
+
+    all_traces = face_data.get("traces", {})
+    ordered = sorted(all_traces.items(), key=lambda item: int(item[0]))
+
+    with open(output_path, "w", newline="") as handle:
+        out = csv.writer(handle)
+        out.writerow(header)
+
+        for _, record in ordered:
+            tally = defaultdict(int)
+            for angle in record.get("pose_angles", []):
+                tally[angle] += 1
+
+            row = [record[field] for field in summary_fields]
+            row += [tally.get(name, 0) for name in pose_names]
+            out.writerow(row)
+
+    print(f"\n✅ CSV exported to: {output_path}")
+
+
+def main():
+    """
+    Command-line entry point for the face trace visualizer
+
+    Loads the JSON given by --face-json, prints a short banner, then saves
+    the plot and/or the CSV when their output paths are given. With neither
+    output option the plot is shown instead.
+    """
+    cli = argparse.ArgumentParser(description="Visualize face traces")
+    cli.add_argument("--face-json", required=True, help="Path to face_traced.json")
+    cli.add_argument("--output-plot", help="Output plot path (PNG)")
+    cli.add_argument("--output-csv", help="Output CSV path")
+    args = cli.parse_args()
+
+    with open(args.face_json) as source:
+        face_data = json.load(source)
+
+    rule = "=" * 60
+    print(rule)
+    print("Face Trace Visualizer")
+    print(rule)
+    print(f"\nInput: {args.face_json}")
+    print(f"Traces: {len(face_data.get('traces', {}))}")
+
+    wants_plot = bool(args.output_plot)
+    wants_csv = bool(args.output_csv)
+
+    if wants_plot:
+        visualize_traces(face_data, args.output_plot)
+
+    if wants_csv:
+        export_trace_csv(face_data, args.output_csv)
+
+    # No output requested: fall back to showing the plot.
+    if not (wants_plot or wants_csv):
+        visualize_traces(face_data)
+
+
+# Script entry point: run the CLI only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
--- a/scripts/utils/face_tracker.py
+++ b/scripts/utils/face_tracker.py
@@ -0,0 +1,452 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Face Tracker - Track faces across frames using embedding similarity and bbox proximity
+
+Purpose:
+1. Assign unique trace_id to each face across frames
+2. Track face movement across adjacent frames
+3. Output trace statistics (duration, path, confidence)
+
+Algorithm:
+1. For first frame: assign new trace_id to each face
+2. For subsequent frames:
+   - Calculate bbox overlap with previous frame faces
+   - Calculate embedding cosine similarity
+   - Match faces if both conditions met
+   - Assign same trace_id if matched, new trace_id if not
+
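+Matching Conditions (default thresholds):
+- bbox overlap (IoU) > 0.3 AND embedding similarity > 0.7
+- OR, as fallbacks: IoU > 0.5 alone, embedding similarity > 0.85 alone,
+  or bbox center distance < 100 together with embedding similarity > 0.6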
+
+Output:
+- face.json with trace_id added to each face
+- trace statistics report
+"""
+
+import sys
+import json
+import argparse
+import numpy as np
+from typing import Dict, List, Optional, Tuple
+from collections import defaultdict
+
+
+def calculate_bbox_iou(bbox1: Dict, bbox2: Dict) -> float:
+    """
+    Calculate Intersection over Union (IoU) between two bboxes
+
+    Args:
+        bbox1: {"x": int, "y": int, "width": int, "height": int}
+        bbox2: same structure
+
+    Returns:
+        IoU score (0.0 - 1.0); 0.0 when the boxes do not overlap (boxes that
+        only touch along an edge count as non-overlapping) or when the
+        union area is not positive.
+    """
+    # Edges of the intersection rectangle.
+    left = max(bbox1["x"], bbox2["x"])
+    top = max(bbox1["y"], bbox2["y"])
+    right = min(bbox1["x"] + bbox1["width"], bbox2["x"] + bbox2["width"])
+    bottom = min(bbox1["y"] + bbox1["height"], bbox2["y"] + bbox2["height"])
+
+    if right <= left or bottom <= top:
+        return 0.0
+
+    overlap = (right - left) * (bottom - top)
+    combined = bbox1["width"] * bbox1["height"] + bbox2["width"] * bbox2["height"] - overlap
+
+    if combined > 0:
+        return overlap / combined
+    return 0.0
+
+
+def calculate_bbox_distance(bbox1: Dict, bbox2: Dict) -> float:
+    """
+    Calculate center distance between two bboxes
+
+    Args:
+        bbox1: {"x", "y", "width", "height"} box
+        bbox2: same structure
+
+    Returns:
+        Euclidean distance between centers
+    """
+    center_a = (bbox1["x"] + bbox1["width"] / 2, bbox1["y"] + bbox1["height"] / 2)
+    center_b = (bbox2["x"] + bbox2["width"] / 2, bbox2["y"] + bbox2["height"] / 2)
+
+    dx = center_a[0] - center_b[0]
+    dy = center_a[1] - center_b[1]
+
+    return np.sqrt(dx ** 2 + dy ** 2)
+
+
+def calculate_embedding_similarity(emb1: List[float], emb2: List[float]) -> float:
+    """
+    Calculate cosine similarity between two embeddings
+
+    Args:
+        emb1: First embedding vector, or None
+        emb2: Second embedding vector, or None
+
+    Returns:
+        Cosine similarity (-1.0 - 1.0); 0.0 when either embedding is None
+        or has zero norm.
+    """
+    if emb1 is None or emb2 is None:
+        return 0.0
+
+    vec_a, vec_b = np.array(emb1), np.array(emb2)
+    len_a = np.linalg.norm(vec_a)
+    len_b = np.linalg.norm(vec_b)
+
+    # A zero-length vector has no direction, so similarity is undefined.
+    if not len_a or not len_b:
+        return 0.0
+
+    return np.dot(vec_a, vec_b) / (len_a * len_b)
+
+
+def match_faces(
+    current_faces: List[Dict],
+    previous_faces: List[Dict],
+    iou_threshold: float = 0.3,
+    similarity_threshold: float = 0.7,
+    distance_threshold: float = 100.0,
+    use_embedding: bool = True,
+) -> Dict[int, int]:
+    """
+    Match current frame faces to previous frame faces
+
+    Every (current, previous) pair is scored, then pairs are accepted from
+    the highest score down, each face being used at most once. Accepting the
+    best pairs first keeps the result independent of the order in which the
+    detections are listed, so a weaker match can no longer take a previous
+    face away from a stronger one.
+
+    A pair is a candidate when one of these holds (first match wins):
+      1. IoU > iou_threshold and similarity > similarity_threshold
+      2. IoU > 0.5
+      3. similarity > 0.85
+      4. center distance < distance_threshold and similarity > 0.6
+
+    Args:
+        current_faces: Faces in current frame (dicts with "x", "y", "width",
+            "height" and optionally "embedding")
+        previous_faces: Faces in previous frame, same structure
+        iou_threshold: Minimum IoU for matching
+        similarity_threshold: Minimum embedding similarity for matching
+        distance_threshold: Maximum bbox center distance for matching
+        use_embedding: Whether to use embedding similarity
+
+    Returns:
+        Dict mapping current_face_index -> previous_face_index (or -1 if new).
+        Keys are always inserted in ascending current-face order.
+    """
+    # Start with every face unmatched; keys stay in face-index order.
+    matches = {curr_idx: -1 for curr_idx in range(len(current_faces))}
+    if not previous_faces:
+        return matches
+
+    candidates = []
+    for curr_idx, curr_face in enumerate(current_faces):
+        curr_emb = curr_face.get("embedding")
+
+        for prev_idx, prev_face in enumerate(previous_faces):
+            prev_emb = prev_face.get("embedding")
+
+            # The bbox helpers only read x / y / width / height from the dicts.
+            iou = calculate_bbox_iou(curr_face, prev_face)
+            distance = calculate_bbox_distance(curr_face, prev_face)
+
+            similarity = 0.0
+            if use_embedding and curr_emb and prev_emb:
+                similarity = calculate_embedding_similarity(curr_emb, prev_emb)
+
+            if iou > iou_threshold and similarity > similarity_threshold:
+                score = iou + similarity
+            elif iou > 0.5:
+                score = iou * 2
+            elif similarity > 0.85:
+                score = similarity * 2
+            elif distance < distance_threshold and similarity > 0.6:
+                score = similarity - distance / 1000
+            else:
+                continue
+
+            # A very large distance_threshold can push rule 4 to <= 0.
+            if score > 0:
+                candidates.append((score, curr_idx, prev_idx))
+
+    # Best score first; indices break ties deterministically.
+    candidates.sort(key=lambda item: (-item[0], item[1], item[2]))
+
+    used_prev = set()
+    for _, curr_idx, prev_idx in candidates:
+        if matches[curr_idx] != -1 or prev_idx in used_prev:
+            continue
+        matches[curr_idx] = prev_idx
+        used_prev.add(prev_idx)
+
+    return matches
+
+
+def _summarize_trace(trace_id: int, path: List[Dict], fps: Optional[float]) -> Dict:
+    """Build the statistics record stored in face_data["traces"] for one trace."""
+    start_frame = path[0]["frame"]
+    end_frame = path[-1]["frame"]
+    duration_frames = end_frame - start_frame + 1
+
+    # Pose trace: the complete pose information of every appearance.
+    pose_trace = []
+    for point in path:
+        pose_info = point.get("pose_full", {})
+        pose_trace.append({
+            "frame": point["frame"],
+            "angle": pose_info.get("angle", "unknown"),
+            "confidence": pose_info.get("confidence", 0.0),
+            "pitch": pose_info.get("pitch", "neutral"),
+            "features": pose_info.get("features", {}),
+        })
+
+    # Pose statistics: occurrences and confidences grouped by angle.
+    pose_counts = defaultdict(int)
+    confidence_by_angle = defaultdict(list)
+    for pose in pose_trace:
+        pose_counts[pose["angle"]] += 1
+        confidence_by_angle[pose["angle"]].append(pose["confidence"])
+
+    pose_statistics = {
+        "distribution": dict(pose_counts),
+        "avg_confidence_by_angle": {
+            angle: round(sum(conf_list) / len(conf_list), 3)
+            for angle, conf_list in confidence_by_angle.items()
+        },
+        "dominant_angle": max(pose_counts, key=pose_counts.get) if pose_counts else "unknown",
+        "pose_count": len(pose_counts),
+    }
+
+    # Pose transitions: one event per change of angle between consecutive appearances.
+    pose_transitions = []
+    for before, after in zip(pose_trace, pose_trace[1:]):
+        if after["angle"] != before["angle"]:
+            pose_transitions.append({
+                "frame": after["frame"],
+                "from_angle": before["angle"],
+                "to_angle": after["angle"],
+                "transition_index": len(pose_transitions) + 1,
+            })
+
+    return {
+        "trace_id": trace_id,
+        "start_frame": start_frame,
+        "end_frame": end_frame,
+        "duration_frames": duration_frames,
+        # Without a usable fps the duration in seconds cannot be derived.
+        "duration_seconds": duration_frames / fps if fps else 0.0,
+        "total_appearances": len(path),
+        "avg_confidence": sum(p["confidence"] for p in path) / len(path),
+        "pose_angles": [p["pose_angle"] for p in path],
+        "pose_trace": pose_trace,
+        "pose_statistics": pose_statistics,
+        "pose_transitions": pose_transitions,
+        "path": path,
+    }
+
+
+def track_faces(
+    face_data: Dict,
+    iou_threshold: float = 0.3,
+    similarity_threshold: float = 0.7,
+    distance_threshold: float = 100.0,
+    use_embedding: bool = True,
+) -> Dict:
+    """
+    Track faces across all frames
+
+    Frames are visited in ascending numeric order. Faces of a frame are
+    matched against the faces of the frame visited just before it; a frame
+    without faces clears that reference, so no trace continues across it.
+    face_data is modified in place: each face gains "trace_id", and
+    "traces" plus metadata["trace_stats"] are (re)written.
+
+    Args:
+        face_data: face.json data
+        iou_threshold: IoU threshold for matching
+        similarity_threshold: Embedding similarity threshold
+        distance_threshold: Distance threshold for matching
+        use_embedding: Whether to use embedding
+
+    Returns:
+        Updated face_data with trace_id added to each face. When
+        metadata["fps"] is missing or zero, every trace reports
+        duration_seconds as 0.0 instead of raising.
+    """
+    frames = face_data.get("frames", {})
+
+    if not frames:
+        print("No frames found in face.json")
+        return face_data
+
+    sorted_frames = sorted(frames.items(), key=lambda x: int(x[0]))
+
+    next_trace_id = 0
+    traces = defaultdict(list)
+
+    prev_faces = []
+    prev_trace_ids = []
+
+    print(f"\nTracking faces across {len(sorted_frames)} frames...")
+    print(f"Parameters: iou={iou_threshold}, similarity={similarity_threshold}, distance={distance_threshold}")
+    print()
+
+    for frame_num_str, frame_data in sorted_frames:
+        frame_num = int(frame_num_str)
+        faces = frame_data.get("faces", [])
+
+        if not faces:
+            prev_faces = []
+            prev_trace_ids = []
+            continue
+
+        matches = match_faces(
+            faces,
+            prev_faces,
+            iou_threshold,
+            similarity_threshold,
+            distance_threshold,
+            use_embedding,
+        )
+
+        # Walk faces by index so trace_ids[i] always belongs to faces[i];
+        # the next frame looks trace ids up by previous-face position.
+        trace_ids = []
+        for curr_idx, face in enumerate(faces):
+            prev_idx = matches.get(curr_idx, -1)
+            if prev_idx >= 0:
+                trace_id = prev_trace_ids[prev_idx]
+            else:
+                trace_id = next_trace_id
+                next_trace_id += 1
+
+            face["trace_id"] = trace_id
+            trace_ids.append(trace_id)
+
+            # Tolerate an explicit null pose_angle in the input.
+            pose = face.get("pose_angle") or {}
+            traces[trace_id].append({
+                "frame": frame_num,
+                "face_index": curr_idx,
+                "bbox": {
+                    "x": face["x"],
+                    "y": face["y"],
+                    "width": face["width"],
+                    "height": face["height"],
+                },
+                "confidence": face.get("confidence", 0.0),
+                "pose_angle": pose.get("angle", "unknown"),
+                "pose_full": pose,  # complete pose information
+            })
+
+        prev_faces = faces
+        prev_trace_ids = trace_ids
+
+        if frame_num % 100 == 0:
+            print(f"  Frame {frame_num}: {len(faces)} faces, {len(set(trace_ids))} active traces")
+
+    metadata = face_data.setdefault("metadata", {})
+    fps = metadata.get("fps")
+    if traces and not fps:
+        print("⚠️ metadata.fps is missing or zero; duration_seconds is reported as 0.0")
+
+    face_data["traces"] = {
+        str(trace_id): _summarize_trace(trace_id, path, fps)
+        for trace_id, path in traces.items()
+    }
+
+    metadata["trace_stats"] = {
+        "total_traces": next_trace_id,
+        "active_traces": len(traces),
+        "long_traces": len([t for t in traces.values() if len(t) >= 2]),
+    }
+
+    return face_data
+
+
+def analyze_traces(face_data: Dict) -> None:
+    """
+    Analyze and print trace statistics
+
+    Prints the total and multi-appearance trace counts, details of the ten
+    longest traces, the overall pose distribution and a histogram of trace
+    durations.
+
+    Args:
+        face_data: Tracked face data; reads "traces" and
+            metadata["trace_stats"]["total_traces"].
+    """
+    traces = face_data.get("traces", {})
+    metadata = face_data.get("metadata", {})
+
+    print("\n" + "=" * 60)
+    print("Face Trace Analysis")
+    print("=" * 60)
+
+    # "traces" also contains single-appearance traces, so its length is not
+    # the number of long traces; count the multi-appearance ones explicitly.
+    long_trace_count = sum(
+        1 for trace in traces.values() if trace.get("total_appearances", 0) >= 2
+    )
+
+    print(f"\nTotal traces: {metadata.get('trace_stats', {}).get('total_traces', 0)}")
+    print(f"Long traces (>= 2 frames): {long_trace_count}")
+
+    if not traces:
+        return
+
+    sorted_traces = sorted(traces.values(), key=lambda x: x["duration_frames"], reverse=True)
+
+    print("\n=== Top 10 Longest Traces ===")
+    for trace in sorted_traces[:10]:
+        print(f"\nTrace {trace['trace_id']}:")
+        print(f"  Frames: {trace['start_frame']} - {trace['end_frame']} ({trace['duration_frames']} frames)")
+        print(f"  Duration: {trace['duration_seconds']:.2f} seconds")
+        print(f"  Appearances: {trace['total_appearances']}")
+        print(f"  Avg Confidence: {trace['avg_confidence']:.3f}")
+
+        # Pose statistics
+        trace_pose_stats = trace.get("pose_statistics", {})
+        print(f"  Pose Distribution: {trace_pose_stats.get('distribution', {})}")
+        print(f"  Dominant Angle: {trace_pose_stats.get('dominant_angle', 'unknown')}")
+
+        # Pose transitions
+        transitions = trace.get("pose_transitions", [])
+        if transitions:
+            print(f"  Pose Transitions: {len(transitions)} events")
+            for t in transitions[:3]:  # show only the first 3
+                print(f"    - Frame {t['frame']}: {t['from_angle']} → {t['to_angle']}")
+
+    pose_totals = defaultdict(int)
+    for trace in traces.values():
+        for pose in trace["pose_angles"]:
+            pose_totals[pose] += 1
+
+    print("\n=== Pose Distribution in Traces ===")
+    for pose, count in sorted(pose_totals.items(), key=lambda x: x[1], reverse=True):
+        print(f"  {pose}: {count}")
+
+    duration_distribution = defaultdict(int)
+    for trace in traces.values():
+        d = trace["duration_frames"]
+        if d <= 30:
+            duration_distribution["short (<= 30 frames)"] += 1
+        elif d <= 90:
+            duration_distribution["medium (31-90 frames)"] += 1
+        else:
+            duration_distribution["long (> 90 frames)"] += 1
+
+    print("\n=== Trace Duration Distribution ===")
+    for duration, count in sorted(duration_distribution.items()):
+        print(f"  {duration}: {count}")
+
+
+def main():
+    """
+    Command-line entry point for the face tracker
+
+    Loads the JSON given by --face-json, runs track_faces and
+    analyze_traces on it and, unless --analyze-only is set, writes the
+    result to --output. The default output is "<input stem>_traced.json"
+    next to the input file, so the input is never overwritten even when
+    its name does not end in ".json".
+    """
+    from pathlib import Path
+
+    parser = argparse.ArgumentParser(description="Track faces across frames")
+    parser.add_argument("--face-json", required=True, help="Path to face.json")
+    parser.add_argument("--output", help="Output path (default: face_traced.json)")
+    parser.add_argument("--iou-threshold", type=float, default=0.3, help="IoU threshold")
+    parser.add_argument("--similarity-threshold", type=float, default=0.7, help="Embedding similarity threshold")
+    parser.add_argument("--distance-threshold", type=float, default=100.0, help="Distance threshold")
+    parser.add_argument("--no-embedding", action="store_true", help="Disable embedding matching")
+    parser.add_argument("--analyze-only", action="store_true", help="Only analyze, don't output")
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("Face Tracker")
+    print("=" * 60)
+
+    with open(args.face_json) as f:
+        face_data = json.load(f)
+
+    print(f"\nInput: {args.face_json}")
+    print(f"Frames: {len(face_data.get('frames', {}))}")
+
+    face_data = track_faces(
+        face_data,
+        iou_threshold=args.iou_threshold,
+        similarity_threshold=args.similarity_threshold,
+        distance_threshold=args.distance_threshold,
+        use_embedding=not args.no_embedding,
+    )
+
+    analyze_traces(face_data)
+
+    if not args.analyze_only:
+        if args.output:
+            output_path = args.output
+        else:
+            # Derive the name from the file stem only: a plain
+            # str.replace(".json", ...) leaves the path unchanged when the
+            # name has no ".json" and also rewrites ".json" in directory names.
+            source = Path(args.face_json)
+            output_path = str(source.with_name(f"{source.stem}_traced.json"))
+
+        with open(output_path, "w") as f:
+            json.dump(face_data, f, indent=2)
+        print(f"\n✅ Output saved to: {output_path}")
+
+
+# Script entry point: run the CLI only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
--- a/scripts/utils/pose_action_decoder.py
+++ b/scripts/utils/pose_action_decoder.py
@@ -0,0 +1,522 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Pose Action Decoder - Convert pose_trace into human-readable action names
+
+Purpose:
+1. Decode pose transitions into action names (turn left/right, look up/down, shake head, nod)
+2. Identify stable pose segments with duration
+3. Generate action timeline for each trace
+
+Action Types:
+- Simple: turn_left, turn_right, look_up, look_down
+- Complex: shake_head, nod_head, turn_full
+- Stable: frontal_stable, profile_left_stable, profile_right_stable, three_quarter_stable
+
+Output:
+1. Action timeline (frame-based action list)
+2. Action summary (total counts, duration)
+3. Action visualization (timeline plot)
+"""
+
+import sys
+import json
+import argparse
+import numpy as np
+import matplotlib.pyplot as plt
+from typing import Dict, List, Optional
+from collections import defaultdict
+
+
+# Action definitions
+# Lookup table keyed by (from_state, to_state). The same table holds both
+# horizontal-angle pairs and pitch pairs; decode_pose_to_action falls back to
+# a generated "pose_change_<from>_to_<to>" name for pairs not listed here.
+POSE_TO_ACTION = {
+    # Turn actions (angle changes)
+    ("frontal", "three_quarter"): "turn_partial",
+    ("frontal", "profile_left"): "turn_left",
+    ("frontal", "profile_right"): "turn_right",
+    ("three_quarter", "frontal"): "return_frontal",
+    ("three_quarter", "profile_left"): "turn_left",
+    ("three_quarter", "profile_right"): "turn_right",
+    ("profile_left", "frontal"): "turn_to_frontal",
+    ("profile_left", "three_quarter"): "turn_to_three_quarter",
+    ("profile_left", "profile_right"): "turn_full",
+    ("profile_right", "frontal"): "turn_to_frontal",
+    ("profile_right", "three_quarter"): "turn_to_three_quarter",
+    ("profile_right", "profile_left"): "turn_full",
+    
+    # Pitch actions
+    ("neutral", "tilted_up"): "look_up",
+    ("neutral", "tilted_down"): "look_down",
+    ("tilted_up", "neutral"): "return_neutral",
+    ("tilted_down", "neutral"): "return_neutral",
+    ("tilted_up", "tilted_down"): "nod_full",
+    ("tilted_down", "tilted_up"): "nod_full",
+}
+
+# Stable pose names
+# Maps a pose angle to the action name used for a segment that stays in that
+# angle (build_action_timeline treats segments of >= 10 frames as stable).
+STABLE_ACTION_NAMES = {
+    "frontal": "frontal_stable",
+    "three_quarter": "three_quarter_stable",
+    "profile_left": "profile_left_stable",
+    "profile_right": "profile_right_stable",
+    "unknown": "pose_unknown",
+}
+
+# Complex action patterns (3+ transitions in short time)
+# Each entry lists the state sequence and the allowed frame span between the
+# first and last state; "pitch_mode" marks a sequence of pitch states rather
+# than angle states.
+# NOTE(review): detect_complex_actions inlines the same sequences and frame
+# limits instead of reading this table, and there is no entry for the
+# reversed nod (down-up-down) that it also detects - confirm whether this
+# table is used elsewhere before relying on it.
+COMPLEX_PATTERNS = {
+    # Shake head: profile_left → profile_right → profile_left (or reverse)
+    "shake_head": {
+        "sequence": ["profile_left", "profile_right", "profile_left"],
+        "min_frames": 5,
+        "max_frames": 30,
+    },
+    "shake_head_reverse": {
+        "sequence": ["profile_right", "profile_left", "profile_right"],
+        "min_frames": 5,
+        "max_frames": 30,
+    },
+    # Nod: tilted_up → tilted_down → tilted_up (or reverse)
+    "nod_head": {
+        "sequence": ["tilted_up", "tilted_down", "tilted_up"],
+        "min_frames": 3,
+        "max_frames": 20,
+        "pitch_mode": True,
+    },
+}
+
+
+def decode_pose_to_action(from_pose: str, to_pose: str) -> str:
+    """
+    Decode single pose transition to action name
+
+    Args:
+        from_pose: Source pose angle
+        to_pose: Target pose angle
+
+    Returns:
+        The POSE_TO_ACTION entry for (from_pose, to_pose), or the generic
+        name "pose_change_<from_pose>_to_<to_pose>" when the pair is not
+        listed.
+    """
+    generic_name = f"pose_change_{from_pose}_to_{to_pose}"
+    return POSE_TO_ACTION.get((from_pose, to_pose), generic_name)
+
+
+def _collapse_runs(pose_trace: List[Dict], key: str) -> List[Dict]:
+    """Merge consecutive samples sharing the same ``key`` value into runs."""
+    runs = []
+    for sample in pose_trace:
+        value = sample.get(key)
+        if runs and runs[-1]["value"] == value:
+            runs[-1]["last_frame"] = sample["frame"]
+        else:
+            runs.append({
+                "value": value,
+                "first_frame": sample["frame"],
+                "last_frame": sample["frame"],
+            })
+    return runs
+
+
+def _find_oscillations(
+    runs: List[Dict],
+    patterns: Dict,
+    min_frames: int,
+    max_frames: int,
+    action: str,
+) -> List[Dict]:
+    """Return one event per A-B-A triple of adjacent runs listed in ``patterns``."""
+    events = []
+    for first, middle, last in zip(runs, runs[1:], runs[2:]):
+        description = patterns.get((first["value"], middle["value"], last["value"]))
+        if description is None:
+            continue
+
+        # Measure from the moment the first state is left until the moment it
+        # is reached again, so holding the outer states does not count.
+        start_frame = first["last_frame"]
+        end_frame = last["first_frame"]
+        duration_frames = end_frame - start_frame
+        if min_frames <= duration_frames <= max_frames:
+            events.append({
+                "action": action,
+                "start_frame": start_frame,
+                "end_frame": end_frame,
+                "duration_frames": duration_frames,
+                "description": description,
+            })
+    return events
+
+
+def detect_complex_actions(pose_trace: List[Dict]) -> List[Dict]:
+    """
+    Detect complex action patterns (shake head, nod, etc.)
+
+    Consecutive samples with the same state are first merged into runs, so a
+    pattern is found whether the trace is sampled sparsely (one sample per
+    state) or densely (the same state repeated over several frames).
+
+    Detected patterns:
+      - shake_head: angle goes left-right-left or right-left-right within
+        5 to 30 frames
+      - nod_head: pitch goes up-down-up or down-up-down within 3 to 20 frames
+
+    Args:
+        pose_trace: Pose trace list; each sample has "frame", "angle" and
+            "pitch"
+
+    Returns:
+        List of complex action events (all shake_head events first, then all
+        nod_head events, each group in trace order)
+    """
+    shake_patterns = {
+        ("profile_left", "profile_right", "profile_left"): "shake head left-right-left",
+        ("profile_right", "profile_left", "profile_right"): "shake head right-left-right",
+    }
+    nod_patterns = {
+        ("tilted_up", "tilted_down", "tilted_up"): "nod head up-down",
+        ("tilted_down", "tilted_up", "tilted_down"): "nod head up-down",
+    }
+
+    complex_actions = _find_oscillations(
+        _collapse_runs(pose_trace, "angle"), shake_patterns, 5, 30, "shake_head"
+    )
+    complex_actions += _find_oscillations(
+        _collapse_runs(pose_trace, "pitch"), nod_patterns, 3, 20, "nod_head"
+    )
+    return complex_actions
+
+
+def build_action_timeline(trace: Dict) -> Dict:
+    """
+    Build action timeline from pose_trace
+
+    The timeline combines three kinds of entries: one per pose segment
+    (a maximal run of samples with the same angle and pitch; "stable" when it
+    spans at least 10 frames, otherwise "transitional"), one per entry of
+    pose_transitions, and one per detected complex action. Entries are
+    ordered by frame, longer entries first within the same frame.
+
+    Args:
+        trace: Trace data with pose_trace, pose_transitions
+
+    Returns:
+        Dict with "trace_id", "action_timeline", "action_summary" and
+        "complex_actions"; the last three are empty when pose_trace is empty.
+    """
+    trace_id = trace.get("trace_id")
+    samples = trace.get("pose_trace", [])
+    transitions = trace.get("pose_transitions", [])
+
+    if not samples:
+        return {
+            "trace_id": trace_id,
+            "action_timeline": [],
+            "action_summary": {},
+            "complex_actions": [],
+        }
+
+    complex_actions = detect_complex_actions(samples)
+
+    def close_segment(first: Dict, last: Dict) -> Dict:
+        # A segment runs from its first sample to the last sample sharing its pose.
+        return {
+            "angle": first["angle"],
+            "pitch": first["pitch"],
+            "start_frame": first["frame"],
+            "end_frame": last["frame"],
+            "duration_frames": last["frame"] - first["frame"] + 1,
+        }
+
+    # Split the trace into segments wherever angle or pitch changes.
+    segments = []
+    head = samples[0]
+    tail = samples[0]
+    for sample in samples[1:]:
+        if (sample["angle"], sample["pitch"]) != (head["angle"], head["pitch"]):
+            segments.append(close_segment(head, tail))
+            head = sample
+        tail = sample
+    segments.append(close_segment(head, samples[-1]))
+
+    timeline = []
+
+    # Segment entries
+    for seg in segments:
+        angle = seg["angle"]
+        length = seg["duration_frames"]
+
+        if length < 10:  # Short pose (transitional)
+            timeline.append({
+                "frame": seg["start_frame"],
+                "action": f"pose_{angle}_brief",
+                "duration_frames": length,
+                "description": f"brief {angle} pose for {length} frames",
+                "type": "transitional",
+            })
+            continue
+
+        # Stable pose (>= 10 frames), with a suffix for a non-neutral pitch
+        name = STABLE_ACTION_NAMES.get(angle, "pose_stable")
+        if seg["pitch"] != "neutral":
+            name += f"_pitch_{seg['pitch']}"
+        timeline.append({
+            "frame": seg["start_frame"],
+            "action": name,
+            "duration_frames": length,
+            "description": f"stable {angle} pose for {length} frames",
+            "type": "stable",
+        })
+
+    # Transition entries (a transition is treated as lasting one frame)
+    timeline.extend(
+        {
+            "frame": trans["frame"],
+            "action": decode_pose_to_action(trans["from_angle"], trans["to_angle"]),
+            "duration_frames": 1,
+            "description": f"transition from {trans['from_angle']} to {trans['to_angle']}",
+            "type": "transition",
+        }
+        for trans in transitions
+    )
+
+    # Complex action entries
+    timeline.extend(
+        {
+            "frame": event["start_frame"],
+            "action": event["action"],
+            "duration_frames": event["duration_frames"],
+            "description": event["description"],
+            "type": "complex",
+        }
+        for event in complex_actions
+    )
+
+    # Stable sort: entries tied on (frame, duration) keep the order
+    # segments -> transitions -> complex actions.
+    timeline.sort(key=lambda entry: (entry["frame"], -entry["duration_frames"]))
+
+    # Summary
+    counts = defaultdict(int)
+    durations = defaultdict(float)
+    for entry in timeline:
+        counts[entry["action"]] += 1
+        durations[entry["action"]] += entry["duration_frames"]
+
+    stable_entries = sum(1 for entry in timeline if entry["type"] == "stable")
+    if timeline:
+        stable_percentage = round(stable_entries / len(timeline) * 100, 1)
+    else:
+        stable_percentage = 0
+
+    summary = {
+        "total_actions": len(timeline),
+        "unique_actions": len(counts),
+        "action_counts": dict(counts),
+        "action_durations_frames": {name: round(total, 1) for name, total in durations.items()},
+        "complex_action_count": len(complex_actions),
+        "stable_percentage": stable_percentage,
+    }
+
+    return {
+        "trace_id": trace_id,
+        "action_timeline": timeline,
+        "action_summary": summary,
+        "complex_actions": complex_actions,
+    }
+
+
+def generate_action_description(action_timeline: List[Dict]) -> str:
+    """
+    Generate human-readable action description
+    
+    Args:
+        action_timeline: Action timeline list
+    
+    Returns:
+        Action description string
+    """
+    if not action_timeline:
+        return "No actions detected"
+    
+    # Group actions by type
+    stable_actions = [a for a in action_timeline if a["type"] == "stable"]
+    transition_actions = [a for a in action_timeline if a["type"] == "transition"]
+    complex_actions = [a for a in action_timeline if a["type"] == "complex"]
+    
+    desc_parts = []
+    
+    # Stable poses
+    if stable_actions:
+        stable_desc = []
+        for act in stable_actions[:3]:  # Top 3 stable poses
+            stable_desc.append(f"{act['description']}")
+        desc_parts.append(f"Stable poses: {', '.join(stable_desc)}")
+    
+    # Transitions
+    if transition_actions:
+        trans_desc = [act["action"] for act in transition_actions[:5]]  # Top 5 transitions
+        desc_parts.append(f"Transitions: {', '.join(trans_desc)}")
+    
+    # Complex actions
+    if complex_actions:
+        complex_desc = [act["action"] for act in complex_actions]
+        desc_parts.append(f"Complex actions: {', '.join(complex_desc)}")
+    
+    return ". ".join(desc_parts)
+
+
+def visualize_action_timeline(action_data: Dict, output_path: str = None) -> None:
+    """
+    Visualize action timeline
+    """
+    traces_data = action_data.get("traces", {})
+    
+    if not traces_data:
+        print("No traces found")
+        return
+    
+    fig, axes = plt.subplots(len(traces_data), 1, figsize=(16, 3 * len(traces_data)))
+    
+    if len(traces_data) == 1:
+        axes = [axes]
+    
+    action_colors = {
+        "frontal_stable": "green",
+        "three_quarter_stable": "blue",
+        "profile_left_stable": "orange",
+        "profile_right_stable": "red",
+        "turn_left": "purple",
+        "turn_right": "purple",
+        "turn_full": "darkred",
+        "shake_head": "yellow",
+        "nod_head": "cyan",
+        "look_up": "lightgreen",
+        "look_down": "brown",
+    }
+    
+    for ax, (trace_id, data) in zip(axes, sorted(traces_data.items())):
+        timeline = data["action_timeline"]
+        
+        if not timeline:
+            continue
+        
+        # Plot action timeline as bars
+        for act in timeline:
+            color = action_colors.get(act["action"], "gray")
+            
+            if act["duration_frames"] > 1:
+                ax.barh(
+                    y=0,
+                    width=act["duration_frames"],
+                    left=act["frame"],
+                    height=0.8,
+                    color=color,
+                    alpha=0.6,
+                    edgecolor="black",
+                    linewidth=0.5,
+                )
+                
+                # Add label for stable actions
+                if act["type"] == "stable" and act["duration_frames"] > 30:
+                    ax.text(
+                        act["frame"] + act["duration_frames"] / 2,
+                        0,
+                        act["action"],
+                        ha="center",
+                        va="center",
+                        fontsize=8,
+                        color="white",
+                    )
+            else:
+                # Instant action (transition)
+                ax.axvline(x=act["frame"], color=color, linestyle="--", alpha=0.8)
+                ax.text(
+                    act["frame"],
+                    0.5,
+                    act["action"],
+                    fontsize=7,
+                    rotation=90,
+                    va="bottom",
+                    ha="center",
+                )
+        
+        ax.set_xlabel("Frame Number")
+        ax.set_ylabel("Action")
+        ax.set_title(f"Trace {trace_id} Action Timeline")
+        ax.set_ylim(-0.5, 1)
+        ax.grid(True, alpha=0.3)
+    
+    plt.tight_layout()
+    
+    if output_path:
+        plt.savefig(output_path, dpi=150, bbox_inches="tight")
+        print(f"\n✅ Visualization saved to: {output_path}")
+    else:
+        plt.show()
+
+
+def print_action_report(action_data: Dict) -> None:
+    """
+    Print action report
+    """
+    traces_data = action_data.get("traces", {})
+    
+    print("\n" + "=" * 70)
+    print("Pose Action Decoder Report")
+    print("=" * 70)
+    
+    for trace_id, data in sorted(traces_data.items()):
+        print(f"\n{'='*70}")
+        print(f"Trace {trace_id}")
+        print(f"{'='*70}")
+        
+        summary = data["action_summary"]
+        print(f"\nSummary:")
+        print(f"  Total Actions: {summary['total_actions']}")
+        print(f"  Unique Actions: {summary['unique_actions']}")
+        print(f"  Complex Actions: {summary['complex_action_count']}")
+        print(f"  Stable Percentage: {summary['stable_percentage']}%")
+        
+        print(f"\nAction Counts:")
+        for action, count in sorted(summary["action_counts"].items(), key=lambda x: x[1], reverse=True):
+            print(f"  {action}: {count}")
+        
+        print(f"\nAction Timeline (前 10 个):")
+        timeline = data["action_timeline"]
+        for act in timeline[:10]:
+            print(f"  Frame {act['frame']}: {act['action']} ({act['type']}, {act['duration_frames']} frames)")
+        
+        if data["complex_actions"]:
+            print(f"\nComplex Actions:")
+            for act in data["complex_actions"]:
+                print(f"  {act['action']}: frames {act['start_frame']}-{act['end_frame']} ({act['duration_frames']} frames)")
+        
+        # Generate description
+        desc = generate_action_description(data["action_timeline"])
+        print(f"\nHuman-readable Description:")
+        print(f"  {desc}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Decode pose_trace into action names")
+    parser.add_argument("--face-json", required=True, help="Path to face_traced.json")
+    parser.add_argument("--output-json", help="Output action data JSON")
+    parser.add_argument("--output-plot", help="Output action timeline plot PNG")
+    parser.add_argument("--trace-id", type=int, help="Analyze specific trace only")
+    args = parser.parse_args()
+    
+    print("=" * 70)
+    print("Pose Action Decoder")
+    print("=" * 70)
+    
+    with open(args.face_json) as f:
+        face_data = json.load(f)
+    
+    traces = face_data.get("traces", {})
+    
+    if not traces:
+        print("No traces found in face_traced.json")
+        return
+    
+    # Filter by trace_id if specified
+    if args.trace_id:
+        traces = {str(args.trace_id): traces.get(str(args.trace_id))}
+        if not traces[str(args.trace_id)]:
+            print(f"Trace {args.trace_id} not found")
+            return
+    
+    print(f"\nAnalyzing {len(traces)} traces...")
+    
+    action_data = {"traces": {}}
+    
+    for trace_id_str, trace in traces.items():
+        action_result = build_action_timeline(trace)
+        action_data["traces"][trace_id_str] = action_result
+    
+    print_action_report(action_data)
+    
+    if args.output_json:
+        with open(args.output_json, "w") as f:
+            json.dump(action_data, f, indent=2)
+        print(f"\n✅ Action data saved to: {args.output_json}")
+    
+    if args.output_plot:
+        visualize_action_timeline(action_data, args.output_plot)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/utils/pose_analyzer.py
+++ b/scripts/utils/pose_analyzer.py
@@ -0,0 +1,402 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Pose Analyzer - Multi-feature Pose Angle Classification
+
+Purpose:
+1. Calculate pose angle from 5-point landmarks (InsightFace kps)
+2. Use multiple features for accurate classification:
+   - nose_to_eye_ratio: nose distance relative to eye width
+   - eye_slope: eye line slope (pitch detection)
+   - nose_offset: nose position relative to eye center
+   - mouth_symmetry: mouth corners symmetry
+3. Provide confidence score for classification
+
+Landmarks Order (InsightFace kps):
+- 0: left eye
+- 1: right eye
+- 2: nose
+- 3: left mouth corner
+- 4: right mouth corner
+
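+Angles (the ratio cut-offs listed here are the legacy V1 rules; V2 uses
+0.35 / 0.55 combined with the normalized nose offset, see
+classify_angle_from_features):
+- frontal: nose near center, low ratio (< 0.4)
+- three_quarter: moderate offset (ratio 0.4 - 0.6)
+- profile_left: nose left of eye center (ratio >= 0.6)
+- profile_right: nose right of eye center (ratio >= 0.6)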
+
+Usage:
+    from pose_analyzer import calculate_pose_angle_v2
+    
+    pose_result = calculate_pose_angle_v2(landmarks)
+    print(f"Angle: {pose_result['angle']}, Confidence: {pose_result['confidence']}")
+"""
+
+import numpy as np
+from typing import Dict, List, Optional, Tuple
+
+
+def calculate_nose_to_eye_ratio(landmarks: List) -> Tuple[float, float, float]:
+    """
+    Calculate nose-to-eye ratio
+    
+    Returns:
+        (ratio, eye_width, nose_to_eye_distance)
+    """
+    if len(landmarks) < 5:
+        return (0.0, 0.0, 0.0)
+    
+    left_eye = np.array(landmarks[0][:2])
+    right_eye = np.array(landmarks[1][:2])
+    nose = np.array(landmarks[2][:2])
+    
+    eye_center = (left_eye + right_eye) / 2
+    eye_width = np.linalg.norm(right_eye - left_eye)
+    nose_to_eye = np.linalg.norm(nose - eye_center)
+    
+    ratio = nose_to_eye / eye_width if eye_width > 0 else 0.0
+    
+    return (ratio, eye_width, nose_to_eye)
+
+
+def calculate_eye_slope(landmarks: List) -> Tuple[float, float]:
+    """
+    Calculate eye line slope (for pitch detection)
+    
+    Positive slope = head tilted down
+    Negative slope = head tilted up
+    
+    Returns:
+        (slope, angle_degrees)
+    """
+    if len(landmarks) < 5:
+        return (0.0, 0.0)
+    
+    left_eye = np.array(landmarks[0][:2])
+    right_eye = np.array(landmarks[1][:2])
+    
+    dx = right_eye[0] - left_eye[0]
+    dy = right_eye[1] - left_eye[1]
+    
+    slope = dy / dx if dx != 0 else 0.0
+    angle_degrees = np.arctan(slope) * 180 / np.pi
+    
+    return (slope, angle_degrees)
+
+
+def calculate_nose_offset(landmarks: List) -> Tuple[float, float]:
+    """
+    Calculate nose horizontal offset relative to eye center
+    
+    Returns:
+        (offset_x, normalized_offset)
+    """
+    if len(landmarks) < 5:
+        return (0.0, 0.0)
+    
+    left_eye = np.array(landmarks[0][:2])
+    right_eye = np.array(landmarks[1][:2])
+    nose = np.array(landmarks[2][:2])
+    
+    eye_center = (left_eye + right_eye) / 2
+    eye_width = np.linalg.norm(right_eye - left_eye)
+    
+    offset_x = nose[0] - eye_center[0]
+    normalized_offset = offset_x / eye_width if eye_width > 0 else 0.0
+    
+    return (offset_x, normalized_offset)
+
+
+def calculate_mouth_symmetry(landmarks: List) -> Tuple[float, float]:
+    """
+    Calculate mouth corners symmetry
+    
+    For profile faces, mouth corners are asymmetric
+    
+    Returns:
+        (symmetry_score, mouth_width)
+    """
+    if len(landmarks) < 5:
+        return (1.0, 0.0)
+    
+    left_mouth = np.array(landmarks[3][:2])
+    right_mouth = np.array(landmarks[4][:2])
+    nose = np.array(landmarks[2][:2])
+    
+    mouth_width = np.linalg.norm(right_mouth - left_mouth)
+    
+    left_dist = np.linalg.norm(left_mouth - nose)
+    right_dist = np.linalg.norm(right_mouth - nose)
+    
+    symmetry = min(left_dist, right_dist) / max(left_dist, right_dist) if max(left_dist, right_dist) > 0 else 1.0
+    
+    return (symmetry, mouth_width)
+
+
+def calculate_jaw_visibility_hint(landmarks: List) -> float:
+    """
+    Estimate jaw visibility from mouth position
+    
+    For profile faces, one side of jaw is more visible
+    
+    Returns:
+        visibility_hint (0.0 - 1.0)
+    """
+    if len(landmarks) < 5:
+        return 0.5
+    
+    left_eye = np.array(landmarks[0][:2])
+    right_eye = np.array(landmarks[1][:2])
+    nose = np.array(landmarks[2][:2])
+    left_mouth = np.array(landmarks[3][:2])
+    right_mouth = np.array(landmarks[4][:2])
+    
+    eye_center_y = (left_eye[1] + right_eye[1]) / 2
+    mouth_center_y = (left_mouth[1] + right_mouth[1]) / 2
+    
+    nose_to_mouth_dist = mouth_center_y - nose[1]
+    
+    eye_to_nose_dist = nose[1] - eye_center_y
+    
+    ratio = nose_to_mouth_dist / eye_to_nose_dist if eye_to_nose_dist > 0 else 0.5
+    
+    return min(1.0, max(0.0, ratio))
+
+
+def classify_angle_from_features(
+    ratio: float,
+    nose_offset_norm: float,
+    mouth_symmetry: float,
+    eye_slope: float,
+) -> Tuple[str, float]:
+    """
+    Classify angle using multiple features
+    
+    Returns:
+        (angle_type, confidence)
+    """
+    if ratio < 0.35 and abs(nose_offset_norm) < 0.15:
+        return ("frontal", 0.95)
+    
+    if ratio < 0.55 and abs(nose_offset_norm) < 0.25:
+        return ("three_quarter", 0.85)
+    
+    if ratio >= 0.55:
+        if nose_offset_norm < -0.1:
+            if mouth_symmetry < 0.85:
+                return ("profile_left", 0.90)
+            else:
+                return ("profile_left", 0.75)
+        elif nose_offset_norm > 0.1:
+            if mouth_symmetry < 0.85:
+                return ("profile_right", 0.90)
+            else:
+                return ("profile_right", 0.75)
+        else:
+            return ("three_quarter", 0.70)
+    
+    return ("unknown", 0.50)
+
+
+def calculate_pose_angle_v2(landmarks: List) -> Dict:
+    """
+    Calculate pose angle using multi-feature analysis (V2)
+    
+    This is an improved version that uses multiple features:
+    - nose_to_eye_ratio
+    - eye_slope (pitch)
+    - nose_offset (yaw)
+    - mouth_symmetry
+    
+    Args:
+        landmarks: List of 5 points [[x, y], [x, y], ...]
+                   Order: left_eye, right_eye, nose, left_mouth, right_mouth
+    
+    Returns:
+        Dict with:
+        - angle: 'frontal', 'three_quarter', 'profile_left', 'profile_right', 'unknown'
+        - confidence: 0.0 - 1.0
+        - features: Dict of all calculated features
+    """
+    if len(landmarks) < 5:
+        return {
+            "angle": "unknown",
+            "confidence": 0.0,
+            "features": {},
+            "method": "v2_multi_feature",
+        }
+    
+    ratio, eye_width, nose_to_eye = calculate_nose_to_eye_ratio(landmarks)
+    eye_slope, eye_angle = calculate_eye_slope(landmarks)
+    nose_offset, nose_offset_norm = calculate_nose_offset(landmarks)
+    mouth_symmetry, mouth_width = calculate_mouth_symmetry(landmarks)
+    jaw_hint = calculate_jaw_visibility_hint(landmarks)
+    
+    angle, confidence = classify_angle_from_features(
+        ratio=ratio,
+        nose_offset_norm=nose_offset_norm,
+        mouth_symmetry=mouth_symmetry,
+        eye_slope=eye_slope,
+    )
+    
+    if eye_slope > 0.15:
+        pitch = "tilted_down"
+    elif eye_slope < -0.15:
+        pitch = "tilted_up"
+    else:
+        pitch = "neutral"
+    
+    return {
+        "angle": angle,
+        "confidence": confidence,
+        "pitch": pitch,
+        "features": {
+            "nose_to_eye_ratio": round(ratio, 4),
+            "eye_width": round(eye_width, 2),
+            "nose_to_eye_dist": round(nose_to_eye, 2),
+            "eye_slope": round(eye_slope, 4),
+            "eye_angle_deg": round(eye_angle, 2),
+            "nose_offset_x": round(nose_offset, 2),
+            "nose_offset_norm": round(nose_offset_norm, 4),
+            "mouth_symmetry": round(mouth_symmetry, 4),
+            "mouth_width": round(mouth_width, 2),
+            "jaw_visibility_hint": round(jaw_hint, 4),
+        },
+        "method": "v2_multi_feature",
+        "landmarks_count": len(landmarks),
+    }
+
+
+def calculate_pose_angle_v1(landmarks: List) -> Dict:
+    """
+    Legacy version (V1) - single feature ratio-based
+    
+    For comparison purposes only
+    """
+    if len(landmarks) < 5:
+        return {"angle": "unknown", "confidence": 0.0}
+    
+    left_eye = np.array(landmarks[0][:2])
+    right_eye = np.array(landmarks[1][:2])
+    nose = np.array(landmarks[2][:2])
+    
+    eye_center = (left_eye + right_eye) / 2
+    eye_width = np.linalg.norm(right_eye - left_eye)
+    nose_to_eye = np.linalg.norm(nose - eye_center)
+    
+    ratio = nose_to_eye / eye_width if eye_width > 0 else 0.0
+    
+    if ratio < 0.4:
+        angle = "frontal"
+    elif ratio < 0.6:
+        angle = "three_quarter"
+    elif nose[0] < eye_center[0]:
+        angle = "profile_left"
+    else:
+        angle = "profile_right"
+    
+    return {
+        "angle": angle,
+        "confidence": 0.7,
+        "ratio": round(ratio, 4),
+        "method": "v1_single_feature",
+    }
+
+
+def compare_v1_v2(landmarks: List) -> Dict:
+    """
+    Compare V1 and V2 classification results
+    
+    Useful for validation and debugging
+    """
+    v1_result = calculate_pose_angle_v1(landmarks)
+    v2_result = calculate_pose_angle_v2(landmarks)
+    
+    return {
+        "v1": v1_result,
+        "v2": v2_result,
+        "agreement": v1_result["angle"] == v2_result["angle"],
+        "confidence_improvement": v2_result["confidence"] - v1_result["confidence"],
+    }
+
+
+def batch_classify_angles(face_json_path: str) -> Dict:
+    """
+    Batch classify all faces in face.json
+    
+    Returns:
+        Statistics and per-frame results
+    """
+    import json
+    
+    with open(face_json_path) as f:
+        data = json.load(f)
+    
+    frames = data.get("frames", {})
+    
+    results = []
+    angle_counts = {}
+    confidence_stats = []
+    
+    for frame_key, frame_data in frames.items():
+        for face_idx, face in enumerate(frame_data.get("faces", [])):
+            landmarks = face.get("landmarks", [])
+            
+            if not landmarks or len(landmarks) < 5:
+                continue
+            
+            pose_result = calculate_pose_angle_v2(landmarks)
+            pose_result["frame"] = frame_key
+            pose_result["face_index"] = face_idx
+            
+            results.append(pose_result)
+            
+            angle = pose_result["angle"]
+            angle_counts[angle] = angle_counts.get(angle, 0) + 1
+            confidence_stats.append(pose_result["confidence"])
+    
+    return {
+        "total_faces": len(results),
+        "angle_distribution": angle_counts,
+        "confidence_avg": np.mean(confidence_stats) if confidence_stats else 0.0,
+        "confidence_min": np.min(confidence_stats) if confidence_stats else 0.0,
+        "confidence_max": np.max(confidence_stats) if confidence_stats else 0.0,
+        "results": results,
+    }
+
+
+if __name__ == "__main__":
+    import argparse
+    
+    parser = argparse.ArgumentParser(description="Pose Analyzer")
+    parser.add_argument("--face-json", help="Path to face.json for batch analysis")
+    parser.add_argument("--test", action="store_true", help="Run unit tests")
+    args = parser.parse_args()
+    
+    if args.test:
+        print("=" * 60)
+        print("Pose Analyzer Unit Tests")
+        print("=" * 60)
+        
+        test_landmarks = [
+            [[100, 100], [120, 100], [110, 120], [105, 130], [115, 130]],
+            [[100, 100], [120, 100], [125, 120], [105, 130], [115, 130]],
+            [[100, 100], [120, 100], [95, 120], [105, 130], [115, 130]],
+        ]
+        
+        for i, lm in enumerate(test_landmarks):
+            result = calculate_pose_angle_v2(lm)
+            print(f"\nTest {i+1}: {result['angle']} (confidence: {result['confidence']:.2f})")
+            print(f"  Features: {result['features']}")
+    
+    elif args.face_json:
+        print("=" * 60)
+        print("Batch Pose Analysis")
+        print("=" * 60)
+        
+        batch_result = batch_classify_angles(args.face_json)
+        
+        print(f"\nTotal faces: {batch_result['total_faces']}")
+        print(f"Angle distribution: {batch_result['angle_distribution']}")
+        print(f"Confidence: avg={batch_result['confidence_avg']:.2f}, min={batch_result['confidence_min']:.2f}, max={batch_result['confidence_max']:.2f}")
+    else:
+        print("Please provide --face-json or --test")
--- a/scripts/utils/pose_transition_analyzer.py
+++ b/scripts/utils/pose_transition_analyzer.py
@@ -0,0 +1,239 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Pose Transition Analyzer - Analyze pose changes within traces
+
+Purpose:
+1. Visualize pose transitions over time
+2. Calculate transition frequency and duration
+3. Identify pose stability patterns
+
+Output:
+1. Pose transition timeline
+2. Pose duration statistics
+3. Stability score per trace
+"""
+
+import sys
+import json
+import argparse
+import numpy as np
+import matplotlib.pyplot as plt
+from typing import Dict, List
+from collections import defaultdict
+
+
+def analyze_pose_transitions(face_data: Dict) -> Dict:
+    """
+    Analyze pose transitions for all traces
+    
+    Returns:
+        Dict with transition analysis results
+    """
+    traces = face_data.get("traces", {})
+    
+    if not traces:
+        return {}
+    
+    analysis = {}
+    
+    for trace_id_str, trace in traces.items():
+        trace_id = int(trace_id_str)
+        pose_trace = trace.get("pose_trace", [])
+        transitions = trace.get("pose_transitions", [])
+        
+        if len(pose_trace) < 2:
+            continue
+        
+        # Pose duration analysis
+        pose_segments = []
+        current_pose = pose_trace[0]["angle"]
+        current_start = pose_trace[0]["frame"]
+        
+        for i, pose in enumerate(pose_trace[1:], 1):
+            if pose["angle"] != current_pose:
+                pose_segments.append({
+                    "angle": current_pose,
+                    "start_frame": current_start,
+                    "end_frame": pose_trace[i-1]["frame"],
+                    "duration_frames": pose_trace[i-1]["frame"] - current_start + 1,
+                    "avg_confidence": np.mean([
+                        p["confidence"] 
+                        for p in pose_trace[current_start-pose_trace[0]["frame"]:i]
+                    ]),
+                })
+                current_pose = pose["angle"]
+                current_start = pose["frame"]
+        
+        # Add last segment
+        pose_segments.append({
+            "angle": current_pose,
+            "start_frame": current_start,
+            "end_frame": pose_trace[-1]["frame"],
+            "duration_frames": pose_trace[-1]["frame"] - current_start + 1,
+            "avg_confidence": np.mean([
+                p["confidence"] 
+                for p in pose_trace[current_start-pose_trace[0]["frame"]:]
+            ]),
+        })
+        
+        # Transition frequency
+        transition_frequency = len(transitions) / trace["duration_seconds"] if trace["duration_seconds"] > 0 else 0
+        
+        # Stability score (inverse of transition frequency)
+        stability_score = 1.0 - min(transition_frequency / 2.0, 1.0)  # 2 transitions/second = fully unstable
+        
+        # Pose average duration
+        pose_avg_duration = {}
+        for angle in set([s["angle"] for s in pose_segments]):
+            segments_for_angle = [s for s in pose_segments if s["angle"] == angle]
+            avg_dur = np.mean([s["duration_frames"] for s in segments_for_angle])
+            pose_avg_duration[angle] = round(avg_dur, 1)
+        
+        analysis[trace_id] = {
+            "trace_id": trace_id,
+            "total_transitions": len(transitions),
+            "transition_frequency": round(transition_frequency, 3),  # transitions per second
+            "stability_score": round(stability_score, 3),  # 0-1, higher = more stable
+            "pose_segments": pose_segments,
+            "pose_avg_duration": pose_avg_duration,
+            "longest_stable_pose": max(pose_segments, key=lambda x: x["duration_frames"]),
+            "transition_events": transitions,
+        }
+    
+    return analysis
+
+
+def visualize_pose_transitions(face_data: Dict, output_path: str = None) -> None:
+    """
+    Visualize pose transitions for all traces
+    """
+    traces = face_data.get("traces", {})
+    
+    if not traces:
+        print("No traces found")
+        return
+    
+    sorted_traces = sorted(traces.values(), key=lambda x: x["duration_frames"], reverse=True)
+    
+    fig, axes = plt.subplots(len(sorted_traces), 1, figsize=(16, 4 * len(sorted_traces)))
+    
+    if len(sorted_traces) == 1:
+        axes = [axes]
+    
+    pose_colors = {
+        "frontal": "green",
+        "three_quarter": "blue",
+        "profile_left": "orange",
+        "profile_right": "red",
+        "unknown": "gray",
+    }
+    
+    for ax, trace in zip(axes, sorted_traces):
+        trace_id = trace["trace_id"]
+        pose_trace = trace.get("pose_trace", [])
+        
+        if not pose_trace:
+            continue
+        
+        frames = [p["frame"] for p in pose_trace]
+        angles = [p["angle"] for p in pose_trace]
+        confidences = [p["confidence"] for p in pose_trace]
+        
+        # Plot pose angle timeline
+        for i in range(len(frames) - 1):
+            color = pose_colors.get(angles[i], "gray")
+            ax.fill_between(
+                [frames[i], frames[i+1]],
+                [0, 0],
+                [1, 1],
+                color=color,
+                alpha=0.6,
+            )
+        
+        # Mark transitions
+        transitions = trace.get("pose_transitions", [])
+        for t in transitions:
+            ax.axvline(x=t["frame"], color="black", linestyle="--", alpha=0.5, linewidth=1)
+            ax.text(t["frame"], 1.05, f"{t['from_angle']}→{t['to_angle']}", 
+                    fontsize=8, rotation=90, va="bottom", ha="center")
+        
+        # Plot confidence line
+        ax2 = ax.twinx()
+        ax2.plot(frames, confidences, color="purple", linewidth=1, alpha=0.7, label="Confidence")
+        ax2.set_ylabel("Confidence", color="purple")
+        ax2.set_ylim(0, 1)
+        
+        ax.set_xlabel("Frame Number")
+        ax.set_ylabel("Pose Angle")
+        ax.set_title(f"Trace {trace_id} Pose Timeline (Frames {trace['start_frame']}-{trace['end_frame']})")
+        ax.set_ylim(0, 1.2)
+        
+        # Add pose legend
+        legend_elements = []
+        for pose in set(angles):
+            color = pose_colors.get(pose, "gray")
+            legend_elements.append(plt.Rectangle((0, 0), 1, 1, fc=color, alpha=0.6, label=pose))
+        ax.legend(handles=legend_elements, loc="upper right", fontsize=8)
+    
+    plt.tight_layout()
+    
+    if output_path:
+        plt.savefig(output_path, dpi=150, bbox_inches="tight")
+        print(f"\n✅ Visualization saved to: {output_path}")
+    else:
+        plt.show()
+
+
+def print_transition_analysis(analysis: Dict) -> None:
+    """
+    Print transition analysis results
+    """
+    print("\n" + "=" * 60)
+    print("Pose Transition Analysis")
+    print("=" * 60)
+    
+    for trace_id, data in sorted(analysis.items()):
+        print(f"\n=== Trace {trace_id} ===")
+        print(f"Total Transitions: {data['total_transitions']}")
+        print(f"Transition Frequency: {data['transition_frequency']} transitions/second")
+        print(f"Stability Score: {data['stability_score']} (0-1, higher = more stable)")
+        print(f"Longest Stable Pose: {data['longest_stable_pose']['angle']} ({data['longest_stable_pose']['duration_frames']} frames)")
+        
+        print(f"\nPose Average Duration:")
+        for angle, avg_dur in data['pose_avg_duration'].items():
+            print(f"  {angle}: {avg_dur} frames")
+        
+        print(f"\nPose Segments (共 {len(data['pose_segments'])} 个):")
+        for seg in data['pose_segments'][:5]:
+            print(f"  {seg['angle']}: frames {seg['start_frame']}-{seg['end_frame']} ({seg['duration_frames']} frames, confidence: {seg['avg_confidence']:.3f})")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Analyze pose transitions in face traces")
+    parser.add_argument("--face-json", required=True, help="Path to face_traced.json")
+    parser.add_argument("--output-plot", help="Output plot path (PNG)")
+    parser.add_argument("--output-json", help="Output analysis JSON path")
+    args = parser.parse_args()
+    
+    with open(args.face_json) as f:
+        face_data = json.load(f)
+    
+    print("=" * 60)
+    print("Pose Transition Analyzer")
+    print("=" * 60)
+    
+    analysis = analyze_pose_transitions(face_data)
+    
+    print_transition_analysis(analysis)
+    
+    if args.output_json:
+        with open(args.output_json, "w") as f:
+            json.dump(analysis, f, indent=2)
+        print(f"\n✅ Analysis saved to: {args.output_json}")
+    
+    if args.output_plot:
+        visualize_pose_transitions(face_data, args.output_plot)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/utils/test_mediapipe.py
+++ b/scripts/utils/test_mediapipe.py
@@ -0,0 +1,377 @@
+#!/opt/homebrew/bin/python3.11
+"""
+MediaPipe Test Script - Test all MediaPipe modules
+
+Test modules:
+1. Face Mesh (468 keypoints; 478 with refine_landmarks=True)
+2. Pose (33 keypoints)
+3. Hands (21 keypoints per hand)
+4. Holistic (Face + Pose + Hands)
+"""
+
+import sys
+import cv2
+import numpy as np
+import mediapipe as mp
+from pathlib import Path
+
+
+def test_face_mesh():
+    """
+    Test MediaPipe Face Mesh (468 keypoints)
+    """
+    print("=" * 60)
+    print("Testing MediaPipe Face Mesh")
+    print("=" * 60)
+    
+    mp_face_mesh = mp.solutions.face_mesh
+    
+    # Create Face Mesh model
+    face_mesh = mp_face_mesh.FaceMesh(
+        static_image_mode=True,
+        max_num_faces=1,
+        refine_landmarks=True,  # Enable iris detection
+        min_detection_confidence=0.5,
+    )
+    
+    print("✅ Face Mesh model created")
+    
+    # Test on sample image
+    test_image_path = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"
+    
+    if Path(test_image_path).exists():
+        image = cv2.imread(test_image_path)
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        
+        results = face_mesh.process(image_rgb)
+        
+        if results.multi_face_landmarks:
+            face_landmarks = results.multi_face_landmarks[0]
+            num_landmarks = len(face_landmarks.landmark)
+            
+            print(f"✅ Face detected: {num_landmarks} landmarks")
+            
+            # Key landmark indices
+            key_indices = {
+                "nose_tip": 1,
+                "left_eye_center": 33,
+                "right_eye_center": 263,
+                "left_iris_center": 468,
+                "right_iris_center": 473,
+                "mouth_top": 13,
+                "mouth_bottom": 14,
+                "mouth_left": 61,
+                "mouth_right": 291,
+            }
+            
+            print("\nKey landmarks:")
+            for name, idx in key_indices.items():
+                if idx < num_landmarks:
+                    landmark = face_landmarks.landmark[idx]
+                    print(f"  {name} ({idx}): x={landmark.x:.3f}, y={landmark.y:.3f}")
+            
+            # Calculate Eye Aspect Ratio (EAR)
+            # Left eye
+            p1 = face_landmarks.landmark[33]  # Left eye top
+            p2 = face_landmarks.landmark[133]  # Left eye bottom
+            p3 = face_landmarks.landmark[159]  # Left eye left
+            p4 = face_landmarks.landmark[145]  # Left eye right
+            
+            vertical_dist = abs(p2.y - p1.y)
+            horizontal_dist = abs(p4.x - p3.x)
+            ear_left = vertical_dist / horizontal_dist if horizontal_dist > 0 else 0
+            
+            print(f"\nEye Aspect Ratio (EAR):")
+            print(f"  Left eye EAR: {ear_left:.3f}")
+            print(f"  Interpretation: {'wide_open' if ear_left > 0.35 else 'normal' if ear_left > 0.2 else 'closed'}")
+            
+            # Calculate Mouth Aspect Ratio (MAR)
+            mouth_top = face_landmarks.landmark[13]
+            mouth_bottom = face_landmarks.landmark[14]
+            mouth_left = face_landmarks.landmark[61]
+            mouth_right = face_landmarks.landmark[291]
+            
+            mouth_height = abs(mouth_bottom.y - mouth_top.y)
+            mouth_width = abs(mouth_right.x - mouth_left.x)
+            mar = mouth_height / mouth_width if mouth_width > 0 else 0
+            
+            print(f"\nMouth Aspect Ratio (MAR):")
+            print(f"  MAR: {mar:.3f}")
+            print(f"  Interpretation: {'open' if mar > 0.5 else 'closed' if mar < 0.2 else 'slightly_open'}")
+        else:
+            print("❌ No face detected")
+    
+    face_mesh.close()
+    print("\n✅ Face Mesh test completed")
+
+
+def test_pose():
+    """
+    Test MediaPipe Pose (33 keypoints)
+    """
+    print("\n" + "=" * 60)
+    print("Testing MediaPipe Pose")
+    print("=" * 60)
+    
+    mp_pose = mp.solutions.pose
+    
+    pose = mp_pose.Pose(
+        static_image_mode=True,
+        model_complexity=2,  # Full model
+        enable_segmentation=False,
+        min_detection_confidence=0.5,
+    )
+    
+    print("✅ Pose model created")
+    
+    test_image_path = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"
+    
+    if Path(test_image_path).exists():
+        image = cv2.imread(test_image_path)
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        
+        results = pose.process(image_rgb)
+        
+        if results.pose_landmarks:
+            landmarks = results.pose_landmarks.landmark
+            num_landmarks = len(landmarks)
+            
+            print(f"✅ Pose detected: {num_landmarks} keypoints")
+            
+            # Key keypoints
+            key_indices = {
+                "nose": 0,
+                "left_shoulder": 11,
+                "right_shoulder": 12,
+                "left_elbow": 13,
+                "right_elbow": 14,
+                "left_wrist": 15,
+                "right_wrist": 16,
+                "left_hip": 23,
+                "right_hip": 24,
+                "left_knee": 25,
+                "right_knee": 26,
+                "left_ankle": 27,
+                "right_ankle": 28,
+            }
+            
+            print("\nKey keypoints:")
+            for name, idx in key_indices.items():
+                landmark = landmarks[idx]
+                print(f"  {name} ({idx}): x={landmark.x:.3f}, y={landmark.y:.3f}, visibility={landmark.visibility:.2f}")
+            
+            # Calculate elbow angles
+            def calculate_angle(p1, p2, p3):
+                v1 = np.array([p1.x, p1.y]) - np.array([p2.x, p2.y])
+                v2 = np.array([p3.x, p3.y]) - np.array([p2.x, p2.y])
+                angle = np.arccos(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
+                return np.degrees(angle)
+            
+            # Right arm angle
+            right_shoulder = landmarks[12]
+            right_elbow = landmarks[14]
+            right_wrist = landmarks[16]
+            
+            right_elbow_angle = calculate_angle(right_shoulder, right_elbow, right_wrist)
+            
+            print(f"\nRight elbow angle: {right_elbow_angle:.1f}°")
+            print(f"  Interpretation: {'extended' if right_elbow_angle > 150 else 'folded' if right_elbow_angle < 90 else 'neutral'}")
+            
+            # Check if arm is raised
+            if right_wrist.y < right_elbow.y < right_shoulder.y:
+                print(f"  Action: raise_right (arm raised)")
+            
+            # Knee angles
+            left_hip = landmarks[23]
+            left_knee = landmarks[25]
+            left_ankle = landmarks[27]
+            
+            left_knee_angle = calculate_angle(left_hip, left_knee, left_ankle)
+            
+            print(f"\nLeft knee angle: {left_knee_angle:.1f}°")
+            print(f"  Interpretation: {'standing' if left_knee_angle > 160 else 'knee_bend' if left_knee_angle < 120 else 'neutral'}")
+        else:
+            print("❌ No pose detected")
+    
+    pose.close()
+    print("\n✅ Pose test completed")
+
+
+def test_hands():
+    """
+    Test MediaPipe Hands (21 keypoints per hand)
+    """
+    print("\n" + "=" * 60)
+    print("Testing MediaPipe Hands")
+    print("=" * 60)
+    
+    mp_hands = mp.solutions.hands
+    
+    hands = mp_hands.Hands(
+        static_image_mode=True,
+        max_num_hands=2,
+        min_detection_confidence=0.5,
+    )
+    
+    print("✅ Hands model created")
+    
+    test_image_path = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"
+    
+    if Path(test_image_path).exists():
+        image = cv2.imread(test_image_path)
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        
+        results = hands.process(image_rgb)
+        
+        if results.multi_hand_landmarks:
+            for idx, hand_landmarks in enumerate(results.multi_hand_landmarks):
+                hand_label = results.multi_handedness[idx].classification[0].label
+                
+                print(f"\n✅ Hand {idx+1} detected ({hand_label}): 21 keypoints")
+                
+                landmarks = hand_landmarks.landmark
+                
+                # Key landmarks
+                key_indices = {
+                    "wrist": 0,
+                    "thumb_tip": 4,
+                    "index_tip": 8,
+                    "middle_tip": 12,
+                    "ring_tip": 16,
+                    "pinky_tip": 20,
+                }
+                
+                print(f"  Key landmarks:")
+                for name, i in key_indices.items():
+                    lm = landmarks[i]
+                    print(f"    {name} ({i}): x={lm.x:.3f}, y={lm.y:.3f}")
+                
+                # Detect gesture
+                thumb_tip = landmarks[4]
+                index_tip = landmarks[8]
+                middle_tip = landmarks[12]
+                ring_tip = landmarks[16]
+                pinky_tip = landmarks[20]
+                wrist = landmarks[0]
+                
+                # Calculate finger extensions
+                def is_finger_extended(tip, base, wrist):
+                    return tip.y < base.y  # Extended upward
+                
+                thumb_extended = is_finger_extended(landmarks[4], landmarks[2], wrist)
+                index_extended = is_finger_extended(landmarks[8], landmarks[5], wrist)
+                middle_extended = is_finger_extended(landmarks[12], landmarks[9], wrist)
+                ring_extended = is_finger_extended(landmarks[16], landmarks[13], wrist)
+                pinky_extended = is_finger_extended(landmarks[20], landmarks[17], wrist)
+                
+                extensions = [thumb_extended, index_extended, middle_extended, ring_extended, pinky_extended]
+                
+                print(f"\n  Finger extensions: {['thumb', 'index', 'middle', 'ring', 'pinky']}")
+                print(f"    {extensions}")
+                
+                # Detect gesture
+                gesture = "unknown"
+                if all(extensions):
+                    gesture = "open_hand"
+                elif not any(extensions):
+                    gesture = "fist"
+                elif thumb_extended and not any(extensions[1:]):
+                    gesture = "thumbs_up"
+                elif index_extended and middle_extended and not any(extensions[2:]):
+                    gesture = "peace_sign"
+                elif index_extended and not any(extensions[2:]) and not thumb_extended:
+                    gesture = "pointing"
+                
+                print(f"  Detected gesture: {gesture}")
+        else:
+            print("❌ No hands detected")
+    
+    hands.close()
+    print("\n✅ Hands test completed")
+
+
+def test_holistic():
+    """
+    Test MediaPipe Holistic (Face + Pose + Hands combined)
+    """
+    print("\n" + "=" * 60)
+    print("Testing MediaPipe Holistic")
+    print("=" * 60)
+    
+    mp_holistic = mp.solutions.holistic
+    
+    holistic = mp_holistic.Holistic(
+        static_image_mode=True,
+        model_complexity=2,
+        enable_segmentation=False,
+        refine_face_landmarks=True,
+    )
+    
+    print("✅ Holistic model created")
+    
+    test_image_path = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"
+    
+    if Path(test_image_path).exists():
+        image = cv2.imread(test_image_path)
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        
+        results = holistic.process(image_rgb)
+        
+        detected_count = 0
+        
+        if results.face_landmarks:
+            num_face = len(results.face_landmarks.landmark)
+            print(f"✅ Face: {num_face} landmarks")
+            detected_count += 1
+        
+        if results.pose_landmarks:
+            num_pose = len(results.pose_landmarks.landmark)
+            print(f"✅ Pose: {num_pose} keypoints")
+            detected_count += 1
+        
+        if results.left_hand_landmarks:
+            num_left_hand = len(results.left_hand_landmarks.landmark)
+            print(f"✅ Left hand: {num_left_hand} keypoints")
+            detected_count += 1
+        
+        if results.right_hand_landmarks:
+            num_right_hand = len(results.right_hand_landmarks.landmark)
+            print(f"✅ Right hand: {num_right_hand} keypoints")
+            detected_count += 1
+        
+        if detected_count == 0:
+            print("❌ No landmarks detected")
+        else:
+            print(f"\nTotal detections: {detected_count} components")
+    
+    holistic.close()
+    print("\n✅ Holistic test completed")
+
+
+def main():
+    print("=" * 70)
+    print("MediaPipe Installation Test")
+    print("=" * 70)
+    
+    print(f"\nMediaPipe version: {mp.__version__}")
+    print()
+    
+    # Test all modules
+    test_face_mesh()
+    test_pose()
+    test_hands()
+    test_holistic()
+    
+    print("\n" + "=" * 70)
+    print("✅ All MediaPipe tests completed!")
+    print("=" * 70)
+    
+    print("\nNext steps:")
+    print("  1. Face Mesh: Use for eye/mouth action detection")
+    print("  2. Pose: Use for arm/leg/feet action detection")
+    print("  3. Hands: Use for hand gesture detection")
+    print("  4. Holistic: Use for full-body action detection")
+
+
+if __name__ == "__main__":
+    main()