feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
This commit is contained in:
Warren
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions

View File

@@ -0,0 +1,877 @@
#!/opt/homebrew/bin/python3.11
"""
Body Action Decoder - Extended pose action analysis with body keypoints
Purpose:
1. Decode face pose actions (existing)
2. Decode body actions (future MediaPipe Holistic)
3. Integrate face + body actions for comprehensive analysis
Body Keypoints (MediaPipe Holistic):
- Face: 468 points (eyes, mouth, nose, etc.)
- Pose: 33 points (shoulders, elbows, hands, hips, knees, feet)
- Hands: 21 points per hand
Action Types:
- Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head
- Eyes: blink, close, wide_open, look_left, look_right
- Mouth: open, close, smile, talk, yawn
- Arms: raise_left, raise_right, cross_arms, wave
- Hands: point, grab, clap, thumbs_up, fist
- Legs: stand, sit, walk, run, jump, kick
- Feet: tap, stomp, cross
Architecture:
┌─────────────────────────────────────────────────────────────────┐
│ Body Action Decoder │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │
│ │ Face Actions │ │ Body Actions │ │ Hand Actions │ │
│ │ (InsightFace) │ │ (MediaPipe) │ │ (MediaPipe) │ │
│ └───────────────┘ └───────────────┘ └───────────────┘ │
│ │ │ │ │
│ └──────────────────┼──────────────────┘ │
│ │ │
│ ┌───────▼───────┐ │
│ │ Action Merger│ │
│ └────────────────┘ │
│ │ │
│ ┌───────▼───────┐ │
│ │ Action Timeline│ │
│ └────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
"""
import sys
import json
import argparse
import numpy as np
from typing import Dict, List, Optional
from collections import defaultdict
# =============================================================================
# Face Action Definitions (Existing from pose_action_decoder.py)
# =============================================================================
# Lookup table keyed by (previous_angle, current_angle) face-orientation labels;
# the value is the name of the head-turn action implied by that transition.
# decode_body_actions() consults it with a membership test, so any pair that is
# not listed here (including an unchanged angle) produces no face action.
FACE_TURN_ACTIONS = {
("frontal", "three_quarter"): "turn_partial",
("frontal", "profile_left"): "turn_left",
("frontal", "profile_right"): "turn_right",
("three_quarter", "frontal"): "return_frontal",
("three_quarter", "profile_left"): "turn_left",
("three_quarter", "profile_right"): "turn_right",
("profile_left", "frontal"): "turn_to_frontal",
("profile_left", "three_quarter"): "turn_to_three_quarter",
("profile_left", "profile_right"): "turn_full",
("profile_right", "frontal"): "turn_to_frontal",
("profile_right", "three_quarter"): "turn_to_three_quarter",
("profile_right", "profile_left"): "turn_full",
}
# Same scheme for head pitch: keyed by (previous_pitch, current_pitch) labels.
# Only transitions that start or end at "neutral" are listed; a direct
# tilted_up <-> tilted_down change has no entry and therefore yields no action.
FACE_PITCH_ACTIONS = {
("neutral", "tilted_up"): "look_up",
("neutral", "tilted_down"): "look_down",
("tilted_up", "neutral"): "return_neutral",
("tilted_down", "neutral"): "return_neutral",
}
# =============================================================================
# Eye Action Definitions
# =============================================================================
# Catalogue of eye actions. Each entry carries a "description" (a Chinese
# display label for the key) and a free-text "pattern" describing the intended
# detection rule; timed actions additionally give "min_frames" / "max_frames".
# NOTE(review): nothing in the code visible here reads this table --
# analyze_eye_actions() hard-codes its own 0.15 / 0.4 EAR thresholds. Confirm
# whether another module consumes it before treating it as authoritative.
EYE_ACTIONS = {
"blink": {
"description": "眨眼",
"pattern": "eye_aspect_ratio drops < 0.2 for 1-3 frames",
"min_frames": 1,
"max_frames": 3,
},
"close": {
"description": "闭眼",
"pattern": "eye_aspect_ratio < 0.15 for > 10 frames",
"min_frames": 10,
},
"wide_open": {
"description": "睁大眼",
"pattern": "eye_aspect_ratio > 0.4",
},
"look_left": {
"description": "向左看",
"pattern": "iris_position_x < 0.3",
},
"look_right": {
"description": "向右看",
"pattern": "iris_position_x > 0.7",
},
"squint": {
"description": "眯眼",
"pattern": "eye_aspect_ratio 0.15-0.25",
},
}
# =============================================================================
# Mouth Action Definitions
# =============================================================================
# Catalogue of mouth actions: "description" is a Chinese display label,
# "pattern" is a free-text statement of the intended rule, and "min_frames"
# appears on actions that are meant to persist over several frames.
# NOTE(review): analyze_mouth_actions() works on a single frame and uses its
# own literal thresholds (0.7 / 0.5 / 0.2); it does not read this table, and
# "talk" / "pout" have no detector in the code visible here.
MOUTH_ACTIONS = {
"open": {
"description": "张嘴",
"pattern": "mouth_aspect_ratio > 0.5",
},
"close": {
"description": "闭嘴",
"pattern": "mouth_aspect_ratio < 0.2",
},
"smile": {
"description": "微笑",
"pattern": "mouth_corner_distance > threshold",
},
"talk": {
"description": "说话",
"pattern": "mouth_aspect_ratio oscillating 0.3-0.6",
"min_frames": 10,
},
"yawn": {
"description": "打哈欠",
"pattern": "mouth_aspect_ratio > 0.7 for > 20 frames",
"min_frames": 20,
},
"pout": {
"description": "嘟嘴",
"pattern": "lip_distance > threshold",
},
}
# =============================================================================
# Arm Action Definitions
# =============================================================================
# Catalogue of arm actions. "description" is a Chinese display label,
# "pattern" is a free-text statement of the detection rule (image coordinates,
# y grows downward, so "shoulder_y > elbow_y > wrist_y" means the arm is
# raised), and "min_frames" / "max_frames" bound actions that span frames.
# The "cross_arms" pattern previously repeated the same wrist comparison twice;
# it now states the rule analyze_arm_actions() actually applies.
ARM_ACTIONS = {
    "raise_left": {
        "description": "举起左手",
        "pattern": "left_shoulder_y > left_elbow_y > left_wrist_y",
    },
    "raise_right": {
        "description": "举起右手",
        "pattern": "right_shoulder_y > right_elbow_y > right_wrist_y",
    },
    "raise_both": {
        "description": "双手举起",
        "pattern": "both arms raised",
    },
    "cross_arms": {
        "description": "双手交叉",
        "pattern": "left_wrist_x > right_wrist_x AND right_wrist_x < left_shoulder_x",
    },
    "wave": {
        "description": "挥手",
        "pattern": "wrist_y oscillating ±20px for 5-15 frames",
        "min_frames": 5,
        "max_frames": 15,
    },
    "extend_left": {
        "description": "伸展左臂",
        "pattern": "left_elbow_angle > 150°",
    },
    "extend_right": {
        "description": "伸展右臂",
        "pattern": "right_elbow_angle > 150°",
    },
    "fold_left": {
        "description": "弯曲左臂",
        "pattern": "left_elbow_angle < 90°",
    },
    "fold_right": {
        "description": "弯曲右臂",
        "pattern": "right_elbow_angle < 90°",
    },
    "point": {
        "description": "指向",
        "pattern": "index_finger extended, other fingers folded",
    },
}
# =============================================================================
# Hand Action Definitions
# =============================================================================
# Catalogue of hand gestures: "description" is a Chinese display label,
# "pattern" is a free-text statement of the intended rule, and
# "min_frames" / "max_frames" bound gestures that span several frames.
# NOTE(review): analyze_hand_actions() does not read this table and emits
# side-suffixed names (e.g. "fist_left", "point_right") rather than these bare
# keys; "grab", "clap", "ok", "touch_*" and "pocket_*" have no detector in the
# code visible here.
HAND_ACTIONS = {
"grab": {
"description": "抓取",
"pattern": "fingers folded, thumb opposing",
},
"open": {
"description": "张开手",
"pattern": "all fingers extended",
},
"clap": {
"description": "拍手",
"pattern": "hands together then apart (velocity pattern)",
"min_frames": 3,
"max_frames": 10,
},
"thumbs_up": {
"description": "点赞",
"pattern": "thumb extended upward, other fingers folded",
},
"fist": {
"description": "握拳",
"pattern": "all fingers folded into palm",
},
"peace": {
"description": "剪刀手",
"pattern": "index and middle fingers extended",
},
"ok": {
"description": "OK 手势",
"pattern": "thumb and index finger touching",
},
"touch_face": {
"description": "摸脸",
"pattern": "hand near face region",
},
"touch_hair": {
"description": "摸头发",
"pattern": "hand above head region",
},
"pocket_left": {
"description": "左手插兜",
"pattern": "left_hand in hip region",
},
"pocket_right": {
"description": "右手插兜",
"pattern": "right_hand in hip region",
},
}
# =============================================================================
# Leg Action Definitions
# =============================================================================
# Catalogue of leg actions: "description" is a Chinese display label,
# "pattern" is a free-text statement of the intended rule (image coordinates,
# y grows downward), and "min_frames" / "max_frames" bound motion-based actions.
# NOTE(review): analyze_leg_actions() does not read this table; it only emits
# knee_bend_<side>, stand_<side> and sit. Also note that the "cross_left" and
# "cross_right" patterns are complementary, so as written one of them holds for
# any pose whose ankles differ in x -- presumably a placeholder rule; verify.
LEG_ACTIONS = {
"stand": {
"description": "站立",
"pattern": "hip_y < knee_y < ankle_y, vertical alignment",
},
"sit": {
"description": "坐姿",
"pattern": "hip_y ≈ knee_y, thigh horizontal",
},
"walk": {
"description": "行走",
"pattern": "hip-knee-ankle oscillating, stride pattern",
"min_frames": 10,
},
"run": {
"description": "奔跑",
"pattern": "fast oscillating, knee_bend > 60°",
"min_frames": 10,
},
"jump": {
"description": "跳跃",
"pattern": "all keypoints moving upward then landing",
"min_frames": 5,
"max_frames": 20,
},
"kick": {
"description": "踢腿",
"pattern": "one leg extended forward rapidly",
"min_frames": 3,
"max_frames": 15,
},
"cross_left": {
"description": "左腿交叉",
"pattern": "left_ankle_x > right_ankle_x",
},
"cross_right": {
"description": "右腿交叉",
"pattern": "right_ankle_x > left_ankle_x",
},
"knee_bend": {
"description": "弯膝",
"pattern": "knee_angle < 120°",
},
}
# =============================================================================
# Feet Action Definitions
# =============================================================================
# Catalogue of feet actions: "description" is a Chinese display label,
# "pattern" is a free-text statement of the intended rule, and
# "min_frames" / "max_frames" bound motion-based actions.
# NOTE(review): no function visible in this file detects any of these;
# decode_body_actions() leaves its "feet" list empty.
FEET_ACTIONS = {
"tap": {
"description": "轻踏",
"pattern": "ankle_y oscillating ±10px",
"min_frames": 3,
"max_frames": 15,
},
"stomp": {
"description": "重踏",
"pattern": "ankle_y large downward movement",
"min_frames": 3,
},
"cross": {
"description": "交叉脚",
"pattern": "feet_x overlapping",
},
"point_left": {
"description": "左脚前伸",
"pattern": "left_ankle_y < right_ankle_y",
},
"point_right": {
"description": "右脚前伸",
"pattern": "right_ankle_y < left_ankle_y",
},
}
# =============================================================================
# Combined Actions (Face + Body)
# =============================================================================
# Composite gestures built from simpler actions. decode_body_actions() reports
# an entry when every name in its "components" list is present among the action
# names detected for the current input; "description" is a Chinese display
# label and "pattern" is explanatory text only.
# NOTE(review): several component names (e.g. "touch_face", "open_mouth",
# "nod_head", "shake_head", "frown", "wave", "frontal_stable") are not emitted
# by any analyzer visible in this file, so the entries that need them cannot
# currently fire -- presumably reserved for future detectors; verify.
COMBINED_ACTIONS = {
"thinking": {
"description": "思考姿势",
"components": ["touch_face", "look_down"],
"pattern": "hand near chin + head tilted down",
},
"listening": {
"description": "倾听姿势",
"components": ["turn_partial", "open_mouth"],
"pattern": "slight turn + mouth slightly open",
},
"nodding_agreement": {
"description": "点头同意",
"components": ["nod_head", "smile"],
"pattern": "head nod + smile",
},
"shaking_disagreement": {
"description": "摇头不同意",
"components": ["shake_head", "frown"],
"pattern": "shake head + frown",
},
"waving_greeting": {
"description": "挥手打招呼",
"components": ["wave", "smile"],
"pattern": "wave hand + smile",
},
"crossing_arms_defensive": {
"description": "双手交叉防御",
"components": ["cross_arms", "frontal_stable"],
"pattern": "cross arms + frontal pose",
},
"pointing_explaining": {
"description": "指向解释",
"components": ["point", "turn_partial"],
"pattern": "pointing + slight turn",
},
"stretching": {
"description": "伸展",
"components": ["raise_both", "look_up"],
"pattern": "raise arms + look up",
},
"sitting_relaxed": {
"description": "放松坐姿",
"components": ["sit", "cross_arms"],
"pattern": "sit + cross arms",
},
}
# =============================================================================
# Analysis Functions
# =============================================================================
def analyze_eye_actions(eye_landmarks: List, prev_eye_landmarks: Optional[List] = None) -> List[Dict]:
    """
    Analyze eye actions from landmarks

    Only the first six landmarks are used; they are treated as the left eye in
    the usual EAR ordering p1..p6 (p1/p4 are the horizontal corners).

    Args:
        eye_landmarks: Current frame eye landmarks (left/right eye points)
        prev_eye_landmarks: Previous frame landmarks (for motion detection);
            accepted for interface compatibility, not used yet
    Returns:
        List of detected eye actions. Empty when fewer than six landmarks are
        given or when the eye has zero width (EAR undefined).
    """
    actions = []
    if not eye_landmarks or len(eye_landmarks) < 6:
        return actions

    # Eye aspect ratio: EAR = (|p2-p6| + |p3-p5|) / (2|p1-p4|)
    p1, p2, p3, p4, p5, p6 = (np.asarray(pt, dtype=float) for pt in eye_landmarks[:6])
    horizontal = np.linalg.norm(p1 - p4)
    if horizontal <= 0:
        # Degenerate landmarks: the ratio is undefined, so report nothing
        # instead of a fully confident "closed" eye.
        return actions

    left_ear = float((np.linalg.norm(p2 - p6) + np.linalg.norm(p3 - p5)) / (2 * horizontal))

    if left_ear < 0.15:
        actions.append({"action": "close_left", "description": "闭左眼", "confidence": 1.0 - left_ear})
    elif left_ear > 0.4:
        # EAR is unbounded above; keep the reported confidence within [0, 1].
        actions.append({"action": "wide_open_left", "description": "睁大左眼", "confidence": min(left_ear, 1.0)})
    return actions
def analyze_mouth_actions(mouth_landmarks: List) -> List[Dict]:
    """
    Analyze mouth actions from landmarks

    The first four landmarks are read as upper lip, lower lip, left corner and
    right corner, in that order.

    Args:
        mouth_landmarks: Mouth region landmarks (lips, mouth corners)
    Returns:
        List of detected mouth actions (at most one entry). Empty when fewer
        than four landmarks are given or the mouth has zero width.
    """
    actions = []
    if not mouth_landmarks or len(mouth_landmarks) < 4:
        return actions

    upper_lip, lower_lip, left_corner, right_corner = (
        np.asarray(pt, dtype=float) for pt in mouth_landmarks[:4]
    )

    mouth_width = np.linalg.norm(left_corner - right_corner)
    if mouth_width <= 0:
        # Degenerate landmarks: the aspect ratio is undefined, so do not
        # misreport the frame as a closed mouth.
        return actions

    # Mouth aspect ratio: lip separation relative to mouth width.
    mar = float(np.linalg.norm(upper_lip - lower_lip) / mouth_width)

    if mar > 0.7:
        actions.append({"action": "yawn", "description": "打哈欠", "mar": mar})
    elif mar > 0.5:
        actions.append({"action": "open", "description": "张嘴", "mar": mar})
    elif mar < 0.2:
        actions.append({"action": "close", "description": "闭嘴", "mar": mar})
    else:
        # Smile heuristic: summed vertical offset of both corners from the
        # upper lip. Cast to a plain float so the result stays
        # JSON-serializable even when the landmarks are integer pixels.
        corner_distance = float(
            abs(left_corner[1] - upper_lip[1]) + abs(right_corner[1] - upper_lip[1])
        )
        if corner_distance > 10:  # Threshold
            actions.append({"action": "smile", "description": "微笑", "corner_distance": corner_distance})
    return actions
def _elbow_angle_deg(shoulder, elbow, wrist) -> Optional[float]:
    """Return the interior elbow angle in degrees (180 = straight arm), or None if a segment has zero length."""
    to_shoulder = np.asarray(shoulder, dtype=float) - np.asarray(elbow, dtype=float)
    to_wrist = np.asarray(wrist, dtype=float) - np.asarray(elbow, dtype=float)
    denom = np.linalg.norm(to_shoulder) * np.linalg.norm(to_wrist)
    if denom == 0:
        return None
    # Rounding can push the cosine marginally outside [-1, 1], which would
    # make arccos return NaN.
    cosine = np.clip(np.dot(to_shoulder, to_wrist) / denom, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def _single_arm_actions(side: str, side_cn: str, shoulder, elbow, wrist) -> List[Dict]:
    """Detect raise / extend / fold for one arm; `side` is "left"/"right", `side_cn` its Chinese label."""
    found = []
    if not (shoulder and elbow and wrist):
        return found
    angle = _elbow_angle_deg(shoulder, elbow, wrist)
    if angle is None:
        return found
    if wrist[1] < elbow[1] < shoulder[1]:  # Raised (y decreases upward)
        found.append({"action": f"raise_{side}", "description": f"举起{side_cn}手", "angle": angle})
    if angle > 150:
        found.append({"action": f"extend_{side}", "description": f"伸展{side_cn}臂", "angle": angle})
    elif angle < 90:
        found.append({"action": f"fold_{side}", "description": f"弯曲{side_cn}臂", "angle": angle})
    return found


def analyze_arm_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Analyze arm actions from pose keypoints

    The elbow angle is the interior angle at the elbow (between the
    elbow->shoulder and elbow->wrist segments), so a straight arm measures
    about 180 degrees and a fully folded arm about 0 degrees.

    Args:
        pose_keypoints: Pose keypoints dict with shoulder, elbow, wrist positions
            stored under "<side>_shoulder", "<side>_elbow", "<side>_wrist";
            each value is indexed as [x, y, ...]
    Returns:
        List of detected arm actions, left arm first, then right arm, then
        "raise_both" (when both arms are raised), then "cross_arms"
    """
    actions = []

    # Keypoint indices (MediaPipe Pose):
    # 11: left_shoulder, 12: right_shoulder
    # 13: left_elbow, 14: right_elbow
    # 15: left_wrist, 16: right_wrist
    for side, side_cn in (("left", "左"), ("right", "右")):
        actions.extend(
            _single_arm_actions(
                side,
                side_cn,
                pose_keypoints.get(f"{side}_shoulder"),
                pose_keypoints.get(f"{side}_elbow"),
                pose_keypoints.get(f"{side}_wrist"),
            )
        )

    detected = {entry["action"] for entry in actions}
    if {"raise_left", "raise_right"} <= detected:
        actions.append({"action": "raise_both", "description": "双手举起"})

    # Cross arms detection. The rule reads the left shoulder, so that keypoint
    # must be present as well as both wrists.
    # NOTE(review): in an un-mirrored image of a person facing the camera the
    # subject's left side normally has the larger x, so this rule may also hold
    # for uncrossed arms -- confirm the coordinate convention with the caller.
    left_shoulder = pose_keypoints.get("left_shoulder")
    left_wrist = pose_keypoints.get("left_wrist")
    right_wrist = pose_keypoints.get("right_wrist")
    if left_wrist and right_wrist and left_shoulder:
        if left_wrist[0] > right_wrist[0] and right_wrist[0] < left_shoulder[0]:
            actions.append({"action": "cross_arms", "description": "双手交叉"})
    return actions
def analyze_hand_actions(hand_keypoints: List, hand_type: str = "right") -> List[Dict]:
"""
Analyze hand actions from hand keypoints
Args:
hand_keypoints: 21 hand keypoints (MediaPipe Hand)
hand_type: "left" or "right"
Returns:
List of detected hand actions
"""
actions = []
if not hand_keypoints or len(hand_keypoints) < 21:
return actions
# MediaPipe Hand keypoint indices:
# 0: wrist
# 1-4: thumb (CMC, MCP, IP, TIP)
# 5-8: index finger (MCP, PIP, DIP, TIP)
# 9-12: middle finger
# 13-16: ring finger
# 17-20: pinky
wrist = np.array(hand_keypoints[0])
thumb_tip = np.array(hand_keypoints[4])
index_tip = np.array(hand_keypoints[8])
middle_tip = np.array(hand_keypoints[12])
ring_tip = np.array(hand_keypoints[16])
pinky_tip = np.array(hand_keypoints[20])
# Calculate finger extensions
finger_tips = [thumb_tip, index_tip, middle_tip, ring_tip, pinky_tip]
finger_bases = [
np.array(hand_keypoints[2]), # thumb IP
np.array(hand_keypoints[5]), # index MCP
np.array(hand_keypoints[9]), # middle MCP
np.array(hand_keypoints[13]), # ring MCP
np.array(hand_keypoints[17]), # pinky MCP
]
extensions = []
for tip, base in zip(finger_tips, finger_bases):
dist = np.linalg.norm(tip - base)
extensions.append(dist)
# Detect actions
avg_extension = np.mean(extensions)
if avg_extension > 50: # Open hand
actions.append({"action": f"open_{hand_type}", "description": f"张开{hand_type}"})
elif avg_extension < 30: # Closed/fist
actions.append({"action": f"fist_{hand_type}", "description": f"{hand_type}"})
# Thumbs up (thumb extended upward, others folded)
if extensions[0] > 40 and np.mean(extensions[1:]) < 30:
actions.append({"action": f"thumbs_up_{hand_type}", "description": f"{hand_type}手点赞"})
# Peace sign (index and middle extended)
if extensions[1] > 40 and extensions[2] > 40 and np.mean(extensions[3:]) < 30:
actions.append({"action": f"peace_{hand_type}", "description": f"{hand_type}手剪刀手"})
# Pointing (index extended, others folded)
if extensions[1] > 40 and np.mean([extensions[0], extensions[2], extensions[3], extensions[4]]) < 30:
actions.append({"action": f"point_{hand_type}", "description": f"{hand_type}手指向"})
return actions
def _knee_angle_deg(hip, knee, ankle) -> Optional[float]:
    """Return the interior knee angle in degrees (180 = straight leg), or None if a segment has zero length."""
    to_hip = np.asarray(hip, dtype=float) - np.asarray(knee, dtype=float)
    to_ankle = np.asarray(ankle, dtype=float) - np.asarray(knee, dtype=float)
    denom = np.linalg.norm(to_hip) * np.linalg.norm(to_ankle)
    if denom == 0:
        return None
    # Rounding can push the cosine marginally outside [-1, 1], which would
    # make arccos return NaN.
    cosine = np.clip(np.dot(to_hip, to_ankle) / denom, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def analyze_leg_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Analyze leg actions from pose keypoints

    The knee angle is the interior angle at the knee (between the knee->hip
    and knee->ankle segments), so a straight leg measures about 180 degrees
    and only a genuinely bent knee falls below the 120 degree threshold.

    Args:
        pose_keypoints: Pose keypoints with hip, knee, ankle positions stored
            under "<side>_hip", "<side>_knee", "<side>_ankle"; each value is
            indexed as [x, y, ...]
    Returns:
        List of detected leg actions, left leg first, then right leg, then "sit"
    """
    actions = []

    # Keypoint indices (MediaPipe Pose):
    # 23: left_hip, 24: right_hip
    # 25: left_knee, 26: right_knee
    # 27: left_ankle, 28: right_ankle
    for side, side_cn in (("left", "左"), ("right", "右")):
        hip = pose_keypoints.get(f"{side}_hip")
        knee = pose_keypoints.get(f"{side}_knee")
        ankle = pose_keypoints.get(f"{side}_ankle")
        if not (hip and knee and ankle):
            continue

        knee_angle_deg = _knee_angle_deg(hip, knee, ankle)
        if knee_angle_deg is not None and knee_angle_deg < 120:
            actions.append({
                "action": f"knee_bend_{side}",
                "description": f"弯{side_cn}膝",
                "angle": knee_angle_deg,
            })

        # Standing detection: hip above knee above ankle (y increases downward)
        if hip[1] < knee[1] < ankle[1]:
            actions.append({"action": f"stand_{side}", "description": f"{side_cn}腿站立"})

    # Sit detection (hip ≈ knee height)
    left_hip = pose_keypoints.get("left_hip")
    left_knee = pose_keypoints.get("left_knee")
    right_hip = pose_keypoints.get("right_hip")
    right_knee = pose_keypoints.get("right_knee")
    if left_hip and left_knee and right_hip and right_knee:
        hip_avg_y = (left_hip[1] + right_hip[1]) / 2
        knee_avg_y = (left_knee[1] + right_knee[1]) / 2
        if abs(hip_avg_y - knee_avg_y) < 30:  # Hip and knee at similar height
            actions.append({"action": "sit", "description": "坐姿"})
    return actions
# =============================================================================
# Main Decoder Function
# =============================================================================
def decode_body_actions(
    pose_data: Dict,
    face_data: Optional[Dict] = None,
    hand_data: Optional[Dict] = None,
) -> Dict:
    """
    Decode all body actions from multiple data sources

    Args:
        pose_data: Pose estimation data (MediaPipe Pose); its "keypoints"
            entry is passed to the arm and leg analyzers
        face_data: Face pose data (InsightFace pose_angle); may also carry
            "prev_pose_angle", "eye_landmarks", "prev_eye_landmarks" and
            "mouth_landmarks"
        hand_data: Hand tracking data (MediaPipe Hand) with optional
            "left_hand" / "right_hand" keypoint lists
    Returns:
        Combined action data dict with one list per category ("face", "eyes",
        "mouth", "arms", "hands", "legs", "feet", "combined"). "feet" is
        always empty because no feet analyzer is wired in.
    """
    all_actions = {
        "face": [],
        "eyes": [],
        "mouth": [],
        "arms": [],
        "hands": [],
        "legs": [],
        "feet": [],
        "combined": [],
    }

    if face_data:
        # 1. Face actions: a transition is only defined when both the current
        #    and the previous pose are available.
        pose_angle = face_data.get("pose_angle", {})
        prev_pose_angle = face_data.get("prev_pose_angle", {})
        if pose_angle and prev_pose_angle:
            angle = pose_angle.get("angle", "unknown")
            prev_angle = prev_pose_angle.get("angle", "unknown")
            turn_key = (prev_angle, angle)
            if turn_key in FACE_TURN_ACTIONS:
                all_actions["face"].append({
                    "action": FACE_TURN_ACTIONS[turn_key],
                    "description": f"Face: {prev_angle} -> {angle}",
                })

            # Pitch actions
            pitch = pose_angle.get("pitch", "neutral")
            prev_pitch = prev_pose_angle.get("pitch", "neutral")
            pitch_key = (prev_pitch, pitch)
            if pitch_key in FACE_PITCH_ACTIONS:
                all_actions["face"].append({
                    "action": FACE_PITCH_ACTIONS[pitch_key],
                    "description": f"Pitch: {prev_pitch} -> {pitch}",
                })

        # 2. Eye actions (if eye landmarks available)
        eye_landmarks = face_data.get("eye_landmarks")
        if eye_landmarks:
            all_actions["eyes"] = analyze_eye_actions(
                eye_landmarks,
                face_data.get("prev_eye_landmarks"),
            )

        # 3. Mouth actions (if mouth landmarks available)
        mouth_landmarks = face_data.get("mouth_landmarks")
        if mouth_landmarks:
            all_actions["mouth"] = analyze_mouth_actions(mouth_landmarks)

    # 4./6. Arm and leg actions share the same pose keypoints.
    keypoints = pose_data.get("keypoints") if pose_data else None
    if keypoints:
        all_actions["arms"] = analyze_arm_actions(keypoints)
        all_actions["legs"] = analyze_leg_actions(keypoints)

    # 5. Hand actions (if hand keypoints available)
    if hand_data:
        for side in ("left", "right"):
            hand_keypoints = hand_data.get(f"{side}_hand")
            if hand_keypoints:
                all_actions["hands"].extend(analyze_hand_actions(hand_keypoints, side))

    # 7. Combined actions. Collect every detected name once; side-specific
    #    names also register their side-agnostic form ("point_right" ->
    #    "point") so they can satisfy components written without a side.
    detected_actions = set()
    for actions in all_actions.values():
        for entry in actions:
            name = entry["action"]
            detected_actions.add(name)
            for suffix in ("_left", "_right"):
                if name.endswith(suffix):
                    detected_actions.add(name[: -len(suffix)])

    for combined_name, combined_def in COMBINED_ACTIONS.items():
        components = combined_def["components"]
        if all(comp in detected_actions for comp in components):
            all_actions["combined"].append({
                "action": combined_name,
                "description": combined_def["description"],
                "components": components,
            })
    return all_actions
def print_body_action_report(action_data: Dict) -> None:
    """
    Print a per-category summary of decoded actions to stdout

    Categories are listed in a fixed order; a category that is missing or
    empty in `action_data` is skipped. Each action is shown with its
    description, falling back to the action name when none is given.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("Body Action Decoder Report")
    print(rule)

    for category in ("face", "eyes", "mouth", "arms", "hands", "legs", "feet", "combined"):
        entries = action_data.get(category, [])
        if not entries:
            continue
        print(f"\n{category.upper()} Actions ({len(entries)}):")
        for entry in entries:
            name = entry["action"]
            label = entry.get("description", name)
            print(f" - {name}: {label}")

    print("\n" + rule)
# =============================================================================
# Main Entry Point
# =============================================================================
def main():
    """
    Command-line entry point

    Loads whichever of the pose / face / hand JSON files were given, decodes
    the actions, prints the report and, if requested, writes the result as
    JSON. With no input at all it prints the list of action categories.
    """
    parser = argparse.ArgumentParser(description="Decode body actions from pose data")
    parser.add_argument("--pose-json", help="Path to pose.json (MediaPipe output)")
    parser.add_argument("--face-json", help="Path to face.json (InsightFace output)")
    parser.add_argument("--hand-json", help="Path to hand.json (MediaPipe Hand output)")
    parser.add_argument("--output-json", help="Output action data JSON")
    # NOTE: --frame is accepted for compatibility but is not consumed below.
    parser.add_argument("--frame", type=int, help="Analyze specific frame")
    args = parser.parse_args()

    print("=" * 70)
    print("Body Action Decoder")
    print("=" * 70)

    def load_json(path):
        """Return the parsed JSON file at `path`, or None when no path was given."""
        if not path:
            return None
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    # Load data
    pose_data = load_json(args.pose_json)
    face_data = load_json(args.face_json)
    hand_data = load_json(args.hand_json)

    # Analyze
    if pose_data or face_data or hand_data:
        action_data = decode_body_actions(
            pose_data=pose_data,
            face_data=face_data,
            hand_data=hand_data,
        )
        print_body_action_report(action_data)
        if args.output_json:
            # ensure_ascii=False keeps the Chinese descriptions readable in
            # the output file instead of \uXXXX escapes.
            with open(args.output_json, "w", encoding="utf-8") as f:
                json.dump(action_data, f, indent=2, ensure_ascii=False)
            print(f"\n✅ Output saved to: {args.output_json}")
    else:
        print("\n⚠️ No input data provided")
        print("\nAction Categories:")
        print(" - Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head")
        print(" - Eyes: blink, close, wide_open, look_left, look_right")
        print(" - Mouth: open, close, smile, talk, yawn")
        print(" - Arms: raise_left, raise_right, cross_arms, wave, point")
        print(" - Hands: grab, open, clap, thumbs_up, fist, peace, ok")
        print(" - Legs: stand, sit, walk, run, jump, kick")
        print(" - Feet: tap, stomp, cross, point")
        print(" - Combined: thinking, listening, nodding_agreement, waving_greeting")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,201 @@
#!/opt/homebrew/bin/python3.11
"""
Face Trace Visualizer - Visualize face tracking paths
Output:
1. Trace path visualization (matplotlib)
2. Trace statistics CSV
"""
import sys
import json
import argparse
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from collections import defaultdict
from typing import Dict
def visualize_traces(face_data: Dict, output_path: str = None) -> None:
    """
    Visualize face trace paths

    Draws a 2x2 figure: bbox-centre X, bbox-centre Y and detection confidence
    per frame (one line per trace), plus a bar chart of pose counts per trace.

    Args:
        face_data: Traced face data; reads "frames" (only to check it is
            non-empty) and "traces", a mapping of trace-id string -> trace
            whose "path" entries provide "frame", "bbox", "confidence" and
            "pose_angle"
        output_path: When given, the figure is saved there; otherwise it is
            shown interactively
    """
    frames = face_data.get("frames", {})
    traces = face_data.get("traces", {})
    if not frames or not traces:
        print("No frames or traces found")
        return

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    try:
        pose_axis = axes[1, 1]
        colors = plt.cm.tab10(np.linspace(0, 1, len(traces)))

        # Per-trace series, keyed by the numeric trace id so sorting is numeric.
        trace_data = {}
        for trace_id_str, trace in traces.items():
            path = trace.get("path", [])
            trace_data[int(trace_id_str)] = {
                "frames": [p["frame"] for p in path],
                "x": [p["bbox"]["x"] + p["bbox"]["width"] / 2 for p in path],
                "y": [p["bbox"]["y"] + p["bbox"]["height"] / 2 for p in path],
                "confidence": [p["confidence"] for p in path],
                "pose": [p["pose_angle"] for p in path],
            }
        ordered_ids = sorted(trace_data)

        # (axis, series key, y label, title) for the three line panels
        line_panels = (
            (axes[0, 0], "x", "X Position (center)", "Face X Position Over Time"),
            (axes[0, 1], "y", "Y Position (center)", "Face Y Position Over Time"),
            (axes[1, 0], "confidence", "Detection Confidence", "Face Detection Confidence Over Time"),
        )
        for trace_id, color in zip(ordered_ids, colors):
            data = trace_data[trace_id]
            for axis, key, _ylabel, _title in line_panels:
                axis.plot(data["frames"], data[key], color=color, label=f"Trace {trace_id}", linewidth=2)
                axis.scatter(data["frames"], data[key], color=color, s=30)

        for axis, _key, ylabel, title in line_panels:
            axis.set_xlabel("Frame Number")
            axis.set_ylabel(ylabel)
            axis.set_title(title)
            axis.legend()
            axis.grid(True, alpha=0.3)

        pose_colors = {
            "frontal": "green",
            "three_quarter": "blue",
            "profile_left": "orange",
            "profile_right": "red",
            "unknown": "gray",
        }
        for trace_id in ordered_ids:
            pose_counts = defaultdict(int)
            for pose in trace_data[trace_id]["pose"]:
                pose_counts[pose] += 1
            if not pose_counts:
                # A trace with an empty path has nothing to draw.
                continue
            poses = list(pose_counts)
            pose_axis.bar(
                [f"Trace {trace_id}\n{pose}" for pose in poses],
                [pose_counts[pose] for pose in poses],
                color=[pose_colors.get(pose, "gray") for pose in poses],
                alpha=0.7,
                label=f"Trace {trace_id}",
            )
        pose_axis.set_xlabel("Trace / Pose")
        pose_axis.set_ylabel("Count")
        pose_axis.set_title("Pose Distribution by Trace")
        pose_axis.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        if output_path:
            fig.savefig(output_path, dpi=150, bbox_inches="tight")
            print(f"\n✅ Visualization saved to: {output_path}")
        else:
            plt.show()
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
def export_trace_csv(face_data: Dict, output_path: str) -> None:
    """
    Export trace statistics to CSV

    Writes one header row and then one row per trace, ordered by numeric
    trace id. The last four columns count how often each pose label occurs in
    the trace's "pose_angles" list (0 when the list is missing).

    Args:
        face_data: Traced face data; reads "traces", a mapping of trace-id
            string -> trace statistics dict
        output_path: Destination CSV path (overwritten if it exists)
    Raises:
        KeyError: if a trace lacks one of the statistics fields
    """
    import csv
    from collections import Counter

    traces = face_data.get("traces", {})
    stat_fields = (
        "trace_id",
        "start_frame",
        "end_frame",
        "duration_frames",
        "duration_seconds",
        "total_appearances",
        "avg_confidence",
    )
    # Column order of the pose counts; header names are "pose_<label>".
    pose_labels = ("three_quarter", "profile_right", "profile_left", "frontal")

    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(list(stat_fields) + [f"pose_{label}" for label in pose_labels])
        for _trace_id, trace in sorted(traces.items(), key=lambda item: int(item[0])):
            pose_counts = Counter(trace.get("pose_angles", []))
            writer.writerow(
                [trace[field] for field in stat_fields]
                + [pose_counts[label] for label in pose_labels]
            )
    print(f"\n✅ CSV exported to: {output_path}")
def main():
    """
    Command-line entry point

    Loads the traced face JSON, then saves the plot and/or the CSV depending
    on which output options were given. With neither option the plot is shown
    interactively.
    """
    parser = argparse.ArgumentParser(description="Visualize face traces")
    parser.add_argument("--face-json", required=True, help="Path to face_traced.json")
    parser.add_argument("--output-plot", help="Output plot path (PNG)")
    parser.add_argument("--output-csv", help="Output CSV path")
    args = parser.parse_args()

    # JSON is UTF-8 by specification; do not rely on the locale default.
    with open(args.face_json, encoding="utf-8") as f:
        face_data = json.load(f)

    print("=" * 60)
    print("Face Trace Visualizer")
    print("=" * 60)
    print(f"\nInput: {args.face_json}")
    print(f"Traces: {len(face_data.get('traces', {}))}")

    if args.output_plot:
        visualize_traces(face_data, args.output_plot)
    if args.output_csv:
        export_trace_csv(face_data, args.output_csv)
    if not args.output_plot and not args.output_csv:
        # No output requested: fall back to an interactive plot.
        visualize_traces(face_data)


if __name__ == "__main__":
    main()

452
scripts/utils/face_tracker.py Executable file
View File

@@ -0,0 +1,452 @@
#!/opt/homebrew/bin/python3.11
"""
Face Tracker - Track faces across frames using embedding similarity and bbox proximity
Purpose:
1. Assign unique trace_id to each face across frames
2. Track face movement across adjacent frames
3. Output trace statistics (duration, path, confidence)
Algorithm:
1. For first frame: assign new trace_id to each face
2. For subsequent frames:
- Calculate bbox overlap with previous frame faces
- Calculate embedding cosine similarity
- Match faces if both conditions met
- Assign same trace_id if matched, new trace_id if not
Matching Conditions:
- bbox overlap > 0.3 (IoU)
- embedding similarity > 0.7
- OR single condition > threshold (fallback)
Output:
- face.json with trace_id added to each face
- trace statistics report
"""
import sys
import json
import argparse
import numpy as np
from typing import Dict, List, Optional, Tuple
from collections import defaultdict
def calculate_bbox_iou(bbox1: Dict, bbox2: Dict) -> float:
    """
    Calculate Intersection over Union (IoU) between two bboxes

    Boxes are axis-aligned and described by their top-left corner plus size.

    Args:
        bbox1: {"x": int, "y": int, "width": int, "height": int}
        bbox2: same structure
    Returns:
        IoU score (0.0 - 1.0); 0.0 when the boxes do not overlap (touching
        edges count as no overlap) or when the union has no area
    """
    left1, top1, width1, height1 = bbox1["x"], bbox1["y"], bbox1["width"], bbox1["height"]
    left2, top2, width2, height2 = bbox2["x"], bbox2["y"], bbox2["width"], bbox2["height"]

    # Edges of the overlap rectangle
    overlap_left = max(left1, left2)
    overlap_right = min(left1 + width1, left2 + width2)
    overlap_top = max(top1, top2)
    overlap_bottom = min(top1 + height1, top2 + height2)

    has_overlap = overlap_right > overlap_left and overlap_bottom > overlap_top
    if not has_overlap:
        return 0.0

    intersection = (overlap_right - overlap_left) * (overlap_bottom - overlap_top)
    union = width1 * height1 + width2 * height2 - intersection
    if union > 0:
        return intersection / union
    return 0.0
def calculate_bbox_distance(bbox1: Dict, bbox2: Dict) -> float:
    """
    Calculate center distance between two bboxes

    Args:
        bbox1: {"x", "y", "width", "height"} box
        bbox2: same structure
    Returns:
        Euclidean distance between centers
    """
    def center(box):
        # Centre of a box given by its top-left corner and size
        return box["x"] + box["width"] / 2, box["y"] + box["height"] / 2

    (center_x1, center_y1), (center_x2, center_y2) = center(bbox1), center(bbox2)
    delta_x = center_x1 - center_x2
    delta_y = center_y1 - center_y2
    return np.sqrt(delta_x ** 2 + delta_y ** 2)
def calculate_embedding_similarity(emb1: List[float], emb2: List[float]) -> float:
    """
    Compute the cosine similarity of two embedding vectors.

    Args:
        emb1: First embedding, or None
        emb2: Second embedding, or None

    Returns:
        Cosine similarity (-1.0 - 1.0); 0.0 when either embedding is None
        or has zero length (norm)
    """
    if emb1 is None or emb2 is None:
        return 0.0

    vec_a, vec_b = np.array(emb1), np.array(emb2)
    len_a = np.linalg.norm(vec_a)
    len_b = np.linalg.norm(vec_b)
    # A zero vector has no direction, so similarity is reported as 0.0.
    if len_a == 0 or len_b == 0:
        return 0.0

    return np.dot(vec_a, vec_b) / (len_a * len_b)
def match_faces(
    current_faces: List[Dict],
    previous_faces: List[Dict],
    iou_threshold: float = 0.3,
    similarity_threshold: float = 0.7,
    distance_threshold: float = 100.0,
    use_embedding: bool = True,
) -> Dict[int, int]:
    """
    Match current frame faces to previous frame faces (one-to-one)

    Every (current, previous) pair is scored first; pairs are then accepted
    best-score-first. This keeps a face that happens to come early in
    current_faces from claiming a previous face that is a better match for
    a later face.

    Pair score (first rule that applies; a pair matching no rule scores 0
    and is never matched):
        1. iou > iou_threshold and similarity > similarity_threshold
           -> iou + similarity
        2. iou > 0.5 -> iou * 2
        3. similarity > 0.85 -> similarity * 2
        4. distance < distance_threshold and similarity > 0.6
           -> similarity - distance / 1000

    Args:
        current_faces: Faces in current frame (each with x, y, width, height
            and optionally "embedding")
        previous_faces: Faces in previous frame (same structure)
        iou_threshold: Minimum IoU for matching
        similarity_threshold: Minimum embedding similarity for matching
        distance_threshold: Maximum bbox center distance for matching
        use_embedding: Whether to use embedding similarity

    Returns:
        Dict mapping current_face_index -> previous_face_index (or -1 if new).
        There is one key per current face, inserted in ascending index order.
    """
    if not previous_faces:
        return {i: -1 for i in range(len(current_faces))}

    def bbox_of(face: Dict) -> Dict:
        return {
            "x": face["x"],
            "y": face["y"],
            "width": face["width"],
            "height": face["height"],
        }

    # Previous-frame boxes do not depend on the current face: build them once.
    prev_boxes = [bbox_of(prev_face) for prev_face in previous_faces]

    candidates = []  # (score, curr_idx, prev_idx) for every pair with score > 0
    for curr_idx, curr_face in enumerate(current_faces):
        curr_bbox = bbox_of(curr_face)
        curr_emb = curr_face.get("embedding")

        for prev_idx, prev_face in enumerate(previous_faces):
            prev_bbox = prev_boxes[prev_idx]
            prev_emb = prev_face.get("embedding")

            iou = calculate_bbox_iou(curr_bbox, prev_bbox)
            distance = calculate_bbox_distance(curr_bbox, prev_bbox)

            similarity = 0.0
            if use_embedding and curr_emb and prev_emb:
                similarity = calculate_embedding_similarity(curr_emb, prev_emb)

            score = 0.0
            if iou > iou_threshold and similarity > similarity_threshold:
                score = iou + similarity
            elif iou > 0.5:
                score = iou * 2
            elif similarity > 0.85:
                score = similarity * 2
            elif distance < distance_threshold and similarity > 0.6:
                score = similarity - distance / 1000

            if score > 0:
                candidates.append((score, curr_idx, prev_idx))

    # Highest score first; ties fall back to the lowest indices so the result
    # is deterministic.
    candidates.sort(key=lambda c: (-c[0], c[1], c[2]))

    assigned = {}
    used_prev = set()
    for _score, curr_idx, prev_idx in candidates:
        if curr_idx in assigned or prev_idx in used_prev:
            continue
        assigned[curr_idx] = prev_idx
        used_prev.add(prev_idx)

    # Callers iterate this dict in order, so keys must stay in ascending order.
    return {i: assigned.get(i, -1) for i in range(len(current_faces))}
def _summarize_trace(trace_id: int, path: List[Dict], fps: Optional[float]) -> Dict:
    """
    Build the summary record stored under face_data["traces"] for one trace

    Args:
        trace_id: Identifier of the trace
        path: Per-frame entries recorded for this trace, in frame order (non-empty)
        fps: Frames per second used for duration_seconds, or None when unavailable
    Returns:
        Dict with frame span, durations, average confidence, pose trace,
        pose statistics, pose transitions and the raw path
    """
    duration_frames = path[-1]["frame"] - path[0]["frame"] + 1
    avg_confidence = sum(p["confidence"] for p in path) / len(path)
    pose_angles = [p["pose_angle"] for p in path]

    # Pose trace: full pose information for every appearance
    pose_trace = []
    for p in path:
        pose_info = p.get("pose_full", {})
        pose_trace.append({
            "frame": p["frame"],
            "angle": pose_info.get("angle", "unknown"),
            "confidence": pose_info.get("confidence", 0.0),
            "pitch": pose_info.get("pitch", "neutral"),
            "features": pose_info.get("features", {}),
        })

    # Pose statistics
    pose_counts = defaultdict(int)
    pose_confidence_by_angle = defaultdict(list)
    for pose in pose_trace:
        pose_counts[pose["angle"]] += 1
        pose_confidence_by_angle[pose["angle"]].append(pose["confidence"])
    pose_statistics = {
        "distribution": dict(pose_counts),
        "avg_confidence_by_angle": {
            angle: round(sum(conf_list) / len(conf_list), 3)
            for angle, conf_list in pose_confidence_by_angle.items()
        },
        "dominant_angle": max(pose_counts.items(), key=lambda x: x[1])[0] if pose_counts else "unknown",
        "pose_count": len(pose_counts),
    }

    # Pose transitions: one event each time the angle differs from the previous appearance
    pose_transitions = []
    for prev_pose, pose in zip(pose_trace, pose_trace[1:]):
        if pose["angle"] != prev_pose["angle"]:
            pose_transitions.append({
                "frame": pose["frame"],
                "from_angle": prev_pose["angle"],
                "to_angle": pose["angle"],
                "transition_index": len(pose_transitions) + 1,
            })

    return {
        "trace_id": trace_id,
        "start_frame": path[0]["frame"],
        "end_frame": path[-1]["frame"],
        "duration_frames": duration_frames,
        # Without a usable fps the duration in seconds cannot be derived.
        "duration_seconds": duration_frames / fps if fps else 0.0,
        "total_appearances": len(path),
        "avg_confidence": avg_confidence,
        "pose_angles": pose_angles,
        "pose_trace": pose_trace,
        "pose_statistics": pose_statistics,
        "pose_transitions": pose_transitions,
        "path": path,
    }


def track_faces(
    face_data: Dict,
    iou_threshold: float = 0.3,
    similarity_threshold: float = 0.7,
    distance_threshold: float = 100.0,
    use_embedding: bool = True,
) -> Dict:
    """
    Track faces across all frames

    Frames are processed in ascending numeric order. Each face is matched
    against the faces of the previously processed frame; a frame without
    faces resets the previous-frame state, so traces do not continue across it.
    face_data is modified in place (trace_id on every face, "traces",
    metadata["trace_stats"]) and also returned.

    Args:
        face_data: face.json data
        iou_threshold: IoU threshold for matching
        similarity_threshold: Embedding similarity threshold
        distance_threshold: Distance threshold for matching
        use_embedding: Whether to use embedding
    Returns:
        Updated face_data with trace_id added to each face. When
        metadata["fps"] is missing or not a positive number, a warning is
        printed and duration_seconds is reported as 0.0.
    """
    frames = face_data.get("frames", {})
    if not frames:
        print("No frames found in face.json")
        return face_data

    sorted_frames = sorted(frames.items(), key=lambda x: int(x[0]))
    next_trace_id = 0
    traces = defaultdict(list)
    prev_faces = []
    prev_trace_ids = []

    print(f"\nTracking faces across {len(sorted_frames)} frames...")
    print(f"Parameters: iou={iou_threshold}, similarity={similarity_threshold}, distance={distance_threshold}")
    print()

    for frame_num_str, frame_data in sorted_frames:
        frame_num = int(frame_num_str)
        faces = frame_data.get("faces", [])
        if not faces:
            prev_faces = []
            prev_trace_ids = []
            continue

        matches = match_faces(
            faces,
            prev_faces,
            iou_threshold,
            similarity_threshold,
            distance_threshold,
            use_embedding,
        )

        # trace_ids[i] must belong to faces[i] because the next frame looks
        # trace ids up by previous-face index; walk indices explicitly
        # instead of relying on the iteration order of `matches`.
        trace_ids = []
        for curr_idx, face in enumerate(faces):
            prev_idx = matches.get(curr_idx, -1)
            if prev_idx >= 0:
                trace_id = prev_trace_ids[prev_idx]
            else:
                trace_id = next_trace_id
                next_trace_id += 1
            face["trace_id"] = trace_id
            trace_ids.append(trace_id)

            pose_full = face.get("pose_angle", {})
            traces[trace_id].append({
                "frame": frame_num,
                "face_index": curr_idx,
                "bbox": {
                    "x": face["x"],
                    "y": face["y"],
                    "width": face["width"],
                    "height": face["height"],
                },
                "confidence": face.get("confidence", 0.0),
                "pose_angle": pose_full.get("angle", "unknown"),
                "pose_full": pose_full,  # full pose information
            })

        prev_faces = faces
        prev_trace_ids = trace_ids

        if frame_num % 100 == 0:
            print(f"  Frame {frame_num}: {len(faces)} faces, {len(set(trace_ids))} active traces")

    metadata = face_data.setdefault("metadata", {})
    fps = metadata.get("fps")
    if not isinstance(fps, (int, float)) or fps <= 0:
        if traces:
            print("Warning: metadata.fps missing or invalid; duration_seconds set to 0.0")
        fps = None

    face_data["traces"] = {
        str(trace_id): _summarize_trace(trace_id, path, fps)
        for trace_id, path in traces.items()
    }

    metadata["trace_stats"] = {
        "total_traces": next_trace_id,
        "active_traces": len(traces),
        "long_traces": len([t for t in traces.values() if len(t) >= 2]),
    }
    return face_data
def analyze_traces(face_data: Dict) -> None:
    """
    Analyze and print trace statistics

    Prints the trace totals, the ten longest traces (with pose distribution
    and up to three pose transitions each), the overall pose distribution
    and a histogram of trace durations.

    Args:
        face_data: face.json data that already contains "traces"
            (and optionally metadata["trace_stats"])
    """
    traces = face_data.get("traces", {})
    metadata = face_data.get("metadata", {})
    trace_stats = metadata.get("trace_stats", {})

    # "Long" means seen in at least two frames. Use the recorded statistic,
    # or recount from the traces when it is absent.
    long_trace_count = trace_stats.get("long_traces")
    if long_trace_count is None:
        long_trace_count = sum(
            1 for t in traces.values() if t.get("total_appearances", 0) >= 2
        )

    print("\n" + "=" * 60)
    print("Face Trace Analysis")
    print("=" * 60)
    print(f"\nTotal traces: {trace_stats.get('total_traces', 0)}")
    print(f"Long traces (>= 2 frames): {long_trace_count}")

    if not traces:
        return

    sorted_traces = sorted(traces.values(), key=lambda x: x["duration_frames"], reverse=True)
    print("\n=== Top 10 Longest Traces ===")
    for trace in sorted_traces[:10]:
        print(f"\nTrace {trace['trace_id']}:")
        print(f"  Frames: {trace['start_frame']} - {trace['end_frame']} ({trace['duration_frames']} frames)")
        print(f"  Duration: {trace['duration_seconds']:.2f} seconds")
        print(f"  Appearances: {trace['total_appearances']}")
        print(f"  Avg Confidence: {trace['avg_confidence']:.3f}")

        # Pose statistics
        pose_stats = trace.get("pose_statistics", {})
        print(f"  Pose Distribution: {pose_stats.get('distribution', {})}")
        print(f"  Dominant Angle: {pose_stats.get('dominant_angle', 'unknown')}")

        # Pose transitions
        transitions = trace.get("pose_transitions", [])
        if transitions:
            print(f"  Pose Transitions: {len(transitions)} events")
            for t in transitions[:3]:  # show only the first 3
                print(f"    - Frame {t['frame']}: {t['from_angle']} → {t['to_angle']}")

    pose_totals = defaultdict(int)
    for trace in traces.values():
        for pose in trace["pose_angles"]:
            pose_totals[pose] += 1
    print("\n=== Pose Distribution in Traces ===")
    for pose, count in sorted(pose_totals.items(), key=lambda x: x[1], reverse=True):
        print(f"  {pose}: {count}")

    duration_distribution = defaultdict(int)
    for trace in traces.values():
        d = trace["duration_frames"]
        if d <= 30:
            duration_distribution["short (<= 30 frames)"] += 1
        elif d <= 90:
            duration_distribution["medium (31-90 frames)"] += 1
        else:
            duration_distribution["long (> 90 frames)"] += 1
    print("\n=== Trace Duration Distribution ===")
    for duration, count in sorted(duration_distribution.items()):
        print(f"  {duration}: {count}")
def main():
    """
    Command-line entry point: load face.json, track faces, print the trace
    analysis and (unless --analyze-only) write the traced JSON.

    The default output path is the input file name with "_traced.json" in
    place of its extension, in the same directory as the input.
    """
    # Local import: only needed to derive the default output path.
    from pathlib import Path

    parser = argparse.ArgumentParser(description="Track faces across frames")
    parser.add_argument("--face-json", required=True, help="Path to face.json")
    parser.add_argument("--output", help="Output path (default: face_traced.json)")
    parser.add_argument("--iou-threshold", type=float, default=0.3, help="IoU threshold")
    parser.add_argument("--similarity-threshold", type=float, default=0.7, help="Embedding similarity threshold")
    parser.add_argument("--distance-threshold", type=float, default=100.0, help="Distance threshold")
    parser.add_argument("--no-embedding", action="store_true", help="Disable embedding matching")
    parser.add_argument("--analyze-only", action="store_true", help="Only analyze, don't output")
    args = parser.parse_args()

    print("=" * 60)
    print("Face Tracker")
    print("=" * 60)

    with open(args.face_json, encoding="utf-8") as f:
        face_data = json.load(f)
    print(f"\nInput: {args.face_json}")
    print(f"Frames: {len(face_data.get('frames', {}))}")

    face_data = track_faces(
        face_data,
        iou_threshold=args.iou_threshold,
        similarity_threshold=args.similarity_threshold,
        distance_threshold=args.distance_threshold,
        use_embedding=not args.no_embedding,
    )
    analyze_traces(face_data)

    if not args.analyze_only:
        # Swap only the final extension. str.replace(".json", ...) would also
        # rewrite ".json" inside directory names, and would return the input
        # path unchanged (overwriting the input) when the name has no ".json".
        source = Path(args.face_json)
        output_path = args.output or str(source.with_name(f"{source.stem}_traced.json"))
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(face_data, f, indent=2)
        print(f"\n✅ Output saved to: {output_path}")
# Run the face tracker CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,522 @@
#!/opt/homebrew/bin/python3.11
"""
Pose Action Decoder - Convert pose_trace into human-readable action names
Purpose:
1. Decode pose transitions into action names (turn left/right, look up/down, shake head, nod)
2. Identify stable pose segments with duration
3. Generate action timeline for each trace
Action Types:
- Simple: turn_left, turn_right, look_up, look_down
- Complex: shake_head, nod_head, turn_full
- Stable: frontal_stable, profile_left_stable, profile_right_stable, three_quarter_stable
Output:
1. Action timeline (frame-based action list)
2. Action summary (total counts, duration)
3. Action visualization (timeline plot)
"""
import sys
import json
import argparse
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, List, Optional
from collections import defaultdict
# Action definitions
# Maps a (from, to) pair to an action name. The same table holds both
# angle pairs and pitch pairs; decode_pose_to_action looks pairs up here.
POSE_TO_ACTION = {
    # Turn actions (angle changes)
    ("frontal", "three_quarter"): "turn_partial",
    ("frontal", "profile_left"): "turn_left",
    ("frontal", "profile_right"): "turn_right",
    ("three_quarter", "frontal"): "return_frontal",
    ("three_quarter", "profile_left"): "turn_left",
    ("three_quarter", "profile_right"): "turn_right",
    ("profile_left", "frontal"): "turn_to_frontal",
    ("profile_left", "three_quarter"): "turn_to_three_quarter",
    ("profile_left", "profile_right"): "turn_full",
    ("profile_right", "frontal"): "turn_to_frontal",
    ("profile_right", "three_quarter"): "turn_to_three_quarter",
    ("profile_right", "profile_left"): "turn_full",
    # Pitch actions (pitch changes)
    ("neutral", "tilted_up"): "look_up",
    ("neutral", "tilted_down"): "look_down",
    ("tilted_up", "neutral"): "return_neutral",
    ("tilted_down", "neutral"): "return_neutral",
    ("tilted_up", "tilted_down"): "nod_full",
    ("tilted_down", "tilted_up"): "nod_full",
}
# Stable pose names: action name used for a pose angle that is held
# (looked up by build_action_timeline for segments it treats as stable)
STABLE_ACTION_NAMES = {
    "frontal": "frontal_stable",
    "three_quarter": "three_quarter_stable",
    "profile_left": "profile_left_stable",
    "profile_right": "profile_right_stable",
    "unknown": "pose_unknown",
}
# Complex action patterns (3+ transitions in short time)
# NOTE(review): detect_complex_actions below hard-codes the same sequences and
# frame limits instead of reading this table, and also accepts the reverse nod
# (down-up-down), which is not listed here — keep the two in sync or confirm
# whether this table is still used elsewhere.
COMPLEX_PATTERNS = {
    # Shake head: profile_left → profile_right → profile_left (or reverse)
    "shake_head": {
        "sequence": ["profile_left", "profile_right", "profile_left"],
        "min_frames": 5,
        "max_frames": 30,
    },
    "shake_head_reverse": {
        "sequence": ["profile_right", "profile_left", "profile_right"],
        "min_frames": 5,
        "max_frames": 30,
    },
    # Nod: tilted_up → tilted_down → tilted_up (or reverse)
    "nod_head": {
        "sequence": ["tilted_up", "tilted_down", "tilted_up"],
        "min_frames": 3,
        "max_frames": 20,
        "pitch_mode": True,  # sequence values are pitch labels, not angle labels
    },
}
def decode_pose_to_action(from_pose: str, to_pose: str) -> str:
    """
    Translate one pose transition into its action name

    Args:
        from_pose: Pose (angle or pitch label) before the change
        to_pose: Pose (angle or pitch label) after the change
    Returns:
        The name registered in POSE_TO_ACTION for this pair, or the generic
        "pose_change_<from>_to_<to>" when the pair is not registered
    """
    fallback = f"pose_change_{from_pose}_to_{to_pose}"
    return POSE_TO_ACTION.get((from_pose, to_pose), fallback)
def _collapse_pose_runs(pose_trace: List[Dict], key: str) -> List[Dict]:
    """Collapse consecutive samples that share the same sample[key] into runs with first/last frame."""
    runs = []
    for sample in pose_trace:
        value = sample.get(key)
        if runs and runs[-1]["value"] == value:
            runs[-1]["last_frame"] = sample["frame"]
        else:
            runs.append({
                "value": value,
                "first_frame": sample["frame"],
                "last_frame": sample["frame"],
            })
    return runs


def detect_complex_actions(pose_trace: List[Dict]) -> List[Dict]:
    """
    Detect complex action patterns (shake head, nod, etc.)

    The trace is first collapsed into runs of identical pose, then every
    three consecutive runs are compared with the A-B-A patterns. Matching on
    runs rather than on three adjacent samples lets a pose that is held for
    several samples still form a pattern; with one sample per frame, three
    adjacent samples span only 2 frames and could never satisfy the minimum
    durations below.

    An event spans from the last frame of the first run to the first frame
    of the third run. For three isolated samples this is exactly the span
    of the samples themselves.

    Args:
        pose_trace: Pose trace list (entries with "frame", "angle", "pitch")
    Returns:
        List of complex action events: all shake_head events in trace order,
        followed by all nod_head events in trace order
    """
    complex_actions = []

    def scan(key, patterns, action, min_frames, max_frames):
        runs = _collapse_pose_runs(pose_trace, key)
        for first, middle, last in zip(runs, runs[1:], runs[2:]):
            observed = [first["value"], middle["value"], last["value"]]
            for sequence, description in patterns:
                if observed != sequence:
                    continue
                start_frame = first["last_frame"]
                end_frame = last["first_frame"]
                duration_frames = end_frame - start_frame
                if min_frames <= duration_frames <= max_frames:
                    complex_actions.append({
                        "action": action,
                        "start_frame": start_frame,
                        "end_frame": end_frame,
                        "duration_frames": duration_frames,
                        "description": description,
                    })
                break

    # Shake head detection (angle-based): left-right-left or right-left-right in 5-30 frames
    scan(
        "angle",
        [
            (["profile_left", "profile_right", "profile_left"], "shake head left-right-left"),
            (["profile_right", "profile_left", "profile_right"], "shake head right-left-right"),
        ],
        "shake_head",
        5,
        30,
    )

    # Nod detection (pitch-based): up-down-up or down-up-down in 3-20 frames
    scan(
        "pitch",
        [
            (["tilted_up", "tilted_down", "tilted_up"], "nod head up-down"),
            (["tilted_down", "tilted_up", "tilted_down"], "nod head up-down"),
        ],
        "nod_head",
        3,
        20,
    )

    return complex_actions
def build_action_timeline(trace: Dict) -> Dict:
    """
    Build action timeline from pose_trace

    The timeline combines four kinds of entries:
      - "stable" / "transitional": one per pose segment (a run of samples
        with the same angle and pitch); segments of >= 10 frames are stable
      - "transition": one per angle change listed in pose_transitions, plus
        one per pitch change found in pose_trace (look_up, look_down, ...)
      - "complex": shake/nod events from detect_complex_actions

    Args:
        trace: Trace data with pose_trace, pose_transitions
    Returns:
        Action timeline dict with trace_id, action_timeline (sorted by frame,
        longer entries first within a frame), action_summary, complex_actions
    """
    pose_trace = trace.get("pose_trace", [])
    pose_transitions = trace.get("pose_transitions", [])
    if not pose_trace:
        return {
            "trace_id": trace.get("trace_id"),
            "action_timeline": [],
            "action_summary": {},
            "complex_actions": [],
        }

    action_timeline = []
    complex_actions = detect_complex_actions(pose_trace)

    # Build pose segments (stable periods): a new segment starts whenever
    # the angle or the pitch changes.
    pose_segments = []
    seg_angle = pose_trace[0]["angle"]
    seg_pitch = pose_trace[0]["pitch"]
    seg_start = pose_trace[0]["frame"]
    last_frame = seg_start
    for pose in pose_trace[1:]:
        if pose["angle"] != seg_angle or pose["pitch"] != seg_pitch:
            pose_segments.append({
                "angle": seg_angle,
                "pitch": seg_pitch,
                "start_frame": seg_start,
                "end_frame": last_frame,
                "duration_frames": last_frame - seg_start + 1,
            })
            seg_angle = pose["angle"]
            seg_pitch = pose["pitch"]
            seg_start = pose["frame"]
        last_frame = pose["frame"]
    # Add last segment
    pose_segments.append({
        "angle": seg_angle,
        "pitch": seg_pitch,
        "start_frame": seg_start,
        "end_frame": last_frame,
        "duration_frames": last_frame - seg_start + 1,
    })

    # Segment entries
    for seg in pose_segments:
        if seg["duration_frames"] >= 10:  # Stable pose (>= 10 frames)
            action_name = STABLE_ACTION_NAMES.get(seg["angle"], "pose_stable")
            # Add pitch modifier
            if seg["pitch"] != "neutral":
                action_name += f"_pitch_{seg['pitch']}"
            action_timeline.append({
                "frame": seg["start_frame"],
                "action": action_name,
                "duration_frames": seg["duration_frames"],
                "description": f"stable {seg['angle']} pose for {seg['duration_frames']} frames",
                "type": "stable",
            })
        else:  # Short pose (transitional)
            action_timeline.append({
                "frame": seg["start_frame"],
                "action": f"pose_{seg['angle']}_brief",
                "duration_frames": seg["duration_frames"],
                "description": f"brief {seg['angle']} pose for {seg['duration_frames']} frames",
                "type": "transitional",
            })

    # Angle transitions (precomputed by the tracker)
    for trans in pose_transitions:
        action_timeline.append({
            "frame": trans["frame"],
            "action": decode_pose_to_action(trans["from_angle"], trans["to_angle"]),
            "duration_frames": 1,  # Transition is instant
            "description": f"transition from {trans['from_angle']} to {trans['to_angle']}",
            "type": "transition",
        })

    # Pitch transitions: pose_transitions only records angle changes, so
    # pitch changes are derived here from consecutive samples.
    for before, after in zip(pose_trace, pose_trace[1:]):
        if after["pitch"] != before["pitch"]:
            action_timeline.append({
                "frame": after["frame"],
                "action": decode_pose_to_action(before["pitch"], after["pitch"]),
                "duration_frames": 1,  # Transition is instant
                "description": f"transition from {before['pitch']} to {after['pitch']}",
                "type": "transition",
            })

    # Complex actions
    for complex_act in complex_actions:
        action_timeline.append({
            "frame": complex_act["start_frame"],
            "action": complex_act["action"],
            "duration_frames": complex_act["duration_frames"],
            "description": complex_act["description"],
            "type": "complex",
        })

    # Order by frame; within a frame the longest entry comes first. The sort
    # is stable, so equal entries keep the insertion order above.
    action_timeline.sort(key=lambda x: (x["frame"], -x["duration_frames"]))

    # Build action summary
    action_counts = defaultdict(int)
    action_durations = defaultdict(float)
    for act in action_timeline:
        action_counts[act["action"]] += 1
        action_durations[act["action"]] += act["duration_frames"]

    stable_entries = sum(1 for act in action_timeline if act["type"] == "stable")
    action_summary = {
        "total_actions": len(action_timeline),
        "unique_actions": len(action_counts),
        "action_counts": dict(action_counts),
        "action_durations_frames": {k: round(v, 1) for k, v in action_durations.items()},
        "complex_action_count": len(complex_actions),
        # Share of timeline entries (not frames) that are stable segments.
        "stable_percentage": round(stable_entries / len(action_timeline) * 100, 1) if action_timeline else 0,
    }

    return {
        "trace_id": trace.get("trace_id"),
        "action_timeline": action_timeline,
        "action_summary": action_summary,
        "complex_actions": complex_actions,
    }
def generate_action_description(action_timeline: List[Dict]) -> str:
    """
    Summarize an action timeline as one human-readable sentence group

    Args:
        action_timeline: Action timeline list (entries with "type",
            "action" and "description")
    Returns:
        "No actions detected" for an empty timeline; otherwise up to three
        parts joined by ". ": the first 3 stable pose descriptions, the
        first 5 transition action names and all complex action names.
        Entries of any other type are ignored.
    """
    if not action_timeline:
        return "No actions detected"

    # Bucket the entries by type in a single pass, preserving timeline order.
    buckets = {"stable": [], "transition": [], "complex": []}
    for entry in action_timeline:
        bucket = buckets.get(entry["type"])
        if bucket is not None:
            bucket.append(entry)

    sentences = []

    stable_entries = buckets["stable"]
    if stable_entries:
        listed = ", ".join(f"{entry['description']}" for entry in stable_entries[:3])
        sentences.append(f"Stable poses: {listed}")

    transition_entries = buckets["transition"]
    if transition_entries:
        listed = ", ".join(entry["action"] for entry in transition_entries[:5])
        sentences.append(f"Transitions: {listed}")

    complex_entries = buckets["complex"]
    if complex_entries:
        listed = ", ".join(entry["action"] for entry in complex_entries)
        sentences.append(f"Complex actions: {listed}")

    return ". ".join(sentences)
def visualize_action_timeline(action_data: Dict, output_path: Optional[str] = None) -> None:
    """
    Visualize action timeline

    Draws one subplot per trace, ordered by trace id. Actions longer than
    one frame are drawn as horizontal bars; one-frame actions as dashed
    vertical lines with a rotated label.

    Args:
        action_data: Dict with "traces": {trace_id: {"action_timeline": [...]}}
        output_path: File to save the figure to; when omitted the figure is
            shown interactively instead
    """
    traces_data = action_data.get("traces", {})
    if not traces_data:
        print("No traces found")
        return

    def trace_order(item):
        # Trace ids are stored as strings: order numeric ids numerically
        # ("2" before "10"); any other key sorts after them, lexicographically.
        trace_key = str(item[0])
        if trace_key.isdecimal():
            return (0, int(trace_key), "")
        return (1, 0, trace_key)

    fig, axes = plt.subplots(len(traces_data), 1, figsize=(16, 3 * len(traces_data)))
    if len(traces_data) == 1:
        axes = [axes]  # subplots returns a bare Axes for a single row

    action_colors = {
        "frontal_stable": "green",
        "three_quarter_stable": "blue",
        "profile_left_stable": "orange",
        "profile_right_stable": "red",
        "turn_left": "purple",
        "turn_right": "purple",
        "turn_full": "darkred",
        "shake_head": "yellow",
        "nod_head": "cyan",
        "look_up": "lightgreen",
        "look_down": "brown",
    }

    for ax, (trace_id, data) in zip(axes, sorted(traces_data.items(), key=trace_order)):
        timeline = data["action_timeline"]
        if not timeline:
            continue

        # Plot action timeline as bars
        for act in timeline:
            color = action_colors.get(act["action"], "gray")
            if act["duration_frames"] > 1:
                ax.barh(
                    y=0,
                    width=act["duration_frames"],
                    left=act["frame"],
                    height=0.8,
                    color=color,
                    alpha=0.6,
                    edgecolor="black",
                    linewidth=0.5,
                )
                # Add label for stable actions
                if act["type"] == "stable" and act["duration_frames"] > 30:
                    ax.text(
                        act["frame"] + act["duration_frames"] / 2,
                        0,
                        act["action"],
                        ha="center",
                        va="center",
                        fontsize=8,
                        color="white",
                    )
            else:
                # Instant action (transition)
                ax.axvline(x=act["frame"], color=color, linestyle="--", alpha=0.8)
                ax.text(
                    act["frame"],
                    0.5,
                    act["action"],
                    fontsize=7,
                    rotation=90,
                    va="bottom",
                    ha="center",
                )

        ax.set_xlabel("Frame Number")
        ax.set_ylabel("Action")
        ax.set_title(f"Trace {trace_id} Action Timeline")
        ax.set_ylim(-0.5, 1)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if output_path:
        plt.savefig(output_path, dpi=150, bbox_inches="tight")
        print(f"\n✅ Visualization saved to: {output_path}")
    else:
        plt.show()
    # Release the figure; pyplot otherwise keeps it alive until closed.
    plt.close(fig)
def print_action_report(action_data: Dict) -> None:
    """
    Print action report

    For every trace (ordered by trace id) prints the action summary, the
    action counts, the first 10 timeline entries, any complex actions and a
    human-readable description.

    Args:
        action_data: Dict with "traces": {trace_id: action timeline dict}
    """
    traces_data = action_data.get("traces", {})

    def trace_order(item):
        # Trace ids are stored as strings: order numeric ids numerically
        # ("2" before "10"); any other key sorts after them, lexicographically.
        trace_key = str(item[0])
        if trace_key.isdecimal():
            return (0, int(trace_key), "")
        return (1, 0, trace_key)

    print("\n" + "=" * 70)
    print("Pose Action Decoder Report")
    print("=" * 70)

    for trace_id, data in sorted(traces_data.items(), key=trace_order):
        print(f"\n{'='*70}")
        print(f"Trace {trace_id}")
        print(f"{'='*70}")

        summary = data["action_summary"]
        print("\nSummary:")
        print(f"  Total Actions: {summary['total_actions']}")
        print(f"  Unique Actions: {summary['unique_actions']}")
        print(f"  Complex Actions: {summary['complex_action_count']}")
        print(f"  Stable Percentage: {summary['stable_percentage']}%")

        print("\nAction Counts:")
        for action, count in sorted(summary["action_counts"].items(), key=lambda x: x[1], reverse=True):
            print(f"  {action}: {count}")

        # Heading text: "first 10"
        print("\nAction Timeline (前 10 个):")
        timeline = data["action_timeline"]
        for act in timeline[:10]:
            print(f"  Frame {act['frame']}: {act['action']} ({act['type']}, {act['duration_frames']} frames)")

        if data["complex_actions"]:
            print("\nComplex Actions:")
            for act in data["complex_actions"]:
                print(f"  {act['action']}: frames {act['start_frame']}-{act['end_frame']} ({act['duration_frames']} frames)")

        # Generate description
        desc = generate_action_description(data["action_timeline"])
        print("\nHuman-readable Description:")
        print(f"  {desc}")
def main():
    """
    Command-line entry point: load face_traced.json, decode every trace
    (or only --trace-id) into an action timeline, print the report and
    optionally write the action JSON and the timeline plot.
    """
    parser = argparse.ArgumentParser(description="Decode pose_trace into action names")
    parser.add_argument("--face-json", required=True, help="Path to face_traced.json")
    parser.add_argument("--output-json", help="Output action data JSON")
    parser.add_argument("--output-plot", help="Output action timeline plot PNG")
    parser.add_argument("--trace-id", type=int, help="Analyze specific trace only")
    args = parser.parse_args()

    print("=" * 70)
    print("Pose Action Decoder")
    print("=" * 70)

    with open(args.face_json, encoding="utf-8") as f:
        face_data = json.load(f)

    traces = face_data.get("traces", {})
    if not traces:
        print("No traces found in face_traced.json")
        return

    # Filter by trace_id if specified. Compare against None: trace id 0 is
    # valid, and a plain truthiness test would ignore "--trace-id 0".
    if args.trace_id is not None:
        selected = traces.get(str(args.trace_id))
        if not selected:
            print(f"Trace {args.trace_id} not found")
            return
        traces = {str(args.trace_id): selected}

    print(f"\nAnalyzing {len(traces)} traces...")
    action_data = {"traces": {}}
    for trace_id_str, trace in traces.items():
        action_data["traces"][trace_id_str] = build_action_timeline(trace)

    print_action_report(action_data)

    if args.output_json:
        with open(args.output_json, "w", encoding="utf-8") as f:
            json.dump(action_data, f, indent=2)
        print(f"\n✅ Action data saved to: {args.output_json}")

    if args.output_plot:
        visualize_action_timeline(action_data, args.output_plot)
# Run the pose action decoder CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,402 @@
#!/opt/homebrew/bin/python3.11
"""
Pose Analyzer - Multi-feature Pose Angle Classification
Purpose:
1. Calculate pose angle from 5-point landmarks (InsightFace kps)
2. Use multiple features for accurate classification:
- nose_to_eye_ratio: nose distance relative to eye width
- eye_slope: eye line slope (pitch detection)
- nose_offset: nose position relative to eye center
- mouth_symmetry: mouth corners symmetry
3. Provide confidence score for classification
Landmarks Order (InsightFace kps):
- 0: left eye
- 1: right eye
- 2: nose
- 3: left mouth corner
- 4: right mouth corner
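Angles (V2 rules, see classify_angle_from_features; ratio = nose_to_eye_ratio,
offset = nose offset normalized by eye width):
- frontal: nose near center (ratio < 0.35 and |offset| < 0.15)
- three_quarter: moderate offset (ratio < 0.55 and |offset| < 0.25), or ratio >= 0.55 with |offset| <= 0.1
- profile_left: nose left of eye center (ratio >= 0.55 and offset < -0.1)
- profile_right: nose right of eye center (ratio >= 0.55 and offset > 0.1)
- unknown: any other combination
The legacy V1 classifier uses the ratio alone: frontal (< 0.4),
three_quarter (0.4 - 0.6), profile_left / profile_right (>= 0.6, by nose side).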
Usage:
from pose_analyzer import calculate_pose_angle_v2
pose_result = calculate_pose_angle_v2(landmarks)
print(f"Angle: {pose_result['angle']}, Confidence: {pose_result['confidence']}")
"""
import numpy as np
from typing import Dict, List, Optional, Tuple
def calculate_nose_to_eye_ratio(landmarks: List) -> Tuple[float, float, float]:
    """
    Relate the nose-to-eye-center distance to the distance between the eyes

    Uses landmarks 0 (left eye), 1 (right eye) and 2 (nose); only the first
    two coordinates of each point are read.

    Returns:
        (ratio, eye_width, nose_to_eye_distance); all zeros when fewer than
        5 landmarks are given, and ratio 0.0 when the eye width is zero
    """
    if len(landmarks) < 5:
        return (0.0, 0.0, 0.0)

    eye_l, eye_r, nose_tip = (np.array(point[:2]) for point in landmarks[:3])
    midpoint = (eye_l + eye_r) / 2
    span = np.linalg.norm(eye_r - eye_l)
    gap = np.linalg.norm(nose_tip - midpoint)

    if span > 0:
        return (gap / span, span, gap)
    return (0.0, span, gap)
def calculate_eye_slope(landmarks: List) -> Tuple[float, float]:
    """
    Slope of the line from the left eye to the right eye

    The caller uses this value for its pitch label:
    Positive slope = reported as head tilted down
    Negative slope = reported as head tilted up
    NOTE(review): the slope of the inter-eye line looks like a measure of
    in-plane tilt rather than up/down pitch — confirm the intended meaning.

    Returns:
        (slope, angle_degrees); (0.0, 0.0) when fewer than 5 landmarks are
        given, and slope 0.0 when both eyes share the same x coordinate
    """
    if len(landmarks) < 5:
        return (0.0, 0.0)

    delta = np.array(landmarks[1][:2]) - np.array(landmarks[0][:2])
    run, rise = delta[0], delta[1]

    if run != 0:
        slope = rise / run
    else:
        slope = 0.0  # vertical eye line: slope is undefined, report 0.0

    return (slope, np.arctan(slope) * 180 / np.pi)
def calculate_nose_offset(landmarks: List) -> Tuple[float, float]:
    """
    Horizontal position of the nose relative to the midpoint between the eyes

    Returns:
        (offset_x, normalized_offset): offset_x is nose x minus eye-center x
        (negative = nose left of center); normalized_offset is offset_x
        divided by the eye width, or 0.0 when the eye width is zero.
        (0.0, 0.0) when fewer than 5 landmarks are given.
    """
    if len(landmarks) < 5:
        return (0.0, 0.0)

    eye_l, eye_r, nose_tip = (np.array(point[:2]) for point in landmarks[:3])
    midpoint = (eye_l + eye_r) / 2
    span = np.linalg.norm(eye_r - eye_l)
    shift = nose_tip[0] - midpoint[0]

    if span > 0:
        return (shift, shift / span)
    return (shift, 0.0)
def calculate_mouth_symmetry(landmarks: List) -> Tuple[float, float]:
    """
    Compare how far each mouth corner is from the nose

    A frontal face has both corners at a similar distance from the nose;
    the more the distances differ, the lower the score.

    Returns:
        (symmetry_score, mouth_width): symmetry_score is the shorter
        nose-to-corner distance divided by the longer one (1.0 when both
        are zero); mouth_width is the distance between the corners.
        (1.0, 0.0) when fewer than 5 landmarks are given.
    """
    if len(landmarks) < 5:
        return (1.0, 0.0)

    nose_tip, corner_l, corner_r = (np.array(point[:2]) for point in landmarks[2:5])
    width = np.linalg.norm(corner_r - corner_l)
    reach_l = np.linalg.norm(corner_l - nose_tip)
    reach_r = np.linalg.norm(corner_r - nose_tip)

    longer = max(reach_l, reach_r)
    if longer > 0:
        score = min(reach_l, reach_r) / longer
    else:
        score = 1.0
    return (score, width)
def calculate_jaw_visibility_hint(landmarks: List) -> float:
    """
    Derive a jaw-visibility hint from the vertical spacing of eyes, nose and mouth

    The hint is the vertical nose-to-mouth distance divided by the vertical
    eye-to-nose distance, clamped to the range 0.0 - 1.0.

    Returns:
        visibility_hint (0.0 - 1.0); 0.5 when fewer than 5 landmarks are
        given or when the nose is not below the eye line
    """
    if len(landmarks) < 5:
        return 0.5

    # Only the y coordinate of each of the five points is needed.
    eye_l_y, eye_r_y, nose_y, mouth_l_y, mouth_r_y = (
        np.array(point[:2])[1] for point in landmarks[:5]
    )
    eye_line_y = (eye_l_y + eye_r_y) / 2
    mouth_line_y = (mouth_l_y + mouth_r_y) / 2

    lower_gap = mouth_line_y - nose_y
    upper_gap = nose_y - eye_line_y
    if upper_gap > 0:
        ratio = lower_gap / upper_gap
    else:
        ratio = 0.5

    # Clamp to [0.0, 1.0]
    floored = ratio if ratio > 0.0 else 0.0
    return floored if floored < 1.0 else 1.0
def classify_angle_from_features(
    ratio: float,
    nose_offset_norm: float,
    mouth_symmetry: float,
    eye_slope: float,
) -> Tuple[str, float]:
    """
    Turn the measured features into an angle label with a fixed confidence

    Rules, checked in order:
      - ratio < 0.35 and |offset| < 0.15 -> frontal (0.95)
      - ratio < 0.55 and |offset| < 0.25 -> three_quarter (0.85)
      - ratio >= 0.55: offset < -0.1 -> profile_left, offset > 0.1 ->
        profile_right (0.90 when mouth_symmetry < 0.85, else 0.75);
        otherwise three_quarter (0.70)
      - anything else -> unknown (0.50)

    eye_slope is accepted but does not influence the result.

    Returns:
        (angle_type, confidence)
    """
    skew = abs(nose_offset_norm)

    if ratio < 0.35 and skew < 0.15:
        return ("frontal", 0.95)
    if ratio < 0.55 and skew < 0.25:
        return ("three_quarter", 0.85)
    if not ratio >= 0.55:
        return ("unknown", 0.50)

    # Large ratio: the sign of the offset decides the side.
    if nose_offset_norm < -0.1:
        side = "profile_left"
    elif nose_offset_norm > 0.1:
        side = "profile_right"
    else:
        return ("three_quarter", 0.70)

    # An asymmetric mouth supports the profile reading.
    certainty = 0.90 if mouth_symmetry < 0.85 else 0.75
    return (side, certainty)
def calculate_pose_angle_v2(landmarks: List) -> Dict:
    """
    Calculate pose angle using multi-feature analysis (V2)

    This is an improved version that uses multiple features:
    - nose_to_eye_ratio
    - eye_slope (pitch)
    - nose_offset (yaw)
    - mouth_symmetry

    Args:
        landmarks: List of 5 points [[x, y], [x, y], ...]
            Order: left_eye, right_eye, nose, left_mouth, right_mouth
            None or fewer than 5 points yields the "unknown" result.
    Returns:
        Dict with (the same keys in every case):
        - angle: 'frontal', 'three_quarter', 'profile_left', 'profile_right', 'unknown'
        - confidence: 0.0 - 1.0
        - pitch: 'tilted_down' (eye_slope > 0.15), 'tilted_up'
          (eye_slope < -0.15) or 'neutral'
        - features: Dict of all calculated features (empty for 'unknown'
          caused by missing landmarks)
        - method: 'v2_multi_feature'
        - landmarks_count: number of landmarks received
    """
    # `is None` rather than truthiness, so array-like inputs are accepted too.
    landmark_count = 0 if landmarks is None else len(landmarks)
    if landmark_count < 5:
        return {
            "angle": "unknown",
            "confidence": 0.0,
            # Same keys as the normal result, so consumers need no special case.
            "pitch": "neutral",
            "features": {},
            "method": "v2_multi_feature",
            "landmarks_count": landmark_count,
        }

    ratio, eye_width, nose_to_eye = calculate_nose_to_eye_ratio(landmarks)
    eye_slope, eye_angle = calculate_eye_slope(landmarks)
    nose_offset, nose_offset_norm = calculate_nose_offset(landmarks)
    mouth_symmetry, mouth_width = calculate_mouth_symmetry(landmarks)
    jaw_hint = calculate_jaw_visibility_hint(landmarks)

    angle, confidence = classify_angle_from_features(
        ratio=ratio,
        nose_offset_norm=nose_offset_norm,
        mouth_symmetry=mouth_symmetry,
        eye_slope=eye_slope,
    )

    if eye_slope > 0.15:
        pitch = "tilted_down"
    elif eye_slope < -0.15:
        pitch = "tilted_up"
    else:
        pitch = "neutral"

    return {
        "angle": angle,
        "confidence": confidence,
        "pitch": pitch,
        "features": {
            "nose_to_eye_ratio": round(ratio, 4),
            "eye_width": round(eye_width, 2),
            "nose_to_eye_dist": round(nose_to_eye, 2),
            "eye_slope": round(eye_slope, 4),
            "eye_angle_deg": round(eye_angle, 2),
            "nose_offset_x": round(nose_offset, 2),
            "nose_offset_norm": round(nose_offset_norm, 4),
            "mouth_symmetry": round(mouth_symmetry, 4),
            "mouth_width": round(mouth_width, 2),
            "jaw_visibility_hint": round(jaw_hint, 4),
        },
        "method": "v2_multi_feature",
        "landmarks_count": landmark_count,
    }
def calculate_pose_angle_v1(landmarks: List) -> Dict:
    """
    Legacy classifier (V1): decides the angle from the nose-to-eye ratio alone

    Kept for comparison with V2. ratio < 0.4 is frontal, ratio < 0.6 is
    three_quarter; above that the side is chosen by whether the nose is
    left of the eye center (profile_left) or not (profile_right).

    Returns:
        {"angle": "unknown", "confidence": 0.0} for fewer than 5 landmarks;
        otherwise angle, a fixed confidence of 0.7, the rounded ratio and
        the method name
    """
    if len(landmarks) < 5:
        return {"angle": "unknown", "confidence": 0.0}

    eye_l, eye_r, nose_tip = (np.array(point[:2]) for point in landmarks[:3])
    midpoint = (eye_l + eye_r) / 2
    span = np.linalg.norm(eye_r - eye_l)
    gap = np.linalg.norm(nose_tip - midpoint)
    ratio = gap / span if span > 0 else 0.0

    if ratio < 0.4:
        label = "frontal"
    elif ratio < 0.6:
        label = "three_quarter"
    else:
        label = "profile_left" if nose_tip[0] < midpoint[0] else "profile_right"

    return {
        "angle": label,
        "confidence": 0.7,
        "ratio": round(ratio, 4),
        "method": "v1_single_feature",
    }
def compare_v1_v2(landmarks: List) -> Dict:
    """
    Run both classifiers on the same landmarks and report how they differ

    Intended for validation and debugging.

    Returns:
        Dict with the V1 result ("v1"), the V2 result ("v2"), whether both
        chose the same angle ("agreement") and the V2 confidence minus the
        V1 confidence ("confidence_improvement")
    """
    legacy = calculate_pose_angle_v1(landmarks)
    current = calculate_pose_angle_v2(landmarks)
    same_angle = legacy["angle"] == current["angle"]
    confidence_gain = current["confidence"] - legacy["confidence"]
    return {
        "v1": legacy,
        "v2": current,
        "agreement": same_angle,
        "confidence_improvement": confidence_gain,
    }
def batch_classify_angles(face_json_path: str) -> Dict:
    """
    Classify the pose angle of every face stored in a face.json file

    Faces without at least 5 landmarks are skipped.

    Args:
        face_json_path: Path to the face.json file to read
    Returns:
        Dict with the number of classified faces, the count per angle,
        average / minimum / maximum confidence (0.0 when no face was
        classified) and the per-face results, each tagged with its frame
        key and face index
    """
    import json

    with open(face_json_path) as handle:
        frames = json.load(handle).get("frames", {})

    classified = []
    for frame_key, frame_data in frames.items():
        for face_idx, face in enumerate(frame_data.get("faces", [])):
            landmarks = face.get("landmarks", [])
            if landmarks and len(landmarks) >= 5:
                pose = calculate_pose_angle_v2(landmarks)
                pose["frame"] = frame_key
                pose["face_index"] = face_idx
                classified.append(pose)

    distribution = {}
    for pose in classified:
        label = pose["angle"]
        distribution[label] = distribution.get(label, 0) + 1

    confidences = [pose["confidence"] for pose in classified]
    has_data = bool(confidences)
    return {
        "total_faces": len(classified),
        "angle_distribution": distribution,
        "confidence_avg": np.mean(confidences) if has_data else 0.0,
        "confidence_min": np.min(confidences) if has_data else 0.0,
        "confidence_max": np.max(confidences) if has_data else 0.0,
        "results": classified,
    }
if __name__ == "__main__":
    # Command-line entry point: either run the built-in smoke checks
    # (--test) or classify every face in a face.json file (--face-json).
    import argparse

    cli = argparse.ArgumentParser(description="Pose Analyzer")
    cli.add_argument("--face-json", help="Path to face.json for batch analysis")
    cli.add_argument("--test", action="store_true", help="Run unit tests")
    options = cli.parse_args()

    banner = "=" * 60

    if options.test:
        print(banner)
        print("Pose Analyzer Unit Tests")
        print(banner)

        # Three synthetic 5-point faces that differ only in nose position.
        sample_faces = [
            [[100, 100], [120, 100], [110, 120], [105, 130], [115, 130]],
            [[100, 100], [120, 100], [125, 120], [105, 130], [115, 130]],
            [[100, 100], [120, 100], [95, 120], [105, 130], [115, 130]],
        ]
        for case_no, points in enumerate(sample_faces, start=1):
            outcome = calculate_pose_angle_v2(points)
            print(f"\nTest {case_no}: {outcome['angle']} (confidence: {outcome['confidence']:.2f})")
            print(f" Features: {outcome['features']}")
    elif options.face_json:
        print(banner)
        print("Batch Pose Analysis")
        print(banner)

        summary = batch_classify_angles(options.face_json)
        print(f"\nTotal faces: {summary['total_faces']}")
        print(f"Angle distribution: {summary['angle_distribution']}")
        print(f"Confidence: avg={summary['confidence_avg']:.2f}, min={summary['confidence_min']:.2f}, max={summary['confidence_max']:.2f}")
    else:
        print("Please provide --face-json or --test")

View File

@@ -0,0 +1,239 @@
#!/opt/homebrew/bin/python3.11
"""
Pose Transition Analyzer - Analyze pose changes within traces
Purpose:
1. Visualize pose transitions over time
2. Calculate transition frequency and duration
3. Identify pose stability patterns
Output:
1. Pose transition timeline
2. Pose duration statistics
3. Stability score per trace
"""
import sys
import json
import argparse
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, List
from collections import defaultdict
def analyze_pose_transitions(face_data: Dict) -> Dict:
    """
    Analyze pose transitions for all traces

    For every trace with at least two pose samples, the pose timeline is
    split into runs of consecutive samples that share the same angle, and
    summary statistics are derived from those runs.

    Args:
        face_data: Dict with a ``traces`` mapping (trace id -> trace).
            Each trace is read for ``pose_trace`` (list of samples with
            ``frame``, ``angle`` and ``confidence``), ``pose_transitions``
            and ``duration_seconds``.

    Returns:
        Dict keyed by integer trace id. Each value holds
        ``total_transitions``, ``transition_frequency`` (per second),
        ``stability_score`` (0-1, higher = more stable), ``pose_segments``,
        ``pose_avg_duration`` (angle -> mean segment length in frames),
        ``longest_stable_pose`` and ``transition_events``.
        Empty dict when there are no traces.

    Raises:
        KeyError: If an analyzed trace has no ``duration_seconds``.
        ValueError: If a trace id cannot be converted to ``int``.
    """
    traces = face_data.get("traces", {})
    if not traces:
        return {}

    analysis = {}
    for trace_id_str, trace in traces.items():
        trace_id = int(trace_id_str)
        pose_trace = trace.get("pose_trace", [])
        transitions = trace.get("pose_transitions", [])
        if len(pose_trace) < 2:
            continue

        # Pose duration analysis.
        # Run boundaries are tracked as *list indices*, not frame numbers:
        # frame numbers may have gaps (e.g. sampled video), so a frame
        # difference cannot be used to slice pose_trace.
        pose_segments = []
        run_start = 0
        for idx in range(1, len(pose_trace) + 1):
            run_continues = (
                idx < len(pose_trace)
                and pose_trace[idx]["angle"] == pose_trace[run_start]["angle"]
            )
            if run_continues:
                continue

            # Close the run [run_start, idx); idx == len(pose_trace)
            # closes the final run.
            run = pose_trace[run_start:idx]
            first_frame = run[0]["frame"]
            last_frame = run[-1]["frame"]
            pose_segments.append({
                "angle": run[0]["angle"],
                "start_frame": first_frame,
                "end_frame": last_frame,
                "duration_frames": last_frame - first_frame + 1,
                "avg_confidence": float(np.mean([p["confidence"] for p in run])),
            })
            run_start = idx

        # Transition frequency (transitions per second)
        duration_seconds = trace["duration_seconds"]
        transition_frequency = len(transitions) / duration_seconds if duration_seconds > 0 else 0

        # Stability score (inverse of transition frequency);
        # 2 transitions/second or more = fully unstable
        stability_score = 1.0 - min(transition_frequency / 2.0, 1.0)

        # Average segment duration per pose angle
        durations_by_angle = {}
        for segment in pose_segments:
            durations_by_angle.setdefault(segment["angle"], []).append(segment["duration_frames"])
        pose_avg_duration = {
            angle: round(float(np.mean(durations)), 1)
            for angle, durations in durations_by_angle.items()
        }

        analysis[trace_id] = {
            "trace_id": trace_id,
            "total_transitions": len(transitions),
            "transition_frequency": round(transition_frequency, 3),  # transitions per second
            "stability_score": round(stability_score, 3),  # 0-1, higher = more stable
            "pose_segments": pose_segments,
            "pose_avg_duration": pose_avg_duration,
            "longest_stable_pose": max(pose_segments, key=lambda x: x["duration_frames"]),
            "transition_events": transitions,
        }

    return analysis
def visualize_pose_transitions(face_data: Dict, output_path: str = None) -> None:
    """
    Visualize pose transitions for all traces

    Draws one subplot per trace (longest trace first): a colour-coded pose
    timeline, dashed vertical markers at every transition, and the pose
    confidence on a secondary y-axis.

    Args:
        face_data: Dict with a ``traces`` mapping. Each trace is read for
            ``duration_frames``, ``trace_id``, ``start_frame``,
            ``end_frame``, ``pose_trace`` and ``pose_transitions``.
        output_path: When given, the figure is saved there as an image and
            then closed; otherwise it is shown interactively.
    """
    traces = face_data.get("traces", {})
    if not traces:
        print("No traces found")
        return

    sorted_traces = sorted(traces.values(), key=lambda x: x["duration_frames"], reverse=True)
    fig, axes = plt.subplots(len(sorted_traces), 1, figsize=(16, 4 * len(sorted_traces)))
    # With a single row plt.subplots returns a bare Axes, not a sequence.
    if len(sorted_traces) == 1:
        axes = [axes]

    pose_colors = {
        "frontal": "green",
        "three_quarter": "blue",
        "profile_left": "orange",
        "profile_right": "red",
        "unknown": "gray",
    }

    for ax, trace in zip(axes, sorted_traces):
        trace_id = trace["trace_id"]
        pose_trace = trace.get("pose_trace", [])
        if not pose_trace:
            continue

        frames = [p["frame"] for p in pose_trace]
        angles = [p["angle"] for p in pose_trace]
        confidences = [p["confidence"] for p in pose_trace]

        # Plot pose angle timeline: each span between two consecutive
        # samples is coloured by the pose of the earlier sample.
        for i in range(len(frames) - 1):
            ax.fill_between(
                [frames[i], frames[i + 1]],
                [0, 0],
                [1, 1],
                color=pose_colors.get(angles[i], "gray"),
                alpha=0.6,
            )

        # Mark transitions
        for t in trace.get("pose_transitions", []):
            ax.axvline(x=t["frame"], color="black", linestyle="--", alpha=0.5, linewidth=1)
            # Separate the two angle names; concatenating them directly
            # produced unreadable labels such as "frontalprofile_left".
            ax.text(t["frame"], 1.05, f"{t['from_angle']}→{t['to_angle']}",
                    fontsize=8, rotation=90, va="bottom", ha="center")

        # Plot confidence line on a secondary axis
        ax2 = ax.twinx()
        ax2.plot(frames, confidences, color="purple", linewidth=1, alpha=0.7, label="Confidence")
        ax2.set_ylabel("Confidence", color="purple")
        ax2.set_ylim(0, 1)

        ax.set_xlabel("Frame Number")
        ax.set_ylabel("Pose Angle")
        ax.set_title(f"Trace {trace_id} Pose Timeline (Frames {trace['start_frame']}-{trace['end_frame']})")
        ax.set_ylim(0, 1.2)

        # Add pose legend (only poses that occur in this trace)
        legend_elements = [
            plt.Rectangle((0, 0), 1, 1, fc=pose_colors.get(pose, "gray"), alpha=0.6, label=pose)
            for pose in set(angles)
        ]
        ax.legend(handles=legend_elements, loc="upper right", fontsize=8)

    plt.tight_layout()
    if output_path:
        plt.savefig(output_path, dpi=150, bbox_inches="tight")
        # Release the figure; pyplot otherwise keeps it alive, which
        # accumulates memory when this function is called repeatedly.
        plt.close(fig)
        print(f"\n✅ Visualization saved to: {output_path}")
    else:
        plt.show()
def print_transition_analysis(analysis: Dict) -> None:
    """
    Print a human-readable report of the transition analysis.

    Traces are listed in ascending key order. For each trace the summary
    statistics, the per-angle average duration and at most the first five
    pose segments are written to stdout.

    Args:
        analysis: Mapping of trace id -> per-trace analysis dict with the
            keys ``total_transitions``, ``transition_frequency``,
            ``stability_score``, ``longest_stable_pose``,
            ``pose_avg_duration`` and ``pose_segments``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Pose Transition Analysis")
    print(rule)

    for trace_id in sorted(analysis):
        report = analysis[trace_id]

        print(f"\n=== Trace {trace_id} ===")
        print(f"Total Transitions: {report['total_transitions']}")
        print(f"Transition Frequency: {report['transition_frequency']} transitions/second")
        print(f"Stability Score: {report['stability_score']} (0-1, higher = more stable)")

        longest = report["longest_stable_pose"]
        print(f"Longest Stable Pose: {longest['angle']} ({longest['duration_frames']} frames)")

        print("\nPose Average Duration:")
        for angle, mean_frames in report["pose_avg_duration"].items():
            print(f" {angle}: {mean_frames} frames")

        segments = report["pose_segments"]
        print(f"\nPose Segments (共 {len(segments)} 个):")
        # Only the first five segments are shown to keep the report short.
        for segment in segments[:5]:
            span = f"{segment['start_frame']}-{segment['end_frame']}"
            print(f" {segment['angle']}: frames {span} ({segment['duration_frames']} frames, confidence: {segment['avg_confidence']:.3f})")
def main():
    """
    Command-line entry point for the pose transition analyzer.

    Loads the traced face JSON, prints the transition report, and
    optionally writes the analysis as JSON and/or renders the timeline
    plot, depending on the arguments given.
    """
    cli = argparse.ArgumentParser(description="Analyze pose transitions in face traces")
    cli.add_argument("--face-json", required=True, help="Path to face_traced.json")
    cli.add_argument("--output-plot", help="Output plot path (PNG)")
    cli.add_argument("--output-json", help="Output analysis JSON path")
    options = cli.parse_args()

    with open(options.face_json) as source:
        face_data = json.load(source)

    rule = "=" * 60
    print(rule)
    print("Pose Transition Analyzer")
    print(rule)

    report = analyze_pose_transitions(face_data)
    print_transition_analysis(report)

    json_target = options.output_json
    if json_target:
        with open(json_target, "w") as sink:
            json.dump(report, sink, indent=2)
        print(f"\n✅ Analysis saved to: {json_target}")

    plot_target = options.output_plot
    if plot_target:
        visualize_pose_transitions(face_data, plot_target)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,377 @@
#!/opt/homebrew/bin/python3.11
"""
MediaPipe Test Script - Test all MediaPipe modules
Test modules:
1. Face Mesh (468 keypoints)
2. Pose (33 keypoints)
3. Hands (21 keypoints per hand)
4. Holistic (Face + Pose + Hands)
"""
import sys
import cv2
import numpy as np
import mediapipe as mp
from pathlib import Path
def test_face_mesh(test_image_path: str = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"):
    """
    Test MediaPipe Face Mesh (468 keypoints)

    Creates a Face Mesh model, runs it on one image and prints a few key
    landmarks plus the eye aspect ratio (EAR) and mouth aspect ratio (MAR).

    Args:
        test_image_path: Image to run detection on. Detection is skipped
            (with a message) when the file is missing or cannot be decoded.
    """
    print("=" * 60)
    print("Testing MediaPipe Face Mesh")
    print("=" * 60)

    mp_face_mesh = mp.solutions.face_mesh

    # Create Face Mesh model
    face_mesh = mp_face_mesh.FaceMesh(
        static_image_mode=True,
        max_num_faces=1,
        refine_landmarks=True,  # Enable iris detection
        min_detection_confidence=0.5,
    )
    print("✅ Face Mesh model created")

    try:
        # cv2.imread returns None (instead of raising) for unreadable files.
        image = cv2.imread(test_image_path) if Path(test_image_path).exists() else None
        if image is None:
            print(f"⚠️ Test image missing or unreadable, skipping detection: {test_image_path}")
        else:
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            results = face_mesh.process(image_rgb)

            if results.multi_face_landmarks:
                face_landmarks = results.multi_face_landmarks[0]
                points = face_landmarks.landmark
                num_landmarks = len(points)
                print(f"✅ Face detected: {num_landmarks} landmarks")

                # Key landmark indices
                key_indices = {
                    "nose_tip": 1,
                    "left_eye_center": 33,
                    "right_eye_center": 263,
                    "left_iris_center": 468,
                    "right_iris_center": 473,
                    "mouth_top": 13,
                    "mouth_bottom": 14,
                    "mouth_left": 61,
                    "mouth_right": 291,
                }
                print("\nKey landmarks:")
                for name, idx in key_indices.items():
                    # Iris indices only exist when refined landmarks are returned.
                    if idx < num_landmarks:
                        landmark = points[idx]
                        print(f" {name} ({idx}): x={landmark.x:.3f}, y={landmark.y:.3f}")

                # Calculate Eye Aspect Ratio (EAR) = eyelid opening / eye width.
                # In the MediaPipe face mesh, 33 and 133 are the two corners
                # of one eye, while 159 and 145 are its upper and lower lid.
                # The opening must therefore come from 159/145 and the width
                # from 33/133 (the previous code had the two pairs swapped).
                eye_corner_a = points[33]
                eye_corner_b = points[133]
                eye_lid_top = points[159]
                eye_lid_bottom = points[145]
                vertical_dist = abs(eye_lid_bottom.y - eye_lid_top.y)
                horizontal_dist = abs(eye_corner_b.x - eye_corner_a.x)
                ear_left = vertical_dist / horizontal_dist if horizontal_dist > 0 else 0

                print(f"\nEye Aspect Ratio (EAR):")
                print(f" Left eye EAR: {ear_left:.3f}")
                print(f" Interpretation: {'wide_open' if ear_left > 0.35 else 'normal' if ear_left > 0.2 else 'closed'}")

                # Calculate Mouth Aspect Ratio (MAR) = mouth opening / mouth width
                mouth_top = points[13]
                mouth_bottom = points[14]
                mouth_left = points[61]
                mouth_right = points[291]
                mouth_height = abs(mouth_bottom.y - mouth_top.y)
                mouth_width = abs(mouth_right.x - mouth_left.x)
                mar = mouth_height / mouth_width if mouth_width > 0 else 0

                print(f"\nMouth Aspect Ratio (MAR):")
                print(f" MAR: {mar:.3f}")
                print(f" Interpretation: {'open' if mar > 0.5 else 'closed' if mar < 0.2 else 'slightly_open'}")
            else:
                print("❌ No face detected")
    finally:
        # Always release the model, even if detection raised.
        face_mesh.close()

    print("\n✅ Face Mesh test completed")
def test_pose(test_image_path: str = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"):
    """
    Test MediaPipe Pose (33 keypoints)

    Creates a Pose model, runs it on one image and prints key body
    keypoints plus the right elbow and left knee joint angles.

    Args:
        test_image_path: Image to run detection on. Detection is skipped
            (with a message) when the file is missing or cannot be decoded.
    """
    print("\n" + "=" * 60)
    print("Testing MediaPipe Pose")
    print("=" * 60)

    def calculate_angle(p1, p2, p3):
        """Return the angle at p2 (degrees) between the rays p2->p1 and p2->p3.

        Returns NaN when either ray has zero length (angle undefined).
        """
        v1 = np.array([p1.x, p1.y]) - np.array([p2.x, p2.y])
        v2 = np.array([p3.x, p3.y]) - np.array([p2.x, p2.y])
        norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
        if norm_product == 0:
            return float("nan")
        # Rounding can push the cosine marginally outside [-1, 1] for a
        # perfectly straight or fully folded joint; arccos would then
        # return NaN, so clamp first.
        cosine = np.clip(np.dot(v1, v2) / norm_product, -1.0, 1.0)
        return float(np.degrees(np.arccos(cosine)))

    mp_pose = mp.solutions.pose
    pose = mp_pose.Pose(
        static_image_mode=True,
        model_complexity=2,  # Full model
        enable_segmentation=False,
        min_detection_confidence=0.5,
    )
    print("✅ Pose model created")

    try:
        # cv2.imread returns None (instead of raising) for unreadable files.
        image = cv2.imread(test_image_path) if Path(test_image_path).exists() else None
        if image is None:
            print(f"⚠️ Test image missing or unreadable, skipping detection: {test_image_path}")
        else:
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            results = pose.process(image_rgb)

            if results.pose_landmarks:
                landmarks = results.pose_landmarks.landmark
                num_landmarks = len(landmarks)
                print(f"✅ Pose detected: {num_landmarks} keypoints")

                # Key keypoints
                key_indices = {
                    "nose": 0,
                    "left_shoulder": 11,
                    "right_shoulder": 12,
                    "left_elbow": 13,
                    "right_elbow": 14,
                    "left_wrist": 15,
                    "right_wrist": 16,
                    "left_hip": 23,
                    "right_hip": 24,
                    "left_knee": 25,
                    "right_knee": 26,
                    "left_ankle": 27,
                    "right_ankle": 28,
                }
                print("\nKey keypoints:")
                for name, idx in key_indices.items():
                    landmark = landmarks[idx]
                    print(f" {name} ({idx}): x={landmark.x:.3f}, y={landmark.y:.3f}, visibility={landmark.visibility:.2f}")

                # Right arm angle (shoulder - elbow - wrist)
                right_shoulder = landmarks[12]
                right_elbow = landmarks[14]
                right_wrist = landmarks[16]
                right_elbow_angle = calculate_angle(right_shoulder, right_elbow, right_wrist)
                print(f"\nRight elbow angle: {right_elbow_angle:.1f}°")
                print(f" Interpretation: {'extended' if right_elbow_angle > 150 else 'folded' if right_elbow_angle < 90 else 'neutral'}")

                # Check if arm is raised: smaller y means higher in the image,
                # so wrist above elbow above shoulder.
                if right_wrist.y < right_elbow.y < right_shoulder.y:
                    print(f" Action: raise_right (arm raised)")

                # Knee angle (hip - knee - ankle)
                left_hip = landmarks[23]
                left_knee = landmarks[25]
                left_ankle = landmarks[27]
                left_knee_angle = calculate_angle(left_hip, left_knee, left_ankle)
                print(f"\nLeft knee angle: {left_knee_angle:.1f}°")
                print(f" Interpretation: {'standing' if left_knee_angle > 160 else 'knee_bend' if left_knee_angle < 120 else 'neutral'}")
            else:
                print("❌ No pose detected")
    finally:
        # Always release the model, even if detection raised.
        pose.close()

    print("\n✅ Pose test completed")
def test_hands(test_image_path: str = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"):
    """
    Test MediaPipe Hands (21 keypoints per hand)

    Creates a Hands model, runs it on one image and, for every detected
    hand, prints key landmarks, per-finger extension flags and a coarse
    gesture label (open_hand, fist, thumbs_up, peace_sign, pointing or
    unknown).

    Args:
        test_image_path: Image to run detection on. Detection is skipped
            (with a message) when the file is missing or cannot be decoded.
    """
    print("\n" + "=" * 60)
    print("Testing MediaPipe Hands")
    print("=" * 60)

    def is_finger_extended(tip, base):
        # A finger counts as extended when its tip is above its base in
        # the image (smaller y), i.e. this assumes an upright hand.
        return tip.y < base.y

    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(
        static_image_mode=True,
        max_num_hands=2,
        min_detection_confidence=0.5,
    )
    print("✅ Hands model created")

    try:
        # cv2.imread returns None (instead of raising) for unreadable files.
        image = cv2.imread(test_image_path) if Path(test_image_path).exists() else None
        if image is None:
            print(f"⚠️ Test image missing or unreadable, skipping detection: {test_image_path}")
        else:
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            results = hands.process(image_rgb)

            if results.multi_hand_landmarks:
                for idx, hand_landmarks in enumerate(results.multi_hand_landmarks):
                    hand_label = results.multi_handedness[idx].classification[0].label
                    print(f"\n✅ Hand {idx+1} detected ({hand_label}): 21 keypoints")
                    landmarks = hand_landmarks.landmark

                    # Key landmarks
                    key_indices = {
                        "wrist": 0,
                        "thumb_tip": 4,
                        "index_tip": 8,
                        "middle_tip": 12,
                        "ring_tip": 16,
                        "pinky_tip": 20,
                    }
                    print(f" Key landmarks:")
                    for name, i in key_indices.items():
                        lm = landmarks[i]
                        print(f" {name} ({i}): x={lm.x:.3f}, y={lm.y:.3f}")

                    # Finger extensions as (tip index, base index) pairs,
                    # ordered thumb, index, middle, ring, pinky.
                    thumb_extended = is_finger_extended(landmarks[4], landmarks[2])
                    index_extended = is_finger_extended(landmarks[8], landmarks[5])
                    middle_extended = is_finger_extended(landmarks[12], landmarks[9])
                    ring_extended = is_finger_extended(landmarks[16], landmarks[13])
                    pinky_extended = is_finger_extended(landmarks[20], landmarks[17])
                    extensions = [thumb_extended, index_extended, middle_extended, ring_extended, pinky_extended]

                    print(f"\n Finger extensions: {['thumb', 'index', 'middle', 'ring', 'pinky']}")
                    print(f" {extensions}")

                    # Detect gesture
                    gesture = "unknown"
                    if all(extensions):
                        gesture = "open_hand"
                    elif not any(extensions):
                        gesture = "fist"
                    elif thumb_extended and not any(extensions[1:]):
                        gesture = "thumbs_up"
                    elif index_extended and middle_extended and not any(extensions[3:]):
                        # Only ring and pinky must be folded here. Checking
                        # extensions[2:] would include the middle finger and
                        # make this branch impossible to reach.
                        gesture = "peace_sign"
                    elif index_extended and not any(extensions[2:]) and not thumb_extended:
                        gesture = "pointing"
                    print(f" Detected gesture: {gesture}")
            else:
                print("❌ No hands detected")
    finally:
        # Always release the model, even if detection raised.
        hands.close()

    print("\n✅ Hands test completed")
def test_holistic(test_image_path: str = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"):
    """
    Test MediaPipe Holistic (Face + Pose + Hands combined)

    Creates a Holistic model, runs it on one image and reports how many
    landmarks were returned for each detected component (face, pose,
    left hand, right hand).

    Args:
        test_image_path: Image to run detection on. Detection is skipped
            (with a message) when the file is missing or cannot be decoded.
    """
    print("\n" + "=" * 60)
    print("Testing MediaPipe Holistic")
    print("=" * 60)

    mp_holistic = mp.solutions.holistic
    holistic = mp_holistic.Holistic(
        static_image_mode=True,
        model_complexity=2,
        enable_segmentation=False,
        refine_face_landmarks=True,
    )
    print("✅ Holistic model created")

    try:
        # cv2.imread returns None (instead of raising) for unreadable files.
        image = cv2.imread(test_image_path) if Path(test_image_path).exists() else None
        if image is None:
            print(f"⚠️ Test image missing or unreadable, skipping detection: {test_image_path}")
        else:
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            results = holistic.process(image_rgb)

            # (result object, label, unit) for every component Holistic reports.
            components = [
                (results.face_landmarks, "Face", "landmarks"),
                (results.pose_landmarks, "Pose", "keypoints"),
                (results.left_hand_landmarks, "Left hand", "keypoints"),
                (results.right_hand_landmarks, "Right hand", "keypoints"),
            ]
            detected_count = 0
            for component, label, unit in components:
                if component:
                    print(f"✅ {label}: {len(component.landmark)} {unit}")
                    detected_count += 1

            if detected_count == 0:
                print("❌ No landmarks detected")
            else:
                print(f"\nTotal detections: {detected_count} components")
    finally:
        # Always release the model, even if detection raised.
        holistic.close()

    print("\n✅ Holistic test completed")
def main():
    """
    Run every MediaPipe module test in sequence.

    Prints the installed MediaPipe version, executes the Face Mesh, Pose,
    Hands and Holistic tests in that order, then lists suggested next steps.
    """
    wide_rule = "=" * 70

    print(wide_rule)
    print("MediaPipe Installation Test")
    print(wide_rule)
    print(f"\nMediaPipe version: {mp.__version__}")
    print()

    # Test all modules
    for run_test in (test_face_mesh, test_pose, test_hands, test_holistic):
        run_test()

    print("\n" + wide_rule)
    print("✅ All MediaPipe tests completed!")
    print(wide_rule)

    print("\nNext steps:")
    next_steps = (
        " 1. Face Mesh: Use for eye/mouth action detection",
        " 2. Pose: Use for arm/leg/feet action detection",
        " 3. Hands: Use for hand gesture detection",
        " 4. Holistic: Use for full-body action detection",
    )
    for step in next_steps:
        print(step)


if __name__ == "__main__":
    main()