feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
This commit is contained in:
877
scripts/utils/body_action_decoder.py
Normal file
877
scripts/utils/body_action_decoder.py
Normal file
@@ -0,0 +1,877 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Body Action Decoder - Extended pose action analysis with body keypoints
|
||||
|
||||
Purpose:
|
||||
1. Decode face pose actions (existing)
|
||||
2. Decode body actions (future MediaPipe Holistic)
|
||||
3. Integrate face + body actions for comprehensive analysis
|
||||
|
||||
Body Keypoints (MediaPipe Holistic):
|
||||
- Face: 468 points (eyes, mouth, nose, etc.)
|
||||
- Pose: 33 points (shoulders, elbows, hands, hips, knees, feet)
|
||||
- Hands: 21 points per hand
|
||||
|
||||
Action Types:
|
||||
- Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head
|
||||
- Eyes: blink, close, wide_open, look_left, look_right
|
||||
- Mouth: open, close, smile, talk, yawn
|
||||
- Arms: raise_left, raise_right, cross_arms, wave
|
||||
- Hands: point, grab, clap, thumbs_up, fist
|
||||
- Legs: stand, sit, walk, run, jump, kick
|
||||
- Feet: tap, stomp, cross
|
||||
|
||||
Architecture:
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Body Action Decoder │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │
|
||||
│ │ Face Actions │ │ Body Actions │ │ Hand Actions │ │
|
||||
│ │ (InsightFace) │ │ (MediaPipe) │ │ (MediaPipe) │ │
|
||||
│ └───────────────┘ └───────────────┘ └───────────────┘ │
|
||||
│ │ │ │ │
|
||||
│ └──────────────────┼──────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌───────▼───────┐ │
|
||||
│ │ Action Merger│ │
|
||||
│ └────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌───────▼───────┐ │
|
||||
│ │ Action Timeline│ │
|
||||
│ └────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
from typing import Dict, List, Optional
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Face Action Definitions (Existing from pose_action_decoder.py)
|
||||
# =============================================================================
|
||||
|
||||
# Lookup table for horizontal face-turn actions.
# Key:   (previous value, current value) of the face pose_angle["angle"] field.
# Value: the action label reported for that transition.
# Transitions that are not listed (including "no change") yield no action.
FACE_TURN_ACTIONS = {
    ("frontal", "three_quarter"): "turn_partial",
    ("frontal", "profile_left"): "turn_left",
    ("frontal", "profile_right"): "turn_right",
    ("three_quarter", "frontal"): "return_frontal",
    ("three_quarter", "profile_left"): "turn_left",
    ("three_quarter", "profile_right"): "turn_right",
    ("profile_left", "frontal"): "turn_to_frontal",
    ("profile_left", "three_quarter"): "turn_to_three_quarter",
    ("profile_left", "profile_right"): "turn_full",
    ("profile_right", "frontal"): "turn_to_frontal",
    ("profile_right", "three_quarter"): "turn_to_three_quarter",
    ("profile_right", "profile_left"): "turn_full",
}
|
||||
|
||||
# Lookup table for vertical (pitch) face actions.
# Key:   (previous value, current value) of the face pose_angle["pitch"] field.
# Value: the action label reported for that transition.
# Direct tilted_up <-> tilted_down transitions are not listed and yield no action.
FACE_PITCH_ACTIONS = {
    ("neutral", "tilted_up"): "look_up",
    ("neutral", "tilted_down"): "look_down",
    ("tilted_up", "neutral"): "return_neutral",
    ("tilted_down", "neutral"): "return_neutral",
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Eye Action Definitions
|
||||
# =============================================================================
|
||||
|
||||
# Catalogue of eye actions.
# Each entry carries a display "description" (Chinese, user-facing), a
# human-readable "pattern" describing the intended detection rule, and
# optionally "min_frames" / "max_frames" duration bounds.
# NOTE(review): the patterns are descriptive text only; in the code visible
# here analyze_eye_actions applies its own hard-coded thresholds.
EYE_ACTIONS = {
    "blink": {
        "description": "眨眼",
        "pattern": "eye_aspect_ratio drops < 0.2 for 1-3 frames",
        "min_frames": 1,
        "max_frames": 3,
    },
    "close": {
        "description": "闭眼",
        "pattern": "eye_aspect_ratio < 0.15 for > 10 frames",
        "min_frames": 10,
    },
    "wide_open": {
        "description": "睁大眼",
        "pattern": "eye_aspect_ratio > 0.4",
    },
    "look_left": {
        "description": "向左看",
        "pattern": "iris_position_x < 0.3",
    },
    "look_right": {
        "description": "向右看",
        "pattern": "iris_position_x > 0.7",
    },
    "squint": {
        "description": "眯眼",
        "pattern": "eye_aspect_ratio 0.15-0.25",
    },
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Mouth Action Definitions
|
||||
# =============================================================================
|
||||
|
||||
# Catalogue of mouth actions.
# Each entry carries a display "description" (Chinese, user-facing), a
# human-readable "pattern" describing the intended detection rule, and
# optionally a "min_frames" duration bound.
# NOTE(review): the patterns are descriptive text only; in the code visible
# here analyze_mouth_actions applies its own hard-coded thresholds.
MOUTH_ACTIONS = {
    "open": {
        "description": "张嘴",
        "pattern": "mouth_aspect_ratio > 0.5",
    },
    "close": {
        "description": "闭嘴",
        "pattern": "mouth_aspect_ratio < 0.2",
    },
    "smile": {
        "description": "微笑",
        "pattern": "mouth_corner_distance > threshold",
    },
    "talk": {
        "description": "说话",
        "pattern": "mouth_aspect_ratio oscillating 0.3-0.6",
        "min_frames": 10,
    },
    "yawn": {
        "description": "打哈欠",
        "pattern": "mouth_aspect_ratio > 0.7 for > 20 frames",
        "min_frames": 20,
    },
    "pout": {
        "description": "嘟嘴",
        "pattern": "lip_distance > threshold",
    },
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Arm Action Definitions
|
||||
# =============================================================================
|
||||
|
||||
# Catalogue of arm actions.
# Each entry carries a display "description" (Chinese, user-facing), a
# human-readable "pattern" describing the intended detection rule, and
# optionally "min_frames" / "max_frames" duration bounds.
# The y comparisons assume image coordinates in which y grows downward, so a
# raised arm has wrist_y < elbow_y < shoulder_y.
ARM_ACTIONS = {
    "raise_left": {
        "description": "举起左手",
        "pattern": "left_shoulder_y > left_elbow_y > left_wrist_y",
    },
    "raise_right": {
        "description": "举起右手",
        "pattern": "right_shoulder_y > right_elbow_y > right_wrist_y",
    },
    "raise_both": {
        "description": "双手举起",
        "pattern": "both arms raised",
    },
    "cross_arms": {
        "description": "双手交叉",
        # The previous text repeated the same wrist comparison twice; this
        # states the rule that analyze_arm_actions actually evaluates.
        "pattern": "left_wrist_x > right_wrist_x AND right_wrist_x < left_shoulder_x",
    },
    "wave": {
        "description": "挥手",
        "pattern": "wrist_y oscillating ±20px for 5-15 frames",
        "min_frames": 5,
        "max_frames": 15,
    },
    "extend_left": {
        "description": "伸展左臂",
        "pattern": "left_elbow_angle > 150°",
    },
    "extend_right": {
        "description": "伸展右臂",
        "pattern": "right_elbow_angle > 150°",
    },
    "fold_left": {
        "description": "弯曲左臂",
        "pattern": "left_elbow_angle < 90°",
    },
    "fold_right": {
        "description": "弯曲右臂",
        "pattern": "right_elbow_angle < 90°",
    },
    "point": {
        "description": "指向",
        "pattern": "index_finger extended, other fingers folded",
    },
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Hand Action Definitions
|
||||
# =============================================================================
|
||||
|
||||
# Catalogue of hand actions / gestures.
# Each entry carries a display "description" (Chinese, user-facing), a
# human-readable "pattern" describing the intended detection rule, and
# optionally "min_frames" / "max_frames" duration bounds.
# NOTE(review): the patterns are descriptive text only; in the code visible
# here analyze_hand_actions applies its own distance thresholds and emits
# labels suffixed with the hand side (e.g. "fist_left").
HAND_ACTIONS = {
    "grab": {
        "description": "抓取",
        "pattern": "fingers folded, thumb opposing",
    },
    "open": {
        "description": "张开手",
        "pattern": "all fingers extended",
    },
    "clap": {
        "description": "拍手",
        "pattern": "hands together then apart (velocity pattern)",
        "min_frames": 3,
        "max_frames": 10,
    },
    "thumbs_up": {
        "description": "点赞",
        "pattern": "thumb extended upward, other fingers folded",
    },
    "fist": {
        "description": "握拳",
        "pattern": "all fingers folded into palm",
    },
    "peace": {
        "description": "剪刀手",
        "pattern": "index and middle fingers extended",
    },
    "ok": {
        "description": "OK 手势",
        "pattern": "thumb and index finger touching",
    },
    "touch_face": {
        "description": "摸脸",
        "pattern": "hand near face region",
    },
    "touch_hair": {
        "description": "摸头发",
        "pattern": "hand above head region",
    },
    "pocket_left": {
        "description": "左手插兜",
        "pattern": "left_hand in hip region",
    },
    "pocket_right": {
        "description": "右手插兜",
        "pattern": "right_hand in hip region",
    },
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Leg Action Definitions
|
||||
# =============================================================================
|
||||
|
||||
# Catalogue of leg actions / postures.
# Each entry carries a display "description" (Chinese, user-facing), a
# human-readable "pattern" describing the intended detection rule, and
# optionally "min_frames" / "max_frames" duration bounds.
# The y comparisons assume image coordinates in which y grows downward, so a
# standing leg has hip_y < knee_y < ankle_y.
LEG_ACTIONS = {
    "stand": {
        "description": "站立",
        "pattern": "hip_y < knee_y < ankle_y, vertical alignment",
    },
    "sit": {
        "description": "坐姿",
        "pattern": "hip_y ≈ knee_y, thigh horizontal",
    },
    "walk": {
        "description": "行走",
        "pattern": "hip-knee-ankle oscillating, stride pattern",
        "min_frames": 10,
    },
    "run": {
        "description": "奔跑",
        "pattern": "fast oscillating, knee_bend > 60°",
        "min_frames": 10,
    },
    "jump": {
        "description": "跳跃",
        "pattern": "all keypoints moving upward then landing",
        "min_frames": 5,
        "max_frames": 20,
    },
    "kick": {
        "description": "踢腿",
        "pattern": "one leg extended forward rapidly",
        "min_frames": 3,
        "max_frames": 15,
    },
    "cross_left": {
        "description": "左腿交叉",
        "pattern": "left_ankle_x > right_ankle_x",
    },
    "cross_right": {
        "description": "右腿交叉",
        "pattern": "right_ankle_x > left_ankle_x",
    },
    "knee_bend": {
        "description": "弯膝",
        "pattern": "knee_angle < 120°",
    },
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Feet Action Definitions
|
||||
# =============================================================================
|
||||
|
||||
# Catalogue of feet actions.
# Each entry carries a display "description" (Chinese, user-facing), a
# human-readable "pattern" describing the intended detection rule, and
# optionally "min_frames" / "max_frames" duration bounds.
# NOTE(review): no analyzer visible in this file produces feet actions yet;
# the "feet" result list of decode_body_actions stays empty.
FEET_ACTIONS = {
    "tap": {
        "description": "轻踏",
        "pattern": "ankle_y oscillating ±10px",
        "min_frames": 3,
        "max_frames": 15,
    },
    "stomp": {
        "description": "重踏",
        "pattern": "ankle_y large downward movement",
        "min_frames": 3,
    },
    "cross": {
        "description": "交叉脚",
        "pattern": "feet_x overlapping",
    },
    "point_left": {
        "description": "左脚前伸",
        "pattern": "left_ankle_y < right_ankle_y",
    },
    "point_right": {
        "description": "右脚前伸",
        "pattern": "right_ankle_y < left_ankle_y",
    },
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Combined Actions (Face + Body)
|
||||
# =============================================================================
|
||||
|
||||
# Catalogue of composite (face + body) actions.
# decode_body_actions reports an entry when every label in its "components"
# list appears, by exact string match, among the action labels detected for
# the frame. "description" is the Chinese display text; "pattern" is a
# human-readable summary only.
# NOTE(review): several component labels (e.g. "open_mouth", "frown",
# "frontal_stable", "nod_head", "shake_head", "wave", "touch_face",
# "raise_both", un-suffixed "point") are not emitted by any analyzer visible
# in this file, so the entries that depend on them cannot currently fire —
# confirm whether those detectors live elsewhere or are still to be written.
COMBINED_ACTIONS = {
    "thinking": {
        "description": "思考姿势",
        "components": ["touch_face", "look_down"],
        "pattern": "hand near chin + head tilted down",
    },
    "listening": {
        "description": "倾听姿势",
        "components": ["turn_partial", "open_mouth"],
        "pattern": "slight turn + mouth slightly open",
    },
    "nodding_agreement": {
        "description": "点头同意",
        "components": ["nod_head", "smile"],
        "pattern": "head nod + smile",
    },
    "shaking_disagreement": {
        "description": "摇头不同意",
        "components": ["shake_head", "frown"],
        "pattern": "shake head + frown",
    },
    "waving_greeting": {
        "description": "挥手打招呼",
        "components": ["wave", "smile"],
        "pattern": "wave hand + smile",
    },
    "crossing_arms_defensive": {
        "description": "双手交叉防御",
        "components": ["cross_arms", "frontal_stable"],
        "pattern": "cross arms + frontal pose",
    },
    "pointing_explaining": {
        "description": "指向解释",
        "components": ["point", "turn_partial"],
        "pattern": "pointing + slight turn",
    },
    "stretching": {
        "description": "伸展",
        "components": ["raise_both", "look_up"],
        "pattern": "raise arms + look up",
    },
    "sitting_relaxed": {
        "description": "放松坐姿",
        "components": ["sit", "cross_arms"],
        "pattern": "sit + cross arms",
    },
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Analysis Functions
|
||||
# =============================================================================
|
||||
|
||||
def analyze_eye_actions(eye_landmarks: List, prev_eye_landmarks: List = None) -> List[Dict]:
    """
    Detect open/closed state of the left eye from its contour landmarks.

    The eye aspect ratio (EAR) is computed from six points p1..p6 as
    EAR = (|p2 - p6| + |p3 - p5|) / (2 * |p1 - p4|), i.e. the two vertical
    openings divided by twice the horizontal eye width.

    Args:
        eye_landmarks: Sequence of landmark coordinates. Only the first six
            entries are used, as the left-eye points p1..p6; any further
            entries are ignored.
        prev_eye_landmarks: Previous-frame landmarks. Accepted for interface
            compatibility but not used by the current implementation.

    Returns:
        A list holding at most one dict with keys "action", "description"
        and "confidence" (a float in [0, 1]):
        "close_left" when EAR < 0.15, "wide_open_left" when EAR > 0.4.
        Empty when fewer than six landmarks are given, when the eye width is
        zero, or when the EAR lies between the two thresholds.
    """
    actions = []

    if not eye_landmarks or len(eye_landmarks) < 6:
        return actions

    p1, p2, p3, p4, p5, p6 = (np.asarray(pt, dtype=float) for pt in eye_landmarks[:6])

    eye_width = np.linalg.norm(p1 - p4)
    if eye_width == 0:
        # Coincident eye corners give no usable ratio. Treating this as
        # EAR = 0 would report a fully confident "closed eye" for what is
        # really an invalid detection, so report nothing instead.
        return actions

    left_ear = float((np.linalg.norm(p2 - p6) + np.linalg.norm(p3 - p5)) / (2 * eye_width))

    if left_ear < 0.15:
        actions.append({"action": "close_left", "description": "闭左眼", "confidence": 1.0 - left_ear})
    elif left_ear > 0.4:
        # EAR is unbounded above; cap it so "confidence" stays a valid score.
        actions.append({"action": "wide_open_left", "description": "睁大左眼", "confidence": min(left_ear, 1.0)})

    return actions
|
||||
|
||||
|
||||
def analyze_mouth_actions(mouth_landmarks: List) -> List[Dict]:
    """
    Classify the mouth state from four mouth landmarks.

    The mouth aspect ratio (MAR) is the upper-lip/lower-lip distance divided
    by the distance between the two mouth corners.

    Args:
        mouth_landmarks: Sequence of landmark coordinates whose first four
            entries are, in order: upper lip, lower lip, left corner, right
            corner. Each point needs at least an x and a y component.

    Returns:
        A list holding at most one dict:
        "yawn" (MAR > 0.7), "open" (MAR > 0.5) or "close" (MAR < 0.2), each
        with the measured "mar"; otherwise "smile" with "corner_distance"
        when the summed vertical offset of both corners from the upper lip
        exceeds 10 (in the landmarks' own units). Empty when fewer than four
        landmarks are given, when the mouth width is zero, or when no rule
        matches.
    """
    # Summed vertical corner offset above which a neutral mouth counts as a smile.
    smile_corner_threshold = 10

    actions = []

    if not mouth_landmarks or len(mouth_landmarks) < 4:
        return actions

    upper_lip, lower_lip, left_corner, right_corner = (
        np.asarray(pt, dtype=float) for pt in mouth_landmarks[:4]
    )

    mouth_width = np.linalg.norm(left_corner - right_corner)
    if mouth_width == 0:
        # Coincident corners give no usable ratio. Falling back to MAR = 0
        # would misreport an invalid detection as a closed mouth.
        return actions

    mar = float(np.linalg.norm(upper_lip - lower_lip) / mouth_width)

    if mar > 0.7:
        actions.append({"action": "yawn", "description": "打哈欠", "mar": mar})
    elif mar > 0.5:
        actions.append({"action": "open", "description": "张嘴", "mar": mar})
    elif mar < 0.2:
        actions.append({"action": "close", "description": "闭嘴", "mar": mar})
    else:
        # Neutral opening: look at how far the corners sit from the upper lip.
        corner_distance = float(
            abs(left_corner[1] - upper_lip[1]) + abs(right_corner[1] - upper_lip[1])
        )
        if corner_distance > smile_corner_threshold:
            actions.append({"action": "smile", "description": "微笑", "corner_distance": corner_distance})

    return actions
|
||||
|
||||
|
||||
def _elbow_angle_deg(shoulder, elbow, wrist) -> Optional[float]:
    """Return the interior elbow angle in degrees (180 = straight arm), or None if a segment has zero length."""
    upper_arm = np.asarray(shoulder, dtype=float) - np.asarray(elbow, dtype=float)
    forearm = np.asarray(wrist, dtype=float) - np.asarray(elbow, dtype=float)

    scale = np.linalg.norm(upper_arm) * np.linalg.norm(forearm)
    if scale == 0:
        return None

    # Rounding can push the cosine marginally outside [-1, 1], which would
    # make arccos return NaN for (nearly) collinear points.
    cosine = np.clip(np.dot(upper_arm, forearm) / scale, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def analyze_arm_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Detect arm actions from named pose keypoints.

    Args:
        pose_keypoints: Dict that may contain "left_shoulder", "left_elbow",
            "left_wrist", "right_shoulder", "right_elbow" and "right_wrist",
            each an (x, y, ...) coordinate sequence. Missing or empty
            entries are tolerated. The checks assume image coordinates in
            which y grows downward.

    Returns:
        List of action dicts in the order: left-arm actions, right-arm
        actions, then "cross_arms". Per arm (all three joints required):
        "raise_<side>" when wrist_y < elbow_y < shoulder_y, then
        "extend_<side>" when the interior elbow angle exceeds 150 degrees or
        "fold_<side>" when it is below 90 degrees; these carry the measured
        "angle". "cross_arms" is reported when left_wrist_x > right_wrist_x
        and right_wrist_x < left_shoulder_x.
    """
    actions = []

    arm_specs = (
        ("left", "举起左手", "伸展左臂", "弯曲左臂"),
        ("right", "举起右手", "伸展右臂", "弯曲右臂"),
    )

    for side, raise_desc, extend_desc, fold_desc in arm_specs:
        shoulder = pose_keypoints.get(f"{side}_shoulder")
        elbow = pose_keypoints.get(f"{side}_elbow")
        wrist = pose_keypoints.get(f"{side}_wrist")

        if not (shoulder and elbow and wrist):
            continue

        # Interior joint angle: measured at the elbow between the directions
        # towards the shoulder and towards the wrist, so a straight arm is
        # ~180 degrees and a fully folded arm is ~0 degrees. (Measuring
        # between shoulder->elbow and elbow->wrist instead yields the
        # supplementary angle and swaps "extend" with "fold".)
        elbow_angle_deg = _elbow_angle_deg(shoulder, elbow, wrist)
        if elbow_angle_deg is None:
            # Coincident joints: no angle, and the strict y-ordering needed
            # for "raise" cannot hold either.
            continue

        if wrist[1] < elbow[1] < shoulder[1]:  # Raised (y decreases upward)
            actions.append({"action": f"raise_{side}", "description": raise_desc, "angle": elbow_angle_deg})

        if elbow_angle_deg > 150:
            actions.append({"action": f"extend_{side}", "description": extend_desc, "angle": elbow_angle_deg})
        elif elbow_angle_deg < 90:
            actions.append({"action": f"fold_{side}", "description": fold_desc, "angle": elbow_angle_deg})

    # Cross-arms detection needs the left shoulder as the reference x, so it
    # is skipped (rather than raising TypeError) when that keypoint is missing.
    left_wrist = pose_keypoints.get("left_wrist")
    right_wrist = pose_keypoints.get("right_wrist")
    left_shoulder = pose_keypoints.get("left_shoulder")

    if left_wrist and right_wrist and left_shoulder:
        if left_wrist[0] > right_wrist[0] and right_wrist[0] < left_shoulder[0]:
            actions.append({"action": "cross_arms", "description": "双手交叉"})

    return actions
|
||||
|
||||
|
||||
def analyze_hand_actions(hand_keypoints: List, hand_type: str = "right") -> List[Dict]:
    """
    Detect hand gestures from 21 hand keypoints.

    Keypoint index layout used here:
        0: wrist
        1-4: thumb (CMC, MCP, IP, TIP)
        5-8: index finger (MCP, PIP, DIP, TIP)
        9-12: middle finger
        13-16: ring finger
        17-20: pinky

    A finger's "extension" is the distance from its tip to a base joint
    (thumb: index 2, other fingers: their MCP joint). All thresholds are in
    the keypoints' own units.
    NOTE(review): the 30/40/50 thresholds look like pixel distances — confirm
    the caller does not pass normalized [0, 1] coordinates.

    Args:
        hand_keypoints: Sequence of at least 21 coordinate sequences.
        hand_type: Label inserted into the action names and descriptions,
            e.g. "left" or "right".

    Returns:
        List of action dicts ("action", "description"). The rules are
        evaluated independently, so several may fire for one hand:
        "open_<hand>" (mean extension > 50) or "fist_<hand>" (mean < 30),
        then "thumbs_up_<hand>", "peace_<hand>" and "point_<hand>".
        Empty when fewer than 21 keypoints are given.
    """
    extended_threshold = 40   # a single finger counts as extended above this
    folded_threshold = 30     # a finger group counts as folded below this mean
    open_hand_threshold = 50  # mean extension above which the hand is open

    actions = []

    if not hand_keypoints or len(hand_keypoints) < 21:
        return actions

    # (tip index, base index) per finger: thumb, index, middle, ring, pinky.
    # The thumb is measured from index 2, which is the thumb MCP joint.
    finger_joints = ((4, 2), (8, 5), (12, 9), (16, 13), (20, 17))

    extensions = [
        np.linalg.norm(np.array(hand_keypoints[tip]) - np.array(hand_keypoints[base]))
        for tip, base in finger_joints
    ]
    thumb_ext, index_ext, middle_ext, ring_ext, pinky_ext = extensions

    avg_extension = np.mean(extensions)

    if avg_extension > open_hand_threshold:  # Open hand
        actions.append({"action": f"open_{hand_type}", "description": f"张开{hand_type}手"})
    elif avg_extension < folded_threshold:  # Closed/fist
        actions.append({"action": f"fist_{hand_type}", "description": f"握{hand_type}拳"})

    # Thumbs up: thumb extended, the other four folded on average.
    if thumb_ext > extended_threshold and np.mean(extensions[1:]) < folded_threshold:
        actions.append({"action": f"thumbs_up_{hand_type}", "description": f"{hand_type}手点赞"})

    # Peace sign: index and middle extended, ring and pinky folded on average.
    if (
        index_ext > extended_threshold
        and middle_ext > extended_threshold
        and np.mean(extensions[3:]) < folded_threshold
    ):
        actions.append({"action": f"peace_{hand_type}", "description": f"{hand_type}手剪刀手"})

    # Pointing: index extended, the remaining four folded on average.
    if (
        index_ext > extended_threshold
        and np.mean([thumb_ext, middle_ext, ring_ext, pinky_ext]) < folded_threshold
    ):
        actions.append({"action": f"point_{hand_type}", "description": f"{hand_type}手指向"})

    return actions
|
||||
|
||||
|
||||
def _knee_angle_deg(hip, knee, ankle) -> Optional[float]:
    """Return the interior knee angle in degrees (180 = straight leg), or None if a segment has zero length."""
    thigh = np.asarray(hip, dtype=float) - np.asarray(knee, dtype=float)
    shin = np.asarray(ankle, dtype=float) - np.asarray(knee, dtype=float)

    scale = np.linalg.norm(thigh) * np.linalg.norm(shin)
    if scale == 0:
        return None

    # Rounding can push the cosine marginally outside [-1, 1], which would
    # make arccos return NaN for (nearly) collinear points.
    cosine = np.clip(np.dot(thigh, shin) / scale, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def analyze_leg_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Detect leg actions from named pose keypoints.

    Args:
        pose_keypoints: Dict that may contain "left_hip", "left_knee",
            "left_ankle", "right_hip", "right_knee" and "right_ankle", each
            an (x, y, ...) coordinate sequence. Missing or empty entries are
            tolerated. The checks assume image coordinates in which y grows
            downward.

    Returns:
        List of action dicts in the order: left-leg actions, right-leg
        actions, then "sit". Per leg (all three joints required):
        "knee_bend_<side>" (with "angle") when the interior knee angle is
        below 120 degrees, and "stand_<side>" when hip_y < knee_y < ankle_y.
        "sit" is reported when both hips and both knees are present and the
        mean hip and mean knee heights differ by less than 30 units.
    """
    actions = []

    leg_specs = (
        ("left", "弯左膝", "左腿站立"),
        ("right", "弯右膝", "右腿站立"),
    )

    for side, bend_desc, stand_desc in leg_specs:
        hip = pose_keypoints.get(f"{side}_hip")
        knee = pose_keypoints.get(f"{side}_knee")
        ankle = pose_keypoints.get(f"{side}_ankle")

        if not (hip and knee and ankle):
            continue

        # Interior joint angle: measured at the knee between the directions
        # towards the hip and towards the ankle, so a straight leg is ~180
        # degrees. (Measuring between hip->knee and knee->ankle instead
        # yields the supplementary angle and flags straight legs as bent.)
        knee_angle_deg = _knee_angle_deg(hip, knee, ankle)
        if knee_angle_deg is None:
            # Coincident joints: no angle, and the strict y-ordering needed
            # for "stand" cannot hold either.
            continue

        if knee_angle_deg < 120:
            actions.append({"action": f"knee_bend_{side}", "description": bend_desc, "angle": knee_angle_deg})

        if hip[1] < knee[1] < ankle[1]:  # Vertical alignment (y increases downward)
            actions.append({"action": f"stand_{side}", "description": stand_desc})

    # Sit detection (hip ≈ knee height), using the mean of both sides.
    left_hip = pose_keypoints.get("left_hip")
    left_knee = pose_keypoints.get("left_knee")
    right_hip = pose_keypoints.get("right_hip")
    right_knee = pose_keypoints.get("right_knee")

    if left_hip and left_knee and right_hip and right_knee:
        hip_avg_y = (left_hip[1] + right_hip[1]) / 2
        knee_avg_y = (left_knee[1] + right_knee[1]) / 2

        if abs(hip_avg_y - knee_avg_y) < 30:  # Hip and knee at similar height
            actions.append({"action": "sit", "description": "坐姿"})

    return actions
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Decoder Function
|
||||
# =============================================================================
|
||||
|
||||
def decode_body_actions(
    pose_data: Dict,
    face_data: Dict = None,
    hand_data: Dict = None,
) -> Dict:
    """
    Decode all body actions from the available data sources.

    Args:
        pose_data: Pose data; its "keypoints" entry (a dict of named joint
            coordinates) feeds the arm and leg analyzers. May be None/empty.
        face_data: Face data. "pose_angle" and "prev_pose_angle" (dicts with
            "angle" and "pitch") drive the face-turn and pitch lookups;
            "eye_landmarks" / "prev_eye_landmarks" and "mouth_landmarks"
            feed the eye and mouth analyzers. May be None/empty.
        hand_data: Hand data with optional "left_hand" / "right_hand"
            keypoint lists. May be None/empty.

    Returns:
        Dict with the keys "face", "eyes", "mouth", "arms", "hands", "legs",
        "feet" and "combined", each a list of action dicts. "feet" is always
        empty here. "combined" holds every COMBINED_ACTIONS entry whose
        components are all among the detected action labels.
    """
    all_actions = {
        "face": [],
        "eyes": [],
        "mouth": [],
        "arms": [],
        "hands": [],
        "legs": [],
        "feet": [],
        "combined": [],
    }

    if face_data:
        # 1. Face actions: look up the (previous, current) state transition.
        pose_angle = face_data.get("pose_angle", {})
        prev_pose_angle = face_data.get("prev_pose_angle", {})

        if pose_angle and prev_pose_angle:
            angle = pose_angle.get("angle", "unknown")
            prev_angle = prev_pose_angle.get("angle", "unknown")

            turn_key = (prev_angle, angle)
            if turn_key in FACE_TURN_ACTIONS:
                all_actions["face"].append({
                    "action": FACE_TURN_ACTIONS[turn_key],
                    "description": f"Face: {prev_angle} → {angle}",
                })

            pitch = pose_angle.get("pitch", "neutral")
            prev_pitch = prev_pose_angle.get("pitch", "neutral")

            pitch_key = (prev_pitch, pitch)
            if pitch_key in FACE_PITCH_ACTIONS:
                all_actions["face"].append({
                    "action": FACE_PITCH_ACTIONS[pitch_key],
                    "description": f"Pitch: {prev_pitch} → {pitch}",
                })

        # 2. Eye actions (if eye landmarks available)
        eye_landmarks = face_data.get("eye_landmarks")
        if eye_landmarks:
            all_actions["eyes"] = analyze_eye_actions(
                eye_landmarks,
                face_data.get("prev_eye_landmarks"),
            )

        # 3. Mouth actions (if mouth landmarks available)
        mouth_landmarks = face_data.get("mouth_landmarks")
        if mouth_landmarks:
            all_actions["mouth"] = analyze_mouth_actions(mouth_landmarks)

    # 4./6. Arm and leg actions share the same pose keypoints.
    keypoints = pose_data.get("keypoints") if pose_data else None
    if keypoints:
        all_actions["arms"] = analyze_arm_actions(keypoints)
        all_actions["legs"] = analyze_leg_actions(keypoints)

    # 5. Hand actions, left hand first.
    if hand_data:
        for hand_type in ("left", "right"):
            hand_keypoints = hand_data.get(f"{hand_type}_hand")
            if hand_keypoints:
                all_actions["hands"].extend(analyze_hand_actions(hand_keypoints, hand_type))

    # 7. Combined actions: every component label must have been detected.
    detected_actions = {
        entry["action"]
        for actions in all_actions.values()
        for entry in actions
    }

    for combined_name, combined_def in COMBINED_ACTIONS.items():
        components = combined_def["components"]
        if all(comp in detected_actions for comp in components):
            all_actions["combined"].append({
                "action": combined_name,
                "description": combined_def["description"],
                "components": components,
            })

    return all_actions
|
||||
|
||||
|
||||
def print_body_action_report(action_data: Dict) -> None:
    """
    Print a human-readable report of detected actions to stdout.

    Args:
        action_data: Dict mapping a category name to a list of action dicts,
            as produced by decode_body_actions. Each action dict must have an
            "action" key; "description" is optional and falls back to the
            action label. Only the categories face, eyes, mouth, arms, hands,
            legs, feet and combined are printed, in that order; empty or
            missing categories are skipped.
    """
    separator = "=" * 70

    print("\n" + separator)
    print("Body Action Decoder Report")
    print(separator)

    categories = ["face", "eyes", "mouth", "arms", "hands", "legs", "feet", "combined"]

    for category in categories:
        actions = action_data.get(category, [])
        if not actions:
            continue

        print(f"\n{category.upper()} Actions ({len(actions)}):")
        for act in actions:
            label = act["action"]
            desc = act.get("description", label)
            print(f" - {label}: {desc}")

    print("\n" + separator)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Entry Point
|
||||
# =============================================================================
|
||||
|
||||
def _read_json_file(path: str):
    """Load and return the JSON document stored at *path* (read as UTF-8)."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main():
    """
    Command-line entry point.

    Reads the optional --pose-json / --face-json / --hand-json inputs, decodes
    the actions, prints the report and, when --output-json is given, writes
    the result as UTF-8 JSON. With no input at all, prints a summary of the
    supported action categories instead.
    """
    parser = argparse.ArgumentParser(description="Decode body actions from pose data")
    parser.add_argument("--pose-json", help="Path to pose.json (MediaPipe output)")
    parser.add_argument("--face-json", help="Path to face.json (InsightFace output)")
    parser.add_argument("--hand-json", help="Path to hand.json (MediaPipe Hand output)")
    parser.add_argument("--output-json", help="Output action data JSON")
    parser.add_argument("--frame", type=int, help="Analyze specific frame")
    args = parser.parse_args()

    print("=" * 70)
    print("Body Action Decoder")
    print("=" * 70)

    if args.frame is not None:
        # The option is parsed but nothing below selects a frame; say so
        # instead of silently analyzing the whole input.
        print(
            "Warning: --frame is accepted but not applied; the full input is analyzed",
            file=sys.stderr,
        )

    # Load data
    pose_data = _read_json_file(args.pose_json) if args.pose_json else None
    face_data = _read_json_file(args.face_json) if args.face_json else None
    hand_data = _read_json_file(args.hand_json) if args.hand_json else None

    if not (pose_data or face_data or hand_data):
        print("\n⚠️ No input data provided")
        print("\nAction Categories:")
        print(" - Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head")
        print(" - Eyes: blink, close, wide_open, look_left, look_right")
        print(" - Mouth: open, close, smile, talk, yawn")
        print(" - Arms: raise_left, raise_right, cross_arms, wave, point")
        print(" - Hands: grab, open, clap, thumbs_up, fist, peace, ok")
        print(" - Legs: stand, sit, walk, run, jump, kick")
        print(" - Feet: tap, stomp, cross, point")
        print(" - Combined: thinking, listening, nodding_agreement, waving_greeting")
        return

    # Analyze
    action_data = decode_body_actions(
        pose_data=pose_data,
        face_data=face_data,
        hand_data=hand_data,
    )

    print_body_action_report(action_data)

    if args.output_json:
        # The descriptions are Chinese text: write UTF-8 and keep the
        # characters readable instead of \uXXXX escapes.
        with open(args.output_json, "w", encoding="utf-8") as f:
            json.dump(action_data, f, indent=2, ensure_ascii=False)
        print(f"\n✅ Output saved to: {args.output_json}")


if __name__ == "__main__":
    main()
|
||||
201
scripts/utils/face_trace_visualizer.py
Normal file
201
scripts/utils/face_trace_visualizer.py
Normal file
@@ -0,0 +1,201 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Face Trace Visualizer - Visualize face tracking paths
|
||||
|
||||
Output:
|
||||
1. Trace path visualization (matplotlib)
|
||||
2. Trace statistics CSV
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.patches import Rectangle
|
||||
from collections import defaultdict
|
||||
from typing import Dict
|
||||
|
||||
|
||||
def visualize_traces(face_data: Dict, output_path: str = None) -> None:
    """
    Visualize face trace paths as a 2x2 grid of plots.

    Panels: face-center X per frame, face-center Y per frame, detection
    confidence per frame, and a bar chart of pose counts per trace.

    Args:
        face_data: Parsed trace JSON. "frames" is only checked for being
            non-empty; "traces" maps trace-id strings to trace dicts whose
            "path" entries provide "frame", "bbox" (x/y/width/height),
            "confidence" and "pose_angle".
        output_path: If given, the figure is saved there (150 dpi) and then
            closed; otherwise it is shown interactively.
    """
    frames = face_data.get("frames", {})
    traces = face_data.get("traces", {})

    if not frames or not traces:
        print("No frames or traces found")
        return

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    ax_x, ax_y = axes[0, 0], axes[0, 1]
    ax_conf, ax_pose = axes[1, 0], axes[1, 1]

    # One colour per trace, assigned in ascending trace-id order below
    colors = plt.cm.tab10(np.linspace(0, 1, len(traces)))

    # Flatten each trace's path into parallel per-appearance series
    trace_data = {}
    for trace_id_str, trace in traces.items():
        path = trace.get("path", [])
        trace_data[int(trace_id_str)] = {
            "frames": [p["frame"] for p in path],
            "x": [p["bbox"]["x"] + p["bbox"]["width"] / 2 for p in path],
            "y": [p["bbox"]["y"] + p["bbox"]["height"] / 2 for p in path],
            "confidence": [p["confidence"] for p in path],
            "pose": [p["pose_angle"] for p in path],
        }

    trace_ids = sorted(trace_data)

    for trace_id, color in zip(trace_ids, colors):
        data = trace_data[trace_id]
        label = f"Trace {trace_id}"
        for ax, series in ((ax_x, "x"), (ax_y, "y"), (ax_conf, "confidence")):
            ax.plot(data["frames"], data[series], color=color, label=label, linewidth=2)
            ax.scatter(data["frames"], data[series], color=color, s=30)

    for ax, ylabel, title in (
        (ax_x, "X Position (center)", "Face X Position Over Time"),
        (ax_y, "Y Position (center)", "Face Y Position Over Time"),
        (ax_conf, "Detection Confidence", "Face Detection Confidence Over Time"),
    ):
        ax.set_xlabel("Frame Number")
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    pose_colors = {
        "frontal": "green",
        "three_quarter": "blue",
        "profile_left": "orange",
        "profile_right": "red",
        "unknown": "gray",
    }

    # Bars are coloured by pose (not by trace); each bar is one (trace, pose) pair
    for trace_id in trace_ids:
        pose_counts = defaultdict(int)
        for pose in trace_data[trace_id]["pose"]:
            pose_counts[pose] += 1

        ax_pose.bar(
            [f"Trace {trace_id}\n{pose}" for pose in pose_counts],
            list(pose_counts.values()),
            color=[pose_colors.get(pose, "gray") for pose in pose_counts],
            alpha=0.7,
            label=f"Trace {trace_id}",
        )

    ax_pose.set_xlabel("Trace / Pose")
    ax_pose.set_ylabel("Count")
    ax_pose.set_title("Pose Distribution by Trace")
    ax_pose.tick_params(axis='x', rotation=45)

    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=150, bbox_inches="tight")
        # Release the figure so repeated calls do not accumulate open figures
        plt.close(fig)
        print(f"\n✅ Visualization saved to: {output_path}")
    else:
        plt.show()
|
||||
|
||||
|
||||
def export_trace_csv(face_data: Dict, output_path: str) -> None:
    """
    Write one CSV row of summary statistics per trace.

    Args:
        face_data: Parsed trace JSON; reads "traces", whose keys must be
            integer-like strings and whose values carry the summary fields
            listed below plus an optional "pose_angles" list.
        output_path: Destination CSV file (overwritten).
    """
    import csv

    stat_fields = (
        "trace_id",
        "start_frame",
        "end_frame",
        "duration_frames",
        "duration_seconds",
        "total_appearances",
        "avg_confidence",
    )
    # Column order of the per-pose counters, as emitted in the header
    pose_fields = ("three_quarter", "profile_right", "profile_left", "frontal")

    traces = face_data.get("traces", {})

    with open(output_path, "w", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(list(stat_fields) + ["pose_" + name for name in pose_fields])

        # Rows are ordered by numeric trace id, not by string order
        for _key, record in sorted(traces.items(), key=lambda item: int(item[0])):
            tally = {}
            for angle in record.get("pose_angles", []):
                tally[angle] = tally.get(angle, 0) + 1

            row = [record[name] for name in stat_fields]
            row.extend(tally.get(name, 0) for name in pose_fields)
            out.writerow(row)

    print(f"\n✅ CSV exported to: {output_path}")
|
||||
|
||||
|
||||
def main():
    """
    CLI entry point: load a traced face JSON and plot and/or export it.

    With --output-plot the figure is saved; with --output-csv the statistics
    are exported; with neither, the figure is shown interactively.
    """
    cli = argparse.ArgumentParser(description="Visualize face traces")
    cli.add_argument("--face-json", required=True, help="Path to face_traced.json")
    cli.add_argument("--output-plot", help="Output plot path (PNG)")
    cli.add_argument("--output-csv", help="Output CSV path")
    args = cli.parse_args()

    with open(args.face_json) as f:
        face_data = json.load(f)

    banner = "=" * 60
    print(banner)
    print("Face Trace Visualizer")
    print(banner)
    print(f"\nInput: {args.face_json}")
    print(f"Traces: {len(face_data.get('traces', {}))}")

    wants_plot = bool(args.output_plot)
    wants_csv = bool(args.output_csv)

    if wants_plot:
        visualize_traces(face_data, args.output_plot)

    if wants_csv:
        export_trace_csv(face_data, args.output_csv)

    # No output requested at all: fall back to an interactive window
    if not (wants_plot or wants_csv):
        visualize_traces(face_data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
452
scripts/utils/face_tracker.py
Executable file
452
scripts/utils/face_tracker.py
Executable file
@@ -0,0 +1,452 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Face Tracker - Track faces across frames using embedding similarity and bbox proximity
|
||||
|
||||
Purpose:
|
||||
1. Assign unique trace_id to each face across frames
|
||||
2. Track face movement across adjacent frames
|
||||
3. Output trace statistics (duration, path, confidence)
|
||||
|
||||
Algorithm:
|
||||
1. For first frame: assign new trace_id to each face
|
||||
2. For subsequent frames:
|
||||
- Calculate bbox overlap with previous frame faces
|
||||
- Calculate embedding cosine similarity
|
||||
- Match faces if both conditions met
|
||||
- Assign same trace_id if matched, new trace_id if not
|
||||
|
||||
Matching Conditions:
|
||||
- bbox overlap > 0.3 (IoU)
|
||||
- embedding similarity > 0.7
|
||||
- OR single condition > threshold (fallback)
|
||||
|
||||
Output:
|
||||
- face.json with trace_id added to each face
|
||||
- trace statistics report
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def calculate_bbox_iou(bbox1: Dict, bbox2: Dict) -> float:
    """
    Compute the Intersection over Union (IoU) of two axis-aligned boxes.

    Args:
        bbox1: {"x": ..., "y": ..., "width": ..., "height": ...} with
            (x, y) being the corner from which width/height extend.
        bbox2: Same structure as bbox1.

    Returns:
        Intersection area divided by union area; 0.0 when the boxes do not
        overlap or the union area is not positive.
    """
    left_a, top_a, wid_a, hgt_a = bbox1["x"], bbox1["y"], bbox1["width"], bbox1["height"]
    left_b, top_b, wid_b, hgt_b = bbox2["x"], bbox2["y"], bbox2["width"], bbox2["height"]

    # Edges of the overlap rectangle
    overlap_left = max(left_a, left_b)
    overlap_right = min(left_a + wid_a, left_b + wid_b)
    overlap_top = max(top_a, top_b)
    overlap_bottom = min(top_a + hgt_a, top_b + hgt_b)

    # Touching edges count as no overlap
    if overlap_right <= overlap_left or overlap_bottom <= overlap_top:
        return 0.0

    shared = (overlap_right - overlap_left) * (overlap_bottom - overlap_top)
    combined = wid_a * hgt_a + wid_b * hgt_b - shared

    if combined > 0:
        return shared / combined
    return 0.0
|
||||
|
||||
|
||||
def calculate_bbox_distance(bbox1: Dict, bbox2: Dict) -> float:
    """
    Compute the distance between the centers of two boxes.

    Args:
        bbox1: {"x", "y", "width", "height"} box.
        bbox2: Same structure as bbox1.

    Returns:
        Euclidean distance between the two box centers.
    """
    center_a = (bbox1["x"] + bbox1["width"] / 2, bbox1["y"] + bbox1["height"] / 2)
    center_b = (bbox2["x"] + bbox2["width"] / 2, bbox2["y"] + bbox2["height"] / 2)

    dx = center_a[0] - center_b[0]
    dy = center_a[1] - center_b[1]

    return np.sqrt(dx ** 2 + dy ** 2)
|
||||
|
||||
|
||||
def calculate_embedding_similarity(emb1: List[float], emb2: List[float]) -> float:
    """
    Compute the cosine similarity of two embedding vectors.

    Args:
        emb1: First embedding, or None.
        emb2: Second embedding, or None.

    Returns:
        Cosine similarity (-1.0 - 1.0); 0.0 when either embedding is None
        or has zero norm.
    """
    if emb1 is None or emb2 is None:
        return 0.0

    vec_a, vec_b = np.array(emb1), np.array(emb2)
    len_a, len_b = np.linalg.norm(vec_a), np.linalg.norm(vec_b)

    # A zero-length vector has no direction; report "no similarity"
    if len_a == 0 or len_b == 0:
        return 0.0

    return np.dot(vec_a, vec_b) / (len_a * len_b)
|
||||
|
||||
|
||||
def match_faces(
    current_faces: List[Dict],
    previous_faces: List[Dict],
    iou_threshold: float = 0.3,
    similarity_threshold: float = 0.7,
    distance_threshold: float = 100.0,
    use_embedding: bool = True,
) -> Dict[int, int]:
    """
    Greedily match faces of the current frame to faces of the previous frame.

    Current faces are processed in list order; each takes the best-scoring
    previous face that has not been claimed yet. A pair scores when, in
    order of precedence: IoU and similarity both exceed their thresholds;
    IoU alone exceeds 0.5; similarity alone exceeds 0.85; or the centers are
    closer than distance_threshold and similarity exceeds 0.6.

    Args:
        current_faces: Faces in current frame ("x", "y", "width", "height",
            optional "embedding").
        previous_faces: Faces in previous frame, same structure.
        iou_threshold: Minimum IoU for the combined IoU + similarity rule.
        similarity_threshold: Minimum similarity for the combined rule.
        distance_threshold: Maximum center distance for the distance rule.
        use_embedding: When False (or when either face has no embedding),
            similarity is treated as 0.0, so only the IoU > 0.5 rule can
            match.

    Returns:
        Dict mapping current_face_index -> previous_face_index (or -1 if new)
    """
    if not previous_faces:
        return {idx: -1 for idx in range(len(current_faces))}

    def box_of(face: Dict) -> Dict:
        return {
            "x": face["x"],
            "y": face["y"],
            "width": face["width"],
            "height": face["height"],
        }

    def pair_score(iou: float, similarity: float, distance: float) -> float:
        if iou > iou_threshold and similarity > similarity_threshold:
            return iou + similarity
        if iou > 0.5:
            return iou * 2
        if similarity > 0.85:
            return similarity * 2
        if distance < distance_threshold and similarity > 0.6:
            return similarity - distance / 1000
        return 0.0

    assignment = {}
    claimed = set()

    for cur_i, cur_face in enumerate(current_faces):
        cur_box = box_of(cur_face)
        cur_vec = cur_face.get("embedding")

        winner = -1
        top_score = 0.0

        for old_i, old_face in enumerate(previous_faces):
            if old_i in claimed:
                continue

            old_box = box_of(old_face)
            old_vec = old_face.get("embedding")

            overlap = calculate_bbox_iou(cur_box, old_box)
            gap = calculate_bbox_distance(cur_box, old_box)

            likeness = 0.0
            if use_embedding and cur_vec and old_vec:
                likeness = calculate_embedding_similarity(cur_vec, old_vec)

            candidate = pair_score(overlap, likeness, gap)

            # Strict ">" keeps the earliest previous face on ties and
            # rejects non-positive scores
            if candidate > top_score:
                top_score = candidate
                winner = old_i

        if winner >= 0 and top_score > 0:
            assignment[cur_i] = winner
            claimed.add(winner)
        else:
            assignment[cur_i] = -1

    return assignment
|
||||
|
||||
|
||||
def _summarize_trace(trace_id: int, path: List[Dict], fps: float) -> Dict:
    """Build the summary record (timing, confidence, pose statistics) for one trace path."""
    duration_frames = path[-1]["frame"] - path[0]["frame"] + 1

    # Pose trace: full pose info per appearance
    pose_trace = []
    for point in path:
        pose_info = point.get("pose_full", {})
        pose_trace.append({
            "frame": point["frame"],
            "angle": pose_info.get("angle", "unknown"),
            "confidence": pose_info.get("confidence", 0.0),
            "pitch": pose_info.get("pitch", "neutral"),
            "features": pose_info.get("features", {}),
        })

    # Pose statistics
    pose_counts = defaultdict(int)
    pose_confidence_by_angle = defaultdict(list)
    for pose in pose_trace:
        pose_counts[pose["angle"]] += 1
        pose_confidence_by_angle[pose["angle"]].append(pose["confidence"])

    pose_statistics = {
        "distribution": dict(pose_counts),
        "avg_confidence_by_angle": {
            angle: round(sum(conf_list) / len(conf_list), 3)
            for angle, conf_list in pose_confidence_by_angle.items()
        },
        "dominant_angle": max(pose_counts, key=pose_counts.get) if pose_counts else "unknown",
        "pose_count": len(pose_counts),
    }

    # Pose transitions: one event each time the angle changes between
    # consecutive appearances
    pose_transitions = []
    for prev_pose, pose in zip(pose_trace, pose_trace[1:]):
        if pose["angle"] != prev_pose["angle"]:
            pose_transitions.append({
                "frame": pose["frame"],
                "from_angle": prev_pose["angle"],
                "to_angle": pose["angle"],
                "transition_index": len(pose_transitions) + 1,
            })

    return {
        "trace_id": trace_id,
        "start_frame": path[0]["frame"],
        "end_frame": path[-1]["frame"],
        "duration_frames": duration_frames,
        # Without a usable fps the duration in seconds cannot be derived
        "duration_seconds": duration_frames / fps if fps > 0 else 0.0,
        "total_appearances": len(path),
        "avg_confidence": sum(p["confidence"] for p in path) / len(path),
        "pose_angles": [p["pose_angle"] for p in path],
        "pose_trace": pose_trace,
        "pose_statistics": pose_statistics,
        "pose_transitions": pose_transitions,
        "path": path,
    }


def track_faces(
    face_data: Dict,
    iou_threshold: float = 0.3,
    similarity_threshold: float = 0.7,
    distance_threshold: float = 100.0,
    use_embedding: bool = True,
) -> Dict:
    """
    Track faces across all frames and attach trace information.

    Frames are visited in numeric order. Each face is matched against the
    faces of the most recent frame that contained faces; a frame without
    faces resets that reference, so traces do not continue across it.
    face_data is modified in place: every face gains "trace_id",
    face_data["traces"] holds one summary per trace, and
    face_data["metadata"]["trace_stats"] holds aggregate counts.

    Args:
        face_data: face.json data ("frames" keyed by frame-number strings,
            each with a "faces" list; optional "metadata" with "fps").
        iou_threshold: IoU threshold for matching
        similarity_threshold: Embedding similarity threshold
        distance_threshold: Distance threshold for matching
        use_embedding: Whether to use embedding

    Returns:
        The same face_data object, updated. If "metadata" is missing it is
        created; if "fps" is missing or not positive, duration_seconds is
        reported as 0.0 instead of raising.
    """
    frames = face_data.get("frames", {})

    if not frames:
        print("No frames found in face.json")
        return face_data

    metadata = face_data.setdefault("metadata", {})
    fps = metadata.get("fps") or 0
    if fps <= 0:
        print("Warning: metadata.fps missing or not positive; duration_seconds set to 0.0")

    sorted_frames = sorted(frames.items(), key=lambda x: int(x[0]))

    next_trace_id = 0
    traces = defaultdict(list)

    prev_faces = []
    prev_trace_ids = []

    print(f"\nTracking faces across {len(sorted_frames)} frames...")
    print(f"Parameters: iou={iou_threshold}, similarity={similarity_threshold}, distance={distance_threshold}")
    print()

    for frame_num_str, frame_data in sorted_frames:
        frame_num = int(frame_num_str)
        faces = frame_data.get("faces", [])

        if not faces:
            # Nothing to match against in the next frame
            prev_faces = []
            prev_trace_ids = []
            continue

        matches = match_faces(
            faces,
            prev_faces,
            iou_threshold,
            similarity_threshold,
            distance_threshold,
            use_embedding,
        )

        # matches is keyed by face index in ascending order, so trace_ids
        # stays aligned with the faces list
        trace_ids = []
        for curr_idx, prev_idx in matches.items():
            if prev_idx >= 0:
                trace_id = prev_trace_ids[prev_idx]
            else:
                trace_id = next_trace_id
                next_trace_id += 1

            face = faces[curr_idx]
            face["trace_id"] = trace_id
            trace_ids.append(trace_id)

            # A null pose_angle is treated the same as a missing one
            pose_full = face.get("pose_angle") or {}
            traces[trace_id].append({
                "frame": frame_num,
                "face_index": curr_idx,
                "bbox": {
                    "x": face["x"],
                    "y": face["y"],
                    "width": face["width"],
                    "height": face["height"],
                },
                "confidence": face.get("confidence", 0.0),
                "pose_angle": pose_full.get("angle", "unknown"),
                "pose_full": pose_full,  # full pose info
            })

        prev_faces = faces
        prev_trace_ids = trace_ids

        if frame_num % 100 == 0:
            print(f"  Frame {frame_num}: {len(faces)} faces, {len(set(trace_ids))} active traces")

    face_data["traces"] = {
        str(trace_id): _summarize_trace(trace_id, path, fps)
        for trace_id, path in traces.items()
    }

    metadata["trace_stats"] = {
        "total_traces": next_trace_id,
        "active_traces": len(traces),
        "long_traces": len([t for t in traces.values() if len(t) >= 2]),
    }

    return face_data
|
||||
|
||||
|
||||
def analyze_traces(face_data: Dict) -> None:
    """
    Print a human-readable report of the trace statistics.

    Sections: overall counts, the ten longest traces (with pose distribution
    and up to three pose transitions each), the pose distribution summed
    over all traces, and a short/medium/long duration histogram.

    Args:
        face_data: Tracked face data; reads "traces" and
            metadata["trace_stats"]["total_traces"].
    """
    traces = face_data.get("traces", {})
    metadata = face_data.get("metadata", {})

    print("\n" + "=" * 60)
    print("Face Trace Analysis")
    print("=" * 60)

    # Count only traces seen at least twice; "traces" itself also contains
    # single-appearance traces, so its length is not the long-trace count
    long_trace_count = sum(
        1 for trace in traces.values() if trace.get("total_appearances", 0) >= 2
    )

    print(f"\nTotal traces: {metadata.get('trace_stats', {}).get('total_traces', 0)}")
    print(f"Long traces (>= 2 frames): {long_trace_count}")

    if not traces:
        return

    sorted_traces = sorted(traces.values(), key=lambda x: x["duration_frames"], reverse=True)

    print("\n=== Top 10 Longest Traces ===")
    for trace in sorted_traces[:10]:
        print(f"\nTrace {trace['trace_id']}:")
        print(f"  Frames: {trace['start_frame']} - {trace['end_frame']} ({trace['duration_frames']} frames)")
        print(f"  Duration: {trace['duration_seconds']:.2f} seconds")
        print(f"  Appearances: {trace['total_appearances']}")
        print(f"  Avg Confidence: {trace['avg_confidence']:.3f}")

        # Pose Statistics
        trace_pose_stats = trace.get("pose_statistics", {})
        print(f"  Pose Distribution: {trace_pose_stats.get('distribution', {})}")
        print(f"  Dominant Angle: {trace_pose_stats.get('dominant_angle', 'unknown')}")

        # Pose Transitions
        transitions = trace.get("pose_transitions", [])
        if transitions:
            print(f"  Pose Transitions: {len(transitions)} events")
            for t in transitions[:3]:  # show only the first 3
                print(f"    - Frame {t['frame']}: {t['from_angle']} → {t['to_angle']}")

    pose_totals = defaultdict(int)
    for trace in traces.values():
        for pose in trace["pose_angles"]:
            pose_totals[pose] += 1

    print("\n=== Pose Distribution in Traces ===")
    for pose, count in sorted(pose_totals.items(), key=lambda x: x[1], reverse=True):
        print(f"  {pose}: {count}")

    duration_distribution = defaultdict(int)
    for trace in traces.values():
        d = trace["duration_frames"]
        if d <= 30:
            duration_distribution["short (<= 30 frames)"] += 1
        elif d <= 90:
            duration_distribution["medium (31-90 frames)"] += 1
        else:
            duration_distribution["long (> 90 frames)"] += 1

    print("\n=== Trace Duration Distribution ===")
    for duration, count in sorted(duration_distribution.items()):
        print(f"  {duration}: {count}")
|
||||
|
||||
|
||||
def main():
    """
    CLI entry point: load face.json, track faces, print the analysis and
    (unless --analyze-only) write the traced JSON.

    The default output file is "<input stem>_traced.json" next to the input.
    It is derived from the file name only, so an input without a ".json"
    suffix is never overwritten and directory names are left untouched.
    """
    from pathlib import Path

    parser = argparse.ArgumentParser(description="Track faces across frames")
    parser.add_argument("--face-json", required=True, help="Path to face.json")
    parser.add_argument("--output", help="Output path (default: face_traced.json)")
    parser.add_argument("--iou-threshold", type=float, default=0.3, help="IoU threshold")
    parser.add_argument("--similarity-threshold", type=float, default=0.7, help="Embedding similarity threshold")
    parser.add_argument("--distance-threshold", type=float, default=100.0, help="Distance threshold")
    parser.add_argument("--no-embedding", action="store_true", help="Disable embedding matching")
    parser.add_argument("--analyze-only", action="store_true", help="Only analyze, don't output")
    args = parser.parse_args()

    print("=" * 60)
    print("Face Tracker")
    print("=" * 60)

    with open(args.face_json, encoding="utf-8") as f:
        face_data = json.load(f)

    print(f"\nInput: {args.face_json}")
    print(f"Frames: {len(face_data.get('frames', {}))}")

    face_data = track_faces(
        face_data,
        iou_threshold=args.iou_threshold,
        similarity_threshold=args.similarity_threshold,
        distance_threshold=args.distance_threshold,
        use_embedding=not args.no_embedding,
    )

    analyze_traces(face_data)

    if not args.analyze_only:
        if args.output:
            output_path = args.output
        else:
            # Build the name from the stem so the result always differs
            # from the input path
            source = Path(args.face_json)
            output_path = str(source.with_name(f"{source.stem}_traced.json"))

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(face_data, f, indent=2)
        print(f"\n✅ Output saved to: {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
522
scripts/utils/pose_action_decoder.py
Normal file
522
scripts/utils/pose_action_decoder.py
Normal file
@@ -0,0 +1,522 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Pose Action Decoder - Convert pose_trace into human-readable action names
|
||||
|
||||
Purpose:
|
||||
1. Decode pose transitions into action names (turn left/right, look up/down, shake head, nod)
|
||||
2. Identify stable pose segments with duration
|
||||
3. Generate action timeline for each trace
|
||||
|
||||
Action Types:
|
||||
- Simple: turn_left, turn_right, look_up, look_down
|
||||
- Complex: shake_head, nod_head, turn_full
|
||||
- Stable: frontal_stable, profile_left_stable, profile_right_stable, three_quarter_stable
|
||||
|
||||
Output:
|
||||
1. Action timeline (frame-based action list)
|
||||
2. Action summary (total counts, duration)
|
||||
3. Action visualization (timeline plot)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from typing import Dict, List, Optional
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# Action definitions
# Lookup table keyed by (from_state, to_state) tuples; the value is the action
# name reported for that change. Yaw states (frontal / three_quarter /
# profile_*) and pitch states (neutral / tilted_*) share this one table.
POSE_TO_ACTION = {
    # Turn actions (angle changes)
    ("frontal", "three_quarter"): "turn_partial",
    ("frontal", "profile_left"): "turn_left",
    ("frontal", "profile_right"): "turn_right",
    ("three_quarter", "frontal"): "return_frontal",
    ("three_quarter", "profile_left"): "turn_left",
    ("three_quarter", "profile_right"): "turn_right",
    ("profile_left", "frontal"): "turn_to_frontal",
    ("profile_left", "three_quarter"): "turn_to_three_quarter",
    ("profile_left", "profile_right"): "turn_full",
    ("profile_right", "frontal"): "turn_to_frontal",
    ("profile_right", "three_quarter"): "turn_to_three_quarter",
    ("profile_right", "profile_left"): "turn_full",

    # Pitch actions
    ("neutral", "tilted_up"): "look_up",
    ("neutral", "tilted_down"): "look_down",
    ("tilted_up", "neutral"): "return_neutral",
    ("tilted_down", "neutral"): "return_neutral",
    ("tilted_up", "tilted_down"): "nod_full",
    ("tilted_down", "tilted_up"): "nod_full",
}
|
||||
|
||||
# Stable pose names
# Maps a pose angle to the action name used when that pose is held long
# enough to count as stable.
STABLE_ACTION_NAMES = {
    "frontal": "frontal_stable",
    "three_quarter": "three_quarter_stable",
    "profile_left": "profile_left_stable",
    "profile_right": "profile_right_stable",
    "unknown": "pose_unknown",
}
|
||||
|
||||
# Complex action patterns (3+ transitions in short time)
# Each entry gives the state "sequence" to match and the allowed span in
# frames ("min_frames".."max_frames"); "pitch_mode" marks patterns matched on
# pitch instead of angle.
# NOTE(review): detect_complex_actions in this file spells out the same
# sequences and frame bounds inline and does not read this table - confirm
# whether anything else consumes it.
COMPLEX_PATTERNS = {
    # Shake head: profile_left → profile_right → profile_left (or reverse)
    "shake_head": {
        "sequence": ["profile_left", "profile_right", "profile_left"],
        "min_frames": 5,
        "max_frames": 30,
    },
    "shake_head_reverse": {
        "sequence": ["profile_right", "profile_left", "profile_right"],
        "min_frames": 5,
        "max_frames": 30,
    },
    # Nod: tilted_up → tilted_down → tilted_up (or reverse)
    "nod_head": {
        "sequence": ["tilted_up", "tilted_down", "tilted_up"],
        "min_frames": 3,
        "max_frames": 20,
        "pitch_mode": True,
    },
}
|
||||
|
||||
|
||||
def decode_pose_to_action(from_pose: str, to_pose: str) -> str:
    """
    Translate a single pose change into its action name.

    Args:
        from_pose: Pose before the change.
        to_pose: Pose after the change.

    Returns:
        The POSE_TO_ACTION entry for (from_pose, to_pose), or the generic
        name "pose_change_<from>_to_<to>" when the pair is not listed.
    """
    fallback = f"pose_change_{from_pose}_to_{to_pose}"
    return POSE_TO_ACTION.get((from_pose, to_pose), fallback)
|
||||
|
||||
|
||||
def detect_complex_actions(pose_trace: List[Dict]) -> List[Dict]:
    """
    Find head-shake and nod patterns in a pose trace.

    Every window of three consecutive trace entries is examined. A shake is
    left-right-left (or the mirror) in "angle" spanning 5-30 frames; a nod
    is up-down-up (or the mirror) in "pitch" spanning 3-20 frames. Windows
    may overlap, so an extended oscillation yields several events.

    Args:
        pose_trace: Entries with "frame", "angle" and "pitch".

    Returns:
        All shake events in trace order, followed by all nod events in
        trace order.
    """
    shake_variants = (
        (["profile_left", "profile_right", "profile_left"], "shake head left-right-left"),
        (["profile_right", "profile_left", "profile_right"], "shake head right-left-right"),
    )
    nod_variants = (
        ["tilted_up", "tilted_down", "tilted_up"],
        ["tilted_down", "tilted_up", "tilted_down"],
    )

    def make_event(action: str, first: Dict, last: Dict, span: int, text: str) -> Dict:
        return {
            "action": action,
            "start_frame": first["frame"],
            "end_frame": last["frame"],
            "duration_frames": span,
            "description": text,
        }

    events = []

    # Pass 1: shakes, matched on the yaw angle
    for first, middle, last in zip(pose_trace, pose_trace[1:], pose_trace[2:]):
        observed = [first["angle"], middle["angle"], last["angle"]]
        for pattern, text in shake_variants:
            if observed == pattern:
                span = last["frame"] - first["frame"]
                if 5 <= span <= 30:
                    events.append(make_event("shake_head", first, last, span, text))
                break

    # Pass 2: nods, matched on pitch
    for first, middle, last in zip(pose_trace, pose_trace[1:], pose_trace[2:]):
        observed = [first["pitch"], middle["pitch"], last["pitch"]]
        if any(observed == pattern for pattern in nod_variants):
            span = last["frame"] - first["frame"]
            if 3 <= span <= 20:
                events.append(make_event("nod_head", first, last, span, "nod head up-down"))

    return events
|
||||
|
||||
|
||||
def build_action_timeline(trace: Dict) -> Dict:
    """
    Turn one trace's pose data into a frame-ordered list of actions.

    The timeline combines three sources: pose segments (runs of identical
    angle + pitch; 10 or more frames is "stable", shorter is "transitional"),
    the trace's recorded pose_transitions, and complex actions found by
    detect_complex_actions.

    Args:
        trace: Trace data with "pose_trace" and "pose_transitions"
            (and optionally "trace_id").

    Returns:
        Dict with "trace_id", "action_timeline", "action_summary" and
        "complex_actions"; the last three are empty when the trace has no
        pose entries.
    """
    pose_trace = trace.get("pose_trace", [])
    pose_transitions = trace.get("pose_transitions", [])

    if not pose_trace:
        return {
            "trace_id": trace.get("trace_id"),
            "action_timeline": [],
            "action_summary": {},
            "complex_actions": [],
        }

    complex_actions = detect_complex_actions(pose_trace)

    # --- Split the trace into segments of constant (angle, pitch) ---
    segments = []
    head = pose_trace[0]
    seg_angle, seg_pitch, seg_start = head["angle"], head["pitch"], head["frame"]

    for earlier, entry in zip(pose_trace, pose_trace[1:]):
        if entry["angle"] == seg_angle and entry["pitch"] == seg_pitch:
            continue
        segments.append({
            "angle": seg_angle,
            "pitch": seg_pitch,
            "start_frame": seg_start,
            "end_frame": earlier["frame"],
            "duration_frames": earlier["frame"] - seg_start + 1,
        })
        seg_angle, seg_pitch, seg_start = entry["angle"], entry["pitch"], entry["frame"]

    # Close the segment that is still open at the end of the trace
    tail_frame = pose_trace[-1]["frame"]
    segments.append({
        "angle": seg_angle,
        "pitch": seg_pitch,
        "start_frame": seg_start,
        "end_frame": tail_frame,
        "duration_frames": tail_frame - seg_start + 1,
    })

    # --- Segment entries ---
    timeline = []
    for seg in segments:
        length = seg["duration_frames"]
        if length >= 10:  # held long enough to count as a stable pose
            name = STABLE_ACTION_NAMES.get(seg["angle"], "pose_stable")
            if seg["pitch"] != "neutral":
                name += f"_pitch_{seg['pitch']}"
            kind, adjective = "stable", "stable"
        else:  # short-lived pose
            name = f"pose_{seg['angle']}_brief"
            kind, adjective = "transitional", "brief"

        timeline.append({
            "frame": seg["start_frame"],
            "action": name,
            "duration_frames": length,
            "description": f"{adjective} {seg['angle']} pose for {length} frames",
            "type": kind,
        })

    # --- Transition entries (treated as lasting one frame) ---
    timeline.extend(
        {
            "frame": trans["frame"],
            "action": decode_pose_to_action(trans["from_angle"], trans["to_angle"]),
            "duration_frames": 1,
            "description": f"transition from {trans['from_angle']} to {trans['to_angle']}",
            "type": "transition",
        }
        for trans in pose_transitions
    )

    # --- Complex entries ---
    timeline.extend(
        {
            "frame": event["start_frame"],
            "action": event["action"],
            "duration_frames": event["duration_frames"],
            "description": event["description"],
            "type": "complex",
        }
        for event in complex_actions
    )

    # Order by frame, longest action first within a frame. The sort is
    # stable, so entries with equal keys stay in insertion order.
    timeline.sort(key=lambda item: (item["frame"], -item["duration_frames"]))

    # --- Summary ---
    counts = {}
    durations = {}
    stable_total = 0
    for item in timeline:
        name = item["action"]
        counts[name] = counts.get(name, 0) + 1
        durations[name] = durations.get(name, 0.0) + item["duration_frames"]
        if item["type"] == "stable":
            stable_total += 1

    if timeline:
        stable_percentage = round(stable_total / len(timeline) * 100, 1)
    else:
        stable_percentage = 0

    action_summary = {
        "total_actions": len(timeline),
        "unique_actions": len(counts),
        "action_counts": counts,
        "action_durations_frames": {name: round(total, 1) for name, total in durations.items()},
        "complex_action_count": len(complex_actions),
        "stable_percentage": stable_percentage,
    }

    return {
        "trace_id": trace.get("trace_id"),
        "action_timeline": timeline,
        "action_summary": action_summary,
        "complex_actions": complex_actions,
    }
|
||||
|
||||
|
||||
def generate_action_description(action_timeline: List[Dict]) -> str:
    """
    Summarize an action timeline as a short, human-readable string.

    Entries are bucketed by their "type" field. Only "stable",
    "transition" and "complex" entries contribute text; entries of any
    other type (for example "transitional") are ignored.

    Args:
        action_timeline: Timeline entries. Every entry needs a "type" key;
            stable entries are reported by "description", transition and
            complex entries by "action".

    Returns:
        The non-empty parts joined with ". ", or "No actions detected" when
        the timeline is empty. A non-empty timeline that holds none of the
        three reported types yields an empty string.
    """
    if not action_timeline:
        return "No actions detected"

    stable_entries, transition_entries, complex_entries = [], [], []
    for entry in action_timeline:
        kind = entry["type"]
        if kind == "stable":
            stable_entries.append(entry)
        elif kind == "transition":
            transition_entries.append(entry)
        elif kind == "complex":
            complex_entries.append(entry)

    sentences = []

    # Only the first three stable poses are listed.
    if stable_entries:
        shown = ", ".join(f"{entry['description']}" for entry in stable_entries[:3])
        sentences.append(f"Stable poses: {shown}")

    # Only the first five transitions are listed.
    if transition_entries:
        shown = ", ".join(entry["action"] for entry in transition_entries[:5])
        sentences.append(f"Transitions: {shown}")

    # Complex actions are listed in full.
    if complex_entries:
        shown = ", ".join(entry["action"] for entry in complex_entries)
        sentences.append(f"Complex actions: {shown}")

    return ". ".join(sentences)
|
||||
|
||||
|
||||
def visualize_action_timeline(action_data: Dict, output_path: str = None) -> None:
    """
    Draw one horizontal action timeline per trace.

    Actions longer than one frame are drawn as bars; one-frame actions are
    drawn as dashed vertical lines with a rotated label. Stable actions
    longer than 30 frames also get a centered text label.

    Args:
        action_data: Dict with a "traces" mapping of trace id to a dict
            holding an "action_timeline" list. Each timeline entry needs
            "frame", "action", "type" and "duration_frames".
        output_path: When given, the figure is saved there as an image;
            otherwise it is shown interactively.

    Returns:
        None. Prints "No traces found" and returns early when there are no
        traces.
    """
    traces_data = action_data.get("traces", {})

    if not traces_data:
        print("No traces found")
        return

    fig, axes = plt.subplots(len(traces_data), 1, figsize=(16, 3 * len(traces_data)))

    # With a single row plt.subplots returns a bare Axes, not a sequence.
    if len(traces_data) == 1:
        axes = [axes]

    action_colors = {
        "frontal_stable": "green",
        "three_quarter_stable": "blue",
        "profile_left_stable": "orange",
        "profile_right_stable": "red",
        "turn_left": "purple",
        "turn_right": "purple",
        "turn_full": "darkred",
        "shake_head": "yellow",
        "nod_head": "cyan",
        "look_up": "lightgreen",
        "look_down": "brown",
    }

    try:
        for ax, (trace_id, data) in zip(axes, sorted(traces_data.items())):
            timeline = data["action_timeline"]

            if not timeline:
                continue

            for act in timeline:
                # Actions without a dedicated color fall back to gray.
                color = action_colors.get(act["action"], "gray")

                if act["duration_frames"] > 1:
                    ax.barh(
                        y=0,
                        width=act["duration_frames"],
                        left=act["frame"],
                        height=0.8,
                        color=color,
                        alpha=0.6,
                        edgecolor="black",
                        linewidth=0.5,
                    )

                    # Label only long stable bars so the text fits inside.
                    if act["type"] == "stable" and act["duration_frames"] > 30:
                        ax.text(
                            act["frame"] + act["duration_frames"] / 2,
                            0,
                            act["action"],
                            ha="center",
                            va="center",
                            fontsize=8,
                            color="white",
                        )
                else:
                    # Instant action (transition)
                    ax.axvline(x=act["frame"], color=color, linestyle="--", alpha=0.8)
                    ax.text(
                        act["frame"],
                        0.5,
                        act["action"],
                        fontsize=7,
                        rotation=90,
                        va="bottom",
                        ha="center",
                    )

            ax.set_xlabel("Frame Number")
            ax.set_ylabel("Action")
            ax.set_title(f"Trace {trace_id} Action Timeline")
            ax.set_ylim(-0.5, 1)
            ax.grid(True, alpha=0.3)

        plt.tight_layout()

        if output_path:
            plt.savefig(output_path, dpi=150, bbox_inches="tight")
            print(f"\n✅ Visualization saved to: {output_path}")
        else:
            plt.show()
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it so repeated calls do not accumulate figures.
        plt.close(fig)
|
||||
|
||||
|
||||
def print_action_report(action_data: Dict) -> None:
    """
    Print a per-trace text report of decoded actions to stdout.

    For each trace (in sorted key order) this prints the summary counters,
    the per-action counts in descending order, the first ten timeline
    entries, any complex actions, and the sentence produced by
    generate_action_description().

    Args:
        action_data: Dict with a "traces" mapping of trace id to a dict
            holding "action_summary", "action_timeline" and
            "complex_actions".
    """
    traces_data = action_data.get("traces", {})
    rule = "=" * 70

    print("\n" + rule)
    print("Pose Action Decoder Report")
    print(rule)

    for trace_id in sorted(traces_data):
        data = traces_data[trace_id]

        print("\n" + rule)
        print(f"Trace {trace_id}")
        print(rule)

        summary = data["action_summary"]
        print("\nSummary:")
        print(f" Total Actions: {summary['total_actions']}")
        print(f" Unique Actions: {summary['unique_actions']}")
        print(f" Complex Actions: {summary['complex_action_count']}")
        print(f" Stable Percentage: {summary['stable_percentage']}%")

        # Most frequent actions first.
        print("\nAction Counts:")
        ranked = sorted(summary["action_counts"].items(), key=lambda pair: pair[1], reverse=True)
        for action, count in ranked:
            print(f" {action}: {count}")

        # Header text means "first 10 entries".
        print("\nAction Timeline (前 10 个):")
        timeline = data["action_timeline"]
        for act in timeline[:10]:
            print(f" Frame {act['frame']}: {act['action']} ({act['type']}, {act['duration_frames']} frames)")

        if data["complex_actions"]:
            print("\nComplex Actions:")
            for act in data["complex_actions"]:
                print(f" {act['action']}: frames {act['start_frame']}-{act['end_frame']} ({act['duration_frames']} frames)")

        desc = generate_action_description(data["action_timeline"])
        print("\nHuman-readable Description:")
        print(f" {desc}")
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point: decode pose traces into an action report.

    Reads the traced-face JSON given by --face-json, builds an action
    timeline for every trace (or only the one selected with --trace-id),
    prints the report, and optionally writes the result as JSON
    (--output-json) and as a timeline plot (--output-plot).
    """
    parser = argparse.ArgumentParser(description="Decode pose_trace into action names")
    parser.add_argument("--face-json", required=True, help="Path to face_traced.json")
    parser.add_argument("--output-json", help="Output action data JSON")
    parser.add_argument("--output-plot", help="Output action timeline plot PNG")
    parser.add_argument("--trace-id", type=int, help="Analyze specific trace only")
    args = parser.parse_args()

    print("=" * 70)
    print("Pose Action Decoder")
    print("=" * 70)

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(args.face_json, encoding="utf-8") as f:
        face_data = json.load(f)

    traces = face_data.get("traces", {})

    if not traces:
        print("No traces found in face_traced.json")
        return

    # Compare against None so that "--trace-id 0" still selects trace 0
    # instead of being treated as "no filter".
    if args.trace_id is not None:
        wanted_key = str(args.trace_id)
        selected = traces.get(wanted_key)
        if not selected:
            print(f"Trace {args.trace_id} not found")
            return
        traces = {wanted_key: selected}

    print(f"\nAnalyzing {len(traces)} traces...")

    action_data = {"traces": {}}

    for trace_id_str, trace in traces.items():
        action_data["traces"][trace_id_str] = build_action_timeline(trace)

    print_action_report(action_data)

    if args.output_json:
        with open(args.output_json, "w") as f:
            json.dump(action_data, f, indent=2)
        print(f"\n✅ Action data saved to: {args.output_json}")

    if args.output_plot:
        visualize_action_timeline(action_data, args.output_plot)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
402
scripts/utils/pose_analyzer.py
Normal file
402
scripts/utils/pose_analyzer.py
Normal file
@@ -0,0 +1,402 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Pose Analyzer - Multi-feature Pose Angle Classification
|
||||
|
||||
Purpose:
|
||||
1. Calculate pose angle from 5-point landmarks (InsightFace kps)
|
||||
2. Use multiple features for accurate classification:
|
||||
- nose_to_eye_ratio: nose distance relative to eye width
|
||||
- eye_slope: eye line slope (pitch detection)
|
||||
- nose_offset: nose position relative to eye center
|
||||
- mouth_symmetry: mouth corners symmetry
|
||||
3. Provide confidence score for classification
|
||||
|
||||
Landmarks Order (InsightFace kps):
|
||||
- 0: left eye
|
||||
- 1: right eye
|
||||
- 2: nose
|
||||
- 3: left mouth corner
|
||||
- 4: right mouth corner
|
||||
|
||||
Angles:
|
||||
- frontal: nose near center, low ratio (V2: ratio < 0.35 and |normalized nose offset| < 0.15; V1: ratio < 0.4)
|
||||
- three_quarter: moderate offset (V2: ratio < 0.55 and |normalized nose offset| < 0.25; V1: ratio 0.4 - 0.6)
|
||||
- profile_left: nose left of eye center (V2: ratio >= 0.55 and normalized nose offset < -0.1; V1: ratio >= 0.6)
|
||||
- profile_right: nose right of eye center (V2: ratio >= 0.55 and normalized nose offset > 0.1; V1: ratio >= 0.6)
|
||||
|
||||
Usage:
|
||||
from pose_analyzer import calculate_pose_angle_v2
|
||||
|
||||
pose_result = calculate_pose_angle_v2(landmarks)
|
||||
print(f"Angle: {pose_result['angle']}, Confidence: {pose_result['confidence']}")
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
def calculate_nose_to_eye_ratio(landmarks: List) -> Tuple[float, float, float]:
    """
    Measure the nose distance from the eye midpoint, in eye-width units.

    Args:
        landmarks: Points indexed as left eye (0), right eye (1), nose (2).
            Only the first two coordinates of each point are used. At least
            five points must be supplied.

    Returns:
        (ratio, eye_width, nose_to_eye_distance). All zeros when fewer than
        five landmarks are given; ratio is 0.0 when the eye width is zero.
    """
    if len(landmarks) < 5:
        return (0.0, 0.0, 0.0)

    eye_l, eye_r, nose_pt = (np.array(landmarks[i][:2]) for i in (0, 1, 2))

    midpoint = (eye_l + eye_r) / 2
    eye_width = np.linalg.norm(eye_r - eye_l)
    nose_to_eye = np.linalg.norm(nose_pt - midpoint)

    # Coincident eye points would divide by zero; report a zero ratio.
    if eye_width > 0:
        ratio = nose_to_eye / eye_width
    else:
        ratio = 0.0

    return (ratio, eye_width, nose_to_eye)
|
||||
|
||||
|
||||
def calculate_eye_slope(landmarks: List) -> Tuple[float, float]:
    """
    Compute the slope of the line from the left eye to the right eye.

    A positive slope means the right eye has the larger y coordinate. The
    V2 classifier in this module reports slopes above 0.15 as
    "tilted_down" and below -0.15 as "tilted_up".

    Args:
        landmarks: Points indexed as left eye (0) and right eye (1). Only
            the first two coordinates are used. At least five points must
            be supplied.

    Returns:
        (slope, angle_degrees). Both are 0.0 when fewer than five landmarks
        are given; slope is 0.0 when both eyes share the same x coordinate.
    """
    if len(landmarks) < 5:
        return (0.0, 0.0)

    eye_l = np.array(landmarks[0][:2])
    eye_r = np.array(landmarks[1][:2])

    delta = eye_r - eye_l
    dx = delta[0]
    dy = delta[1]

    # A vertical eye line has no finite slope; it is reported as 0.0.
    slope = 0.0 if dx == 0 else dy / dx
    angle_degrees = np.arctan(slope) * 180 / np.pi

    return (slope, angle_degrees)
|
||||
|
||||
|
||||
def calculate_nose_offset(landmarks: List) -> Tuple[float, float]:
    """
    Compute the horizontal nose displacement from the eye midpoint.

    Args:
        landmarks: Points indexed as left eye (0), right eye (1), nose (2).
            Only the first two coordinates are used. At least five points
            must be supplied.

    Returns:
        (offset_x, normalized_offset), where offset_x is nose x minus the
        eye-midpoint x and normalized_offset is offset_x divided by the eye
        width. Both are 0.0 when fewer than five landmarks are given;
        normalized_offset is 0.0 when the eye width is zero.
    """
    if len(landmarks) < 5:
        return (0.0, 0.0)

    eye_l, eye_r, nose_pt = (np.array(landmarks[i][:2]) for i in (0, 1, 2))

    midpoint = (eye_l + eye_r) / 2
    eye_width = np.linalg.norm(eye_r - eye_l)

    offset_x = nose_pt[0] - midpoint[0]
    if eye_width > 0:
        normalized_offset = offset_x / eye_width
    else:
        normalized_offset = 0.0

    return (offset_x, normalized_offset)
|
||||
|
||||
|
||||
def calculate_mouth_symmetry(landmarks: List) -> Tuple[float, float]:
    """
    Score how evenly the two mouth corners sit around the nose.

    The score is the shorter nose-to-corner distance divided by the longer
    one, so 1.0 means both corners are equally far from the nose and lower
    values mean more asymmetry.

    Args:
        landmarks: Points indexed as nose (2), left mouth corner (3) and
            right mouth corner (4). Only the first two coordinates are
            used. At least five points must be supplied.

    Returns:
        (symmetry_score, mouth_width). (1.0, 0.0) when fewer than five
        landmarks are given; the score is 1.0 when both corners coincide
        with the nose.
    """
    if len(landmarks) < 5:
        return (1.0, 0.0)

    corner_l = np.array(landmarks[3][:2])
    corner_r = np.array(landmarks[4][:2])
    nose_pt = np.array(landmarks[2][:2])

    mouth_width = np.linalg.norm(corner_r - corner_l)

    dist_l = np.linalg.norm(corner_l - nose_pt)
    dist_r = np.linalg.norm(corner_r - nose_pt)

    longer = max(dist_l, dist_r)
    if longer > 0:
        symmetry = min(dist_l, dist_r) / longer
    else:
        symmetry = 1.0

    return (symmetry, mouth_width)
|
||||
|
||||
|
||||
def calculate_jaw_visibility_hint(landmarks: List) -> float:
    """
    Derive a 0..1 hint from the vertical spacing of eyes, nose and mouth.

    The value is the nose-to-mouth vertical distance divided by the
    eye-to-nose vertical distance, clamped to [0.0, 1.0].

    Args:
        landmarks: Points indexed as left eye (0), right eye (1), nose (2),
            left mouth corner (3), right mouth corner (4). Only the first
            two coordinates are used.

    Returns:
        The clamped ratio; 0.5 when fewer than five landmarks are given or
        when the nose is not below the eye line (eye-to-nose distance not
        positive).
    """
    if len(landmarks) < 5:
        return 0.5

    eye_l, eye_r, nose_pt, corner_l, corner_r = (
        np.array(landmarks[i][:2]) for i in range(5)
    )

    eyes_y = (eye_l[1] + eye_r[1]) / 2
    mouth_y = (corner_l[1] + corner_r[1]) / 2

    lower_gap = mouth_y - nose_pt[1]
    upper_gap = nose_pt[1] - eyes_y

    if upper_gap > 0:
        ratio = lower_gap / upper_gap
    else:
        ratio = 0.5

    # Clamp to the documented 0.0 - 1.0 range.
    return min(1.0, max(0.0, ratio))
|
||||
|
||||
|
||||
def classify_angle_from_features(
    ratio: float,
    nose_offset_norm: float,
    mouth_symmetry: float,
    eye_slope: float,
) -> Tuple[str, float]:
    """
    Map geometric face features to a pose label and a fixed confidence.

    Rules are checked in order:
      1. ratio < 0.35 and |offset| < 0.15          -> frontal (0.95)
      2. ratio < 0.55 and |offset| < 0.25          -> three_quarter (0.85)
      3. ratio >= 0.55 and offset < -0.1           -> profile_left
         ratio >= 0.55 and offset > 0.1            -> profile_right
         (0.90 when mouth_symmetry < 0.85, otherwise 0.75)
      4. ratio >= 0.55 with a small offset         -> three_quarter (0.70)
      5. anything else                             -> unknown (0.50)

    Args:
        ratio: Nose-to-eye distance divided by eye width.
        nose_offset_norm: Horizontal nose offset divided by eye width.
        mouth_symmetry: Mouth-corner symmetry score.
        eye_slope: Accepted for interface compatibility; not used here.

    Returns:
        (angle_type, confidence)
    """
    offset_size = abs(nose_offset_norm)

    if ratio < 0.35 and offset_size < 0.15:
        return ("frontal", 0.95)

    if ratio < 0.55 and offset_size < 0.25:
        return ("three_quarter", 0.85)

    if ratio >= 0.55:
        if nose_offset_norm < -0.1:
            side = "profile_left"
        elif nose_offset_norm > 0.1:
            side = "profile_right"
        else:
            # Large ratio but the nose is horizontally centered.
            return ("three_quarter", 0.70)

        # Asymmetric mouth corners strengthen the profile call.
        return (side, 0.90 if mouth_symmetry < 0.85 else 0.75)

    return ("unknown", 0.50)
|
||||
|
||||
|
||||
def calculate_pose_angle_v2(landmarks: List) -> Dict:
    """
    Classify the pose angle from five landmarks using several features (V2).

    Combines the nose-to-eye ratio, eye-line slope, normalized nose offset
    and mouth symmetry; the jaw visibility hint is computed and reported
    but does not influence the classification.

    Args:
        landmarks: List of 5 points [[x, y], [x, y], ...]
            Order: left_eye, right_eye, nose, left_mouth, right_mouth

    Returns:
        Dict with:
        - angle: 'frontal', 'three_quarter', 'profile_left',
          'profile_right' or 'unknown'
        - confidence: 0.0 - 1.0
        - pitch: 'tilted_down', 'tilted_up' or 'neutral' (eye slope
          beyond +/-0.15)
        - features: Dict of all calculated features, rounded
        - method: always 'v2_multi_feature'
        - landmarks_count: number of landmarks received
        With fewer than five landmarks only angle ('unknown'), confidence
        (0.0), an empty features dict and method are returned.
    """
    method = "v2_multi_feature"

    if len(landmarks) < 5:
        return {
            "angle": "unknown",
            "confidence": 0.0,
            "features": {},
            "method": method,
        }

    ratio, eye_width, nose_to_eye = calculate_nose_to_eye_ratio(landmarks)
    eye_slope, eye_angle = calculate_eye_slope(landmarks)
    nose_offset, nose_offset_norm = calculate_nose_offset(landmarks)
    mouth_symmetry, mouth_width = calculate_mouth_symmetry(landmarks)
    jaw_hint = calculate_jaw_visibility_hint(landmarks)

    angle, confidence = classify_angle_from_features(
        ratio=ratio,
        nose_offset_norm=nose_offset_norm,
        mouth_symmetry=mouth_symmetry,
        eye_slope=eye_slope,
    )

    pitch = (
        "tilted_down" if eye_slope > 0.15
        else "tilted_up" if eye_slope < -0.15
        else "neutral"
    )

    # (name, raw value, decimal places) in the order they are reported.
    rounding_plan = (
        ("nose_to_eye_ratio", ratio, 4),
        ("eye_width", eye_width, 2),
        ("nose_to_eye_dist", nose_to_eye, 2),
        ("eye_slope", eye_slope, 4),
        ("eye_angle_deg", eye_angle, 2),
        ("nose_offset_x", nose_offset, 2),
        ("nose_offset_norm", nose_offset_norm, 4),
        ("mouth_symmetry", mouth_symmetry, 4),
        ("mouth_width", mouth_width, 2),
        ("jaw_visibility_hint", jaw_hint, 4),
    )
    features = {name: round(value, digits) for name, value, digits in rounding_plan}

    return {
        "angle": angle,
        "confidence": confidence,
        "pitch": pitch,
        "features": features,
        "method": method,
        "landmarks_count": len(landmarks),
    }
|
||||
|
||||
|
||||
def calculate_pose_angle_v1(landmarks: List) -> Dict:
    """
    Legacy (V1) classifier that uses only the nose-to-eye ratio.

    Kept for comparison with the V2 classifier. Thresholds: ratio < 0.4 is
    frontal, ratio < 0.6 is three_quarter; otherwise the profile side is
    picked by whether the nose x lies left of the eye-midpoint x.

    Args:
        landmarks: Points indexed as left eye (0), right eye (1), nose (2).
            Only the first two coordinates are used. At least five points
            must be supplied.

    Returns:
        Dict with angle, confidence (always 0.7), ratio (4 decimals) and
        method. With fewer than five landmarks only angle 'unknown' and
        confidence 0.0 are returned.
    """
    if len(landmarks) < 5:
        return {"angle": "unknown", "confidence": 0.0}

    eye_l, eye_r, nose_pt = (np.array(landmarks[i][:2]) for i in (0, 1, 2))

    midpoint = (eye_l + eye_r) / 2
    eye_width = np.linalg.norm(eye_r - eye_l)
    nose_to_eye = np.linalg.norm(nose_pt - midpoint)

    if eye_width > 0:
        ratio = nose_to_eye / eye_width
    else:
        ratio = 0.0

    if ratio < 0.4:
        angle = "frontal"
    elif ratio < 0.6:
        angle = "three_quarter"
    else:
        nose_is_left = nose_pt[0] < midpoint[0]
        angle = "profile_left" if nose_is_left else "profile_right"

    return {
        "angle": angle,
        "confidence": 0.7,
        "ratio": round(ratio, 4),
        "method": "v1_single_feature",
    }
|
||||
|
||||
|
||||
def compare_v1_v2(landmarks: List) -> Dict:
    """
    Run both classifiers on the same landmarks and report how they differ.

    Intended for validation and debugging.

    Args:
        landmarks: Landmark list passed unchanged to both classifiers.

    Returns:
        Dict with the raw "v1" and "v2" results, "agreement" (True when
        both report the same angle) and "confidence_improvement" (V2
        confidence minus V1 confidence).
    """
    legacy = calculate_pose_angle_v1(landmarks)
    current = calculate_pose_angle_v2(landmarks)

    same_angle = legacy["angle"] == current["angle"]
    confidence_gain = current["confidence"] - legacy["confidence"]

    return {
        "v1": legacy,
        "v2": current,
        "agreement": same_angle,
        "confidence_improvement": confidence_gain,
    }
|
||||
|
||||
|
||||
def batch_classify_angles(face_json_path: str) -> Dict:
    """
    Classify every face with at least five landmarks in a face JSON file.

    The file is expected to hold a "frames" mapping whose values carry a
    "faces" list; each face may carry a "landmarks" list. Faces with
    missing or too few landmarks are skipped.

    Args:
        face_json_path: Path to the JSON file to read.

    Returns:
        Dict with total_faces, angle_distribution (label -> count),
        confidence_avg / confidence_min / confidence_max (0.0 when no face
        was classified) and results (one V2 result per face, extended with
        "frame" and "face_index").
    """
    import json

    with open(face_json_path) as handle:
        payload = json.load(handle)

    results = []
    angle_counts = {}

    for frame_key, frame_data in payload.get("frames", {}).items():
        for face_idx, face in enumerate(frame_data.get("faces", [])):
            landmarks = face.get("landmarks", [])

            if not landmarks or len(landmarks) < 5:
                continue

            pose_result = calculate_pose_angle_v2(landmarks)
            pose_result["frame"] = frame_key
            pose_result["face_index"] = face_idx
            results.append(pose_result)

            label = pose_result["angle"]
            angle_counts[label] = angle_counts.get(label, 0) + 1

    confidences = [entry["confidence"] for entry in results]
    if confidences:
        conf_avg = np.mean(confidences)
        conf_min = np.min(confidences)
        conf_max = np.max(confidences)
    else:
        conf_avg = conf_min = conf_max = 0.0

    return {
        "total_faces": len(results),
        "angle_distribution": angle_counts,
        "confidence_avg": conf_avg,
        "confidence_min": conf_min,
        "confidence_max": conf_max,
        "results": results,
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line use: "--test" prints V2 results for three built-in
    # landmark sets; "--face-json PATH" prints batch statistics for a file.
    import argparse

    cli = argparse.ArgumentParser(description="Pose Analyzer")
    cli.add_argument("--face-json", help="Path to face.json for batch analysis")
    cli.add_argument("--test", action="store_true", help="Run unit tests")
    args = cli.parse_args()

    banner = "=" * 60

    if args.test:
        print(banner)
        print("Pose Analyzer Unit Tests")
        print(banner)

        # Same eyes and mouth each time; only the nose x moves
        # (centered, right of the eye midpoint, left of it).
        sample_landmarks = [
            [[100, 100], [120, 100], [110, 120], [105, 130], [115, 130]],
            [[100, 100], [120, 100], [125, 120], [105, 130], [115, 130]],
            [[100, 100], [120, 100], [95, 120], [105, 130], [115, 130]],
        ]

        for number, sample in enumerate(sample_landmarks, start=1):
            result = calculate_pose_angle_v2(sample)
            print(f"\nTest {number}: {result['angle']} (confidence: {result['confidence']:.2f})")
            print(f" Features: {result['features']}")

    elif args.face_json:
        print(banner)
        print("Batch Pose Analysis")
        print(banner)

        batch_result = batch_classify_angles(args.face_json)

        print(f"\nTotal faces: {batch_result['total_faces']}")
        print(f"Angle distribution: {batch_result['angle_distribution']}")
        print(f"Confidence: avg={batch_result['confidence_avg']:.2f}, min={batch_result['confidence_min']:.2f}, max={batch_result['confidence_max']:.2f}")
    else:
        print("Please provide --face-json or --test")
|
||||
239
scripts/utils/pose_transition_analyzer.py
Normal file
239
scripts/utils/pose_transition_analyzer.py
Normal file
@@ -0,0 +1,239 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Pose Transition Analyzer - Analyze pose changes within traces
|
||||
|
||||
Purpose:
|
||||
1. Visualize pose transitions over time
|
||||
2. Calculate transition frequency and duration
|
||||
3. Identify pose stability patterns
|
||||
|
||||
Output:
|
||||
1. Pose transition timeline
|
||||
2. Pose duration statistics
|
||||
3. Stability score per trace
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from typing import Dict, List
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def _split_pose_segments(pose_trace: List[Dict]) -> List[Dict]:
    """Group consecutive pose_trace samples that share an angle into segments."""
    segments = []
    seg_start = 0  # list index (not frame number) where the open segment begins

    for idx in range(1, len(pose_trace) + 1):
        at_end = idx == len(pose_trace)
        if at_end or pose_trace[idx]["angle"] != pose_trace[seg_start]["angle"]:
            # Slice by list position: frame numbers may be sparse or start
            # anywhere, so they cannot be used as indices.
            samples = pose_trace[seg_start:idx]
            first_frame = samples[0]["frame"]
            last_frame = samples[-1]["frame"]
            segments.append({
                "angle": samples[0]["angle"],
                "start_frame": first_frame,
                "end_frame": last_frame,
                "duration_frames": last_frame - first_frame + 1,
                "avg_confidence": float(np.mean([s["confidence"] for s in samples])),
            })
            seg_start = idx

    return segments


def analyze_pose_transitions(face_data: Dict) -> Dict:
    """
    Analyze pose transitions for all traces.

    Traces with fewer than two pose samples are skipped.

    Args:
        face_data: Dict with a "traces" mapping of trace id (string that
            parses as int) to a trace dict. Each trace is read for
            "pose_trace" (samples with "frame", "angle", "confidence"),
            "pose_transitions" and "duration_seconds".

    Returns:
        Dict keyed by integer trace id. Each value holds trace_id,
        total_transitions, transition_frequency (per second, 3 decimals),
        stability_score (0-1, higher = more stable), pose_segments,
        pose_avg_duration (angle -> mean segment length in frames),
        longest_stable_pose and transition_events. Empty when there are no
        traces.

    Raises:
        KeyError: If an analyzed trace has no "duration_seconds".
    """
    traces = face_data.get("traces", {})

    if not traces:
        return {}

    analysis = {}

    for trace_id_str, trace in traces.items():
        trace_id = int(trace_id_str)
        pose_trace = trace.get("pose_trace", [])
        transitions = trace.get("pose_transitions", [])

        if len(pose_trace) < 2:
            continue

        pose_segments = _split_pose_segments(pose_trace)

        duration_seconds = trace["duration_seconds"]
        transition_frequency = len(transitions) / duration_seconds if duration_seconds > 0 else 0

        # Inverse of transition frequency: 2 transitions/second = fully unstable.
        stability_score = 1.0 - min(transition_frequency / 2.0, 1.0)

        # Mean segment length per angle, in first-appearance order.
        durations_by_angle = defaultdict(list)
        for seg in pose_segments:
            durations_by_angle[seg["angle"]].append(seg["duration_frames"])
        pose_avg_duration = {
            angle: round(float(np.mean(durations)), 1)
            for angle, durations in durations_by_angle.items()
        }

        analysis[trace_id] = {
            "trace_id": trace_id,
            "total_transitions": len(transitions),
            "transition_frequency": round(transition_frequency, 3),  # transitions per second
            "stability_score": round(stability_score, 3),  # 0-1, higher = more stable
            "pose_segments": pose_segments,
            "pose_avg_duration": pose_avg_duration,
            "longest_stable_pose": max(pose_segments, key=lambda seg: seg["duration_frames"]),
            "transition_events": transitions,
        }

    return analysis
|
||||
|
||||
|
||||
def visualize_pose_transitions(face_data: Dict, output_path: str = None) -> None:
    """
    Plot a colored pose timeline with a confidence curve for every trace.

    Traces are drawn longest first (by "duration_frames"). Each span
    between two consecutive samples is filled with the color of the
    earlier sample's angle; transitions are marked with dashed vertical
    lines and a "from→to" label.

    Args:
        face_data: Dict with a "traces" mapping. Each trace is read for
            "trace_id", "duration_frames", "start_frame", "end_frame",
            "pose_trace" and "pose_transitions".
        output_path: When given, the figure is saved there as an image;
            otherwise it is shown interactively.

    Returns:
        None. Prints "No traces found" and returns early when there are no
        traces.
    """
    traces = face_data.get("traces", {})

    if not traces:
        print("No traces found")
        return

    sorted_traces = sorted(traces.values(), key=lambda x: x["duration_frames"], reverse=True)

    fig, axes = plt.subplots(len(sorted_traces), 1, figsize=(16, 4 * len(sorted_traces)))

    # With a single row plt.subplots returns a bare Axes, not a sequence.
    if len(sorted_traces) == 1:
        axes = [axes]

    pose_colors = {
        "frontal": "green",
        "three_quarter": "blue",
        "profile_left": "orange",
        "profile_right": "red",
        "unknown": "gray",
    }

    try:
        for ax, trace in zip(axes, sorted_traces):
            trace_id = trace["trace_id"]
            pose_trace = trace.get("pose_trace", [])

            if not pose_trace:
                continue

            frames = [p["frame"] for p in pose_trace]
            angles = [p["angle"] for p in pose_trace]
            confidences = [p["confidence"] for p in pose_trace]

            # Fill each inter-sample span with the earlier sample's pose color.
            for i in range(len(frames) - 1):
                color = pose_colors.get(angles[i], "gray")
                ax.fill_between(
                    [frames[i], frames[i + 1]],
                    [0, 0],
                    [1, 1],
                    color=color,
                    alpha=0.6,
                )

            # Mark transitions
            transitions = trace.get("pose_transitions", [])
            for t in transitions:
                ax.axvline(x=t["frame"], color="black", linestyle="--", alpha=0.5, linewidth=1)
                ax.text(t["frame"], 1.05, f"{t['from_angle']}→{t['to_angle']}",
                        fontsize=8, rotation=90, va="bottom", ha="center")

            # Confidence goes on a secondary y axis over the same frames.
            ax2 = ax.twinx()
            ax2.plot(frames, confidences, color="purple", linewidth=1, alpha=0.7, label="Confidence")
            ax2.set_ylabel("Confidence", color="purple")
            ax2.set_ylim(0, 1)

            ax.set_xlabel("Frame Number")
            ax.set_ylabel("Pose Angle")
            ax.set_title(f"Trace {trace_id} Pose Timeline (Frames {trace['start_frame']}-{trace['end_frame']})")
            ax.set_ylim(0, 1.2)

            # Legend lists each pose once, in order of first appearance, so
            # the output is the same from run to run.
            legend_elements = [
                plt.Rectangle((0, 0), 1, 1, fc=pose_colors.get(pose, "gray"), alpha=0.6, label=pose)
                for pose in dict.fromkeys(angles)
            ]
            ax.legend(handles=legend_elements, loc="upper right", fontsize=8)

        plt.tight_layout()

        if output_path:
            plt.savefig(output_path, dpi=150, bbox_inches="tight")
            print(f"\n✅ Visualization saved to: {output_path}")
        else:
            plt.show()
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it so repeated calls do not accumulate figures.
        plt.close(fig)
|
||||
|
||||
|
||||
def print_transition_analysis(analysis: Dict) -> None:
    """
    Print the transition analysis for every trace to stdout.

    Traces are printed in sorted key order. For each one the totals, the
    longest stable pose, the average duration per pose and the first five
    pose segments are shown.

    Args:
        analysis: Mapping of trace id to an analysis dict with
            total_transitions, transition_frequency, stability_score,
            longest_stable_pose, pose_avg_duration and pose_segments.
    """
    banner = "=" * 60

    print("\n" + banner)
    print("Pose Transition Analysis")
    print(banner)

    for trace_id in sorted(analysis):
        data = analysis[trace_id]

        print(f"\n=== Trace {trace_id} ===")
        print(f"Total Transitions: {data['total_transitions']}")
        print(f"Transition Frequency: {data['transition_frequency']} transitions/second")
        print(f"Stability Score: {data['stability_score']} (0-1, higher = more stable)")

        longest = data["longest_stable_pose"]
        print(f"Longest Stable Pose: {longest['angle']} ({longest['duration_frames']} frames)")

        print("\nPose Average Duration:")
        for angle, avg_dur in data["pose_avg_duration"].items():
            print(f" {angle}: {avg_dur} frames")

        # Header text means "N segments in total"; only five are listed.
        segments = data["pose_segments"]
        print(f"\nPose Segments (共 {len(segments)} 个):")
        for seg in segments[:5]:
            print(f" {seg['angle']}: frames {seg['start_frame']}-{seg['end_frame']} ({seg['duration_frames']} frames, confidence: {seg['avg_confidence']:.3f})")
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point for the pose transition analyzer.

    Loads the JSON given by --face-json, prints the transition analysis,
    and optionally writes the analysis as JSON (--output-json) and a
    timeline plot (--output-plot).
    """
    cli = argparse.ArgumentParser(description="Analyze pose transitions in face traces")
    cli.add_argument("--face-json", required=True, help="Path to face_traced.json")
    cli.add_argument("--output-plot", help="Output plot path (PNG)")
    cli.add_argument("--output-json", help="Output analysis JSON path")
    options = cli.parse_args()

    # The input is loaded before the banner is printed.
    with open(options.face_json) as source:
        face_data = json.load(source)

    banner = "=" * 60
    print(banner)
    print("Pose Transition Analyzer")
    print(banner)

    analysis = analyze_pose_transitions(face_data)
    print_transition_analysis(analysis)

    if options.output_json:
        with open(options.output_json, "w") as sink:
            json.dump(analysis, sink, indent=2)
        print(f"\n✅ Analysis saved to: {options.output_json}")

    if options.output_plot:
        visualize_pose_transitions(face_data, options.output_plot)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
377
scripts/utils/test_mediapipe.py
Normal file
377
scripts/utils/test_mediapipe.py
Normal file
@@ -0,0 +1,377 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
MediaPipe Test Script - Test all MediaPipe modules
|
||||
|
||||
Test modules:
|
||||
1. Face Mesh (468 keypoints)
|
||||
2. Pose (33 keypoints)
|
||||
3. Hands (21 keypoints per hand)
|
||||
4. Holistic (Face + Pose + Hands)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import cv2
|
||||
import numpy as np
|
||||
import mediapipe as mp
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def test_face_mesh():
    """
    Smoke-test MediaPipe Face Mesh on one sample frame.

    Creates a FaceMesh model with iris refinement enabled, runs it on a
    hard-coded sample image if that file exists, and prints a few key
    landmarks plus an eye aspect ratio (EAR) and a mouth aspect ratio
    (MAR). Nothing is asserted; results are only printed.
    """
    print("=" * 60)
    print("Testing MediaPipe Face Mesh")
    print("=" * 60)

    mp_face_mesh = mp.solutions.face_mesh

    # Create Face Mesh model
    face_mesh = mp_face_mesh.FaceMesh(
        static_image_mode=True,
        max_num_faces=1,
        refine_landmarks=True,  # Enable iris detection
        min_detection_confidence=0.5,
    )

    print("✅ Face Mesh model created")

    # Test on sample image
    test_image_path = "/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"

    # try/finally so the model is released even if processing raises.
    try:
        image = cv2.imread(test_image_path) if Path(test_image_path).exists() else None

        if image is None:
            # Either the file is missing or OpenCV could not decode it.
            print(f"⚠️ Test image not available: {test_image_path}")
        else:
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            results = face_mesh.process(image_rgb)

            if results.multi_face_landmarks:
                face_landmarks = results.multi_face_landmarks[0]
                points = face_landmarks.landmark
                num_landmarks = len(points)

                print(f"✅ Face detected: {num_landmarks} landmarks")

                # Key landmark indices
                key_indices = {
                    "nose_tip": 1,
                    "left_eye_center": 33,
                    "right_eye_center": 263,
                    "left_iris_center": 468,
                    "right_iris_center": 473,
                    "mouth_top": 13,
                    "mouth_bottom": 14,
                    "mouth_left": 61,
                    "mouth_right": 291,
                }

                print("\nKey landmarks:")
                for name, idx in key_indices.items():
                    # Iris indices only exist when refinement added them.
                    if idx < num_landmarks:
                        landmark = points[idx]
                        print(f"  {name} ({idx}): x={landmark.x:.3f}, y={landmark.y:.3f}")

                # Eye Aspect Ratio (EAR) = eyelid opening / eye width.
                # In the Face Mesh topology 33 and 133 are the two corners
                # of this eye and 159 / 145 are its upper / lower eyelid,
                # so the lids give the vertical distance and the corners
                # the horizontal one.
                corner_a = points[33]
                corner_b = points[133]
                upper_lid = points[159]
                lower_lid = points[145]

                vertical_dist = abs(lower_lid.y - upper_lid.y)
                horizontal_dist = abs(corner_b.x - corner_a.x)
                ear_left = vertical_dist / horizontal_dist if horizontal_dist > 0 else 0

                print(f"\nEye Aspect Ratio (EAR):")
                print(f"  Left eye EAR: {ear_left:.3f}")
                print(f"  Interpretation: {'wide_open' if ear_left > 0.35 else 'normal' if ear_left > 0.2 else 'closed'}")

                # Mouth Aspect Ratio (MAR) = lip opening / mouth width.
                mouth_top = points[13]
                mouth_bottom = points[14]
                mouth_left = points[61]
                mouth_right = points[291]

                mouth_height = abs(mouth_bottom.y - mouth_top.y)
                mouth_width = abs(mouth_right.x - mouth_left.x)
                mar = mouth_height / mouth_width if mouth_width > 0 else 0

                print(f"\nMouth Aspect Ratio (MAR):")
                print(f"  MAR: {mar:.3f}")
                print(f"  Interpretation: {'open' if mar > 0.5 else 'closed' if mar < 0.2 else 'slightly_open'}")
            else:
                print("❌ No face detected")
    finally:
        face_mesh.close()

    print("\n✅ Face Mesh test completed")
|
||||
|
||||
|
||||
def _pose_joint_angle(p1, p2, p3):
    """Return the 2D angle in degrees at ``p2`` formed by ``p1``-``p2``-``p3``.

    Only the ``x`` and ``y`` attributes of each point are used.

    Returns:
        The angle as a float, or ``None`` when either segment has zero
        length (the angle is undefined for coincident points).
    """
    v1 = np.array([p1.x - p2.x, p1.y - p2.y])
    v2 = np.array([p3.x - p2.x, p3.y - p2.y])
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return None
    # Rounding can push the cosine marginally outside [-1, 1], which would
    # make arccos return NaN for (nearly) collinear points.
    cosine = np.clip(np.dot(v1, v2) / denom, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def _report_pose(landmarks):
    """Print selected pose keypoints and a right-elbow / left-knee analysis."""
    print(f"✅ Pose detected: {len(landmarks)} keypoints")

    # Key keypoints
    key_indices = {
        "nose": 0,
        "left_shoulder": 11,
        "right_shoulder": 12,
        "left_elbow": 13,
        "right_elbow": 14,
        "left_wrist": 15,
        "right_wrist": 16,
        "left_hip": 23,
        "right_hip": 24,
        "left_knee": 25,
        "right_knee": 26,
        "left_ankle": 27,
        "right_ankle": 28,
    }

    print("\nKey keypoints:")
    for name, idx in key_indices.items():
        landmark = landmarks[idx]
        print(f" {name} ({idx}): x={landmark.x:.3f}, y={landmark.y:.3f}, visibility={landmark.visibility:.2f}")

    # Right arm angle
    right_shoulder = landmarks[12]
    right_elbow = landmarks[14]
    right_wrist = landmarks[16]
    right_elbow_angle = _pose_joint_angle(right_shoulder, right_elbow, right_wrist)

    if right_elbow_angle is None:
        print("\nRight elbow angle: undefined (overlapping keypoints)")
    else:
        print(f"\nRight elbow angle: {right_elbow_angle:.1f}°")
        print(f" Interpretation: {'extended' if right_elbow_angle > 150 else 'folded' if right_elbow_angle < 90 else 'neutral'}")

    # Smaller y is treated as higher up: wrist above elbow above shoulder.
    if right_wrist.y < right_elbow.y < right_shoulder.y:
        print(" Action: raise_right (arm raised)")

    # Knee angles
    left_hip = landmarks[23]
    left_knee = landmarks[25]
    left_ankle = landmarks[27]
    left_knee_angle = _pose_joint_angle(left_hip, left_knee, left_ankle)

    if left_knee_angle is None:
        print("\nLeft knee angle: undefined (overlapping keypoints)")
    else:
        print(f"\nLeft knee angle: {left_knee_angle:.1f}°")
        print(f" Interpretation: {'standing' if left_knee_angle > 160 else 'knee_bend' if left_knee_angle < 120 else 'neutral'}")


def test_pose(image_path="/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"):
    """
    Test MediaPipe Pose (33 keypoints)

    Args:
        image_path: Image to run pose detection on. Defaults to the preview
            frame this script was originally written against.
    """
    print("\n" + "=" * 60)
    print("Testing MediaPipe Pose")
    print("=" * 60)

    mp_pose = mp.solutions.pose

    # Using the model as a context manager closes it even if processing raises.
    with mp_pose.Pose(
        static_image_mode=True,
        model_complexity=2,  # Full model
        enable_segmentation=False,
        min_detection_confidence=0.5,
    ) as pose:
        print("✅ Pose model created")

        # cv2.imread signals failure by returning None rather than raising.
        image = cv2.imread(str(image_path)) if Path(image_path).exists() else None

        if image is None:
            print(f"⚠️ Test image missing or unreadable: {image_path}")
        else:
            results = pose.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            if results.pose_landmarks:
                _report_pose(results.pose_landmarks.landmark)
            else:
                print("❌ No pose detected")

    print("\n✅ Pose test completed")
|
||||
|
||||
|
||||
def _classify_gesture(extensions):
    """Map per-finger extension flags to a coarse gesture label.

    Args:
        extensions: Five booleans ordered thumb, index, middle, ring, pinky.

    Returns:
        One of "open_hand", "fist", "thumbs_up", "peace_sign", "pointing"
        or "unknown".
    """
    thumb, index, middle, ring, pinky = extensions

    if all(extensions):
        return "open_hand"
    if not any(extensions):
        return "fist"
    if thumb and not any(extensions[1:]):
        return "thumbs_up"
    # Only ring and pinky must be folded here. The previous check also
    # required the middle finger to be folded, so it could never match.
    if index and middle and not (ring or pinky):
        return "peace_sign"
    if index and not (thumb or middle or ring or pinky):
        return "pointing"
    return "unknown"


def _report_hand(number, hand_label, landmarks):
    """Print key landmarks, finger extension flags and the gesture for one hand."""
    print(f"\n✅ Hand {number} detected ({hand_label}): 21 keypoints")

    # Key landmarks
    key_indices = {
        "wrist": 0,
        "thumb_tip": 4,
        "index_tip": 8,
        "middle_tip": 12,
        "ring_tip": 16,
        "pinky_tip": 20,
    }

    print(" Key landmarks:")
    for name, i in key_indices.items():
        lm = landmarks[i]
        print(f" {name} ({i}): x={lm.x:.3f}, y={lm.y:.3f}")

    # (tip, base) landmark index pairs, ordered thumb -> pinky.
    finger_joints = ((4, 2), (8, 5), (12, 9), (16, 13), (20, 17))
    # A finger counts as extended when its tip has a smaller y than its base,
    # i.e. it points upward in the image. NOTE(review): this heuristic
    # presumably only holds for roughly upright hands.
    extensions = [landmarks[tip].y < landmarks[base].y for tip, base in finger_joints]

    print(f"\n Finger extensions: {['thumb', 'index', 'middle', 'ring', 'pinky']}")
    print(f" {extensions}")
    print(f" Detected gesture: {_classify_gesture(extensions)}")


def test_hands(image_path="/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"):
    """
    Test MediaPipe Hands (21 keypoints per hand)

    Args:
        image_path: Image to run hand detection on. Defaults to the preview
            frame this script was originally written against.
    """
    print("\n" + "=" * 60)
    print("Testing MediaPipe Hands")
    print("=" * 60)

    mp_hands = mp.solutions.hands

    # Using the model as a context manager closes it even if processing raises.
    with mp_hands.Hands(
        static_image_mode=True,
        max_num_hands=2,
        min_detection_confidence=0.5,
    ) as hands:
        print("✅ Hands model created")

        # cv2.imread signals failure by returning None rather than raising.
        image = cv2.imread(str(image_path)) if Path(image_path).exists() else None

        if image is None:
            print(f"⚠️ Test image missing or unreadable: {image_path}")
        else:
            results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

            if results.multi_hand_landmarks:
                # Landmarks and handedness entries are paired by position.
                detections = zip(results.multi_hand_landmarks, results.multi_handedness)
                for number, (hand_landmarks, handedness) in enumerate(detections, start=1):
                    hand_label = handedness.classification[0].label
                    _report_hand(number, hand_label, hand_landmarks.landmark)
            else:
                print("❌ No hands detected")

    print("\n✅ Hands test completed")
|
||||
|
||||
|
||||
def test_holistic(image_path="/Users/accusys/momentry_core_0.1/output/quick_preview/frame_220.jpg"):
    """
    Test MediaPipe Holistic (Face + Pose + Hands combined)

    Args:
        image_path: Image to run holistic detection on. Defaults to the
            preview frame this script was originally written against.
    """
    print("\n" + "=" * 60)
    print("Testing MediaPipe Holistic")
    print("=" * 60)

    mp_holistic = mp.solutions.holistic

    # Using the model as a context manager closes it even if processing raises.
    with mp_holistic.Holistic(
        static_image_mode=True,
        model_complexity=2,
        enable_segmentation=False,
        refine_face_landmarks=True,
    ) as holistic:
        print("✅ Holistic model created")

        # cv2.imread signals failure by returning None rather than raising.
        image = cv2.imread(str(image_path)) if Path(image_path).exists() else None

        if image is None:
            print(f"⚠️ Test image missing or unreadable: {image_path}")
        else:
            results = holistic.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

            # (label, landmark list or None, unit used in the report line)
            components = (
                ("Face", results.face_landmarks, "landmarks"),
                ("Pose", results.pose_landmarks, "keypoints"),
                ("Left hand", results.left_hand_landmarks, "keypoints"),
                ("Right hand", results.right_hand_landmarks, "keypoints"),
            )

            detected_count = 0
            for label, component, unit in components:
                if component:
                    print(f"✅ {label}: {len(component.landmark)} {unit}")
                    detected_count += 1

            if detected_count == 0:
                print("❌ No landmarks detected")
            else:
                print(f"\nTotal detections: {detected_count} components")

    print("\n✅ Holistic test completed")
|
||||
|
||||
|
||||
def main():
    """Run every MediaPipe smoke test in sequence and print a summary."""
    banner = "=" * 70

    print(banner)
    print("MediaPipe Installation Test")
    print(banner)

    print(f"\nMediaPipe version: {mp.__version__}")
    print()

    # Test all modules, in the same order as the summary below.
    for run_test in (test_face_mesh, test_pose, test_hands, test_holistic):
        run_test()

    print("\n" + banner)
    print("✅ All MediaPipe tests completed!")
    print(banner)

    print("\nNext steps:")
    next_steps = (
        " 1. Face Mesh: Use for eye/mouth action detection",
        " 2. Pose: Use for arm/leg/feet action detection",
        " 3. Hands: Use for hand gesture detection",
        " 4. Holistic: Use for full-body action detection",
    )
    for step in next_steps:
        print(step)


# Run the smoke tests only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user