feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/find_kids_pose.py
+++ b/scripts/find_kids_pose.py
@@ -0,0 +1,169 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Find "Kids" in pose data based on Head-to-Body Ratio.
+Heuristic: Kids have a larger head relative to their body height (approx 1:5 or 1:6) compared to adults (approx 1:7.5).
+"""
+
+import json
+import math
+import sys
+
+# Configuration
+POSE_JSON_PATH = "output/384b0ff44aaaa1f1/384b0ff44aaaa1f1.pose.json"
+# Heuristic threshold on the ratio (estimated total height / estimated head height).
+# Working assumption of this script: kids typically fall below ~6.0, adults above ~6.5.
+# Detections whose ratio is strictly below this value are reported as potential kids
+# (a smaller ratio is "more kid-like" relative to head size).
+BODY_TO_HEAD_RATIO_THRESHOLD = 5.8 
+
+def distance(p1, p2):
+    """Return the Euclidean distance between two points.
+
+    Both points are mappings that expose numeric 'x' and 'y' entries.
+    """
+    dx = p1['x'] - p2['x']
+    dy = p1['y'] - p2['y']
+    return math.sqrt(dx ** 2 + dy ** 2)
+
+def get_midpoint(p1, p2):
+    """Return the point halfway between p1 and p2 as a new {'x': ..., 'y': ...} dict."""
+    mid_x = (p1['x'] + p2['x']) / 2
+    mid_y = (p1['y'] + p2['y']) / 2
+    return {'x': mid_x, 'y': mid_y}
+
+# Empirical factor turning the nose -> shoulder distance into a full head height estimate.
+_HEAD_HEIGHT_FACTOR = 1.8
+# Head estimates smaller than this are treated as too small/noisy to trust.
+# NOTE(review): presumably in pixels, like the keypoint coordinates -- confirm.
+_MIN_HEAD_HEIGHT = 10
+
+
+def _reference_shoulder(kp_dict):
+    """Return the shoulder reference point for a pose, or a falsy value if none exists.
+
+    Uses the midpoint of both shoulders when both are present, otherwise
+    whichever single shoulder keypoint is available.
+    """
+    left = kp_dict.get('left_shoulder')
+    right = kp_dict.get('right_shoulder')
+    if left and right:
+        return get_midpoint(left, right)
+    return left or right
+
+
+def _height_to_head_ratio(nose, shoulder_ref, l_ankle, r_ankle):
+    """Return estimated total height / estimated head height, or None.
+
+    None is returned when the head estimate is too small to trust, when either
+    ankle is missing (sitting/crouching/cropped people are skipped on purpose,
+    only full-body poses are judged), or when the resulting height is not
+    larger than the head estimate (degenerate pose or odd camera angle).
+    """
+    head_height_est = distance(nose, shoulder_ref) * _HEAD_HEIGHT_FACTOR
+    if head_height_est < _MIN_HEAD_HEIGHT:
+        return None
+
+    if not (l_ankle and r_ankle):
+        return None
+
+    # The nose sits inside the head, so add half a head on top of the
+    # nose -> ankle distance to approximate the full standing height.
+    mid_ankle = get_midpoint(l_ankle, r_ankle)
+    total_height = distance(nose, mid_ankle) + (head_height_est / 2)
+
+    # head_height_est >= _MIN_HEAD_HEIGHT here, so the division is safe.
+    if total_height <= head_height_est:
+        return None
+    return total_height / head_height_est
+
+
+def find_kids(pose_json_path=POSE_JSON_PATH,
+              ratio_threshold=BODY_TO_HEAD_RATIO_THRESHOLD):
+    """Scan a pose JSON file and print detections that look like children.
+
+    The file is expected to hold a top-level "frames" mapping of
+    frame index -> {"timestamp": ..., "poses": [...]}, where each pose has a
+    "keypoints" list of dicts carrying 'name', 'x' and 'y'. For every pose
+    with a nose, at least one shoulder and both ankles, a height-to-head
+    ratio is estimated; poses whose ratio is below ``ratio_threshold`` are
+    reported.
+
+    Args:
+        pose_json_path: Path of the pose JSON file to read. Defaults to
+            POSE_JSON_PATH.
+        ratio_threshold: Ratios strictly below this value are flagged.
+            Defaults to BODY_TO_HEAD_RATIO_THRESHOLD.
+
+    Returns:
+        None. Results are printed to stdout; a load failure is reported on
+        stderr and the function returns early.
+    """
+    try:
+        with open(pose_json_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except (OSError, ValueError) as e:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        print(f"Error loading JSON: {e}", file=sys.stderr)
+        return
+
+    frames = data.get("frames", {})
+    potential_kids = []
+
+    # Counters reported at the end for debugging.
+    total_poses = 0
+    analyzed_poses = 0
+
+    for frame_idx_str, frame_data in frames.items():
+        timestamp = frame_data.get("timestamp", 0)
+
+        for person in frame_data.get("poses", []):
+            total_poses += 1
+
+            # Index keypoints by name; entries without a name are ignored
+            # instead of aborting the whole run.
+            # NOTE(review): a keypoint merely being present is treated as
+            # "visible"; confirm whether a confidence filter is needed.
+            kp_dict = {
+                kp['name']: kp
+                for kp in person.get("keypoints", [])
+                if 'name' in kp
+            }
+
+            nose = kp_dict.get('nose')
+            if not nose:
+                continue
+            shoulder_ref = _reference_shoulder(kp_dict)
+            if not shoulder_ref:
+                continue
+
+            # Counted as analyzed once nose and a shoulder are available,
+            # even if the pose is later rejected (tiny head, missing ankles).
+            analyzed_poses += 1
+
+            ratio = _height_to_head_ratio(
+                nose,
+                shoulder_ref,
+                kp_dict.get('left_ankle'),
+                kp_dict.get('right_ankle'),
+            )
+            if ratio is not None and ratio < ratio_threshold:
+                potential_kids.append({
+                    "frame": frame_idx_str,
+                    "timestamp": timestamp,
+                    "ratio": round(ratio, 2),
+                    "person_id": person.get("person_id", "?"),
+                })
+
+    print(f"Analyzed {analyzed_poses} poses out of {total_poses} total detections.")
+    print(f"Found {len(potential_kids)} potential 'kids' (Ratio < {ratio_threshold}).")
+
+    # Collapse detections that fall into the same 0.1s bucket, keeping the
+    # first one seen, so consecutive frames do not flood the listing.
+    unique_kids = {}
+    for k in potential_kids:
+        unique_kids.setdefault(round(k['timestamp'], 1), k)
+
+    sorted_kids = sorted(unique_kids.values(), key=lambda x: x['timestamp'])
+
+    print("\nUnique potential kid detections (timestamps):")
+    for k in sorted_kids:
+        print(f"  -> Timestamp: {k['timestamp']:.2f}s | Ratio: {k['ratio']}")
+
+# Run the analysis with the module-level defaults when executed as a script.
+if __name__ == "__main__":
+    find_kids()