fix: ASRX duplication, TKG edges, trace ingest, and add pipeline progress publishing

- ASRX handler no longer stores duplicate 'asr' pre_chunks - Pre_chunks storage made idempotent (delete-before-insert) - Rule 1 + trace_ingest changed to query 'asrx' not 'asr' - Trace chunks removed (dynamic from TKG/Qdrant) - TKG scroll_face_points fixed: trace_id >= 1 (not == 1) - TKG AsrxSegmentEntry: start/end -> start_time/end_time (match ASRX JSON) - Unregister error handling: log instead of silent discard - Add publish_pipeline_progress calls at each pipeline stage (processors, rule1, face_trace, identity_agent, TKG, rule2, completion)
2026-07-02 10:43:46 +08:00
parent d791d138f2
commit 3eabd45882
65 changed files with 9481 additions and 3856 deletions
--- a/scripts/pose_processor.py
+++ b/scripts/pose_processor.py
@@ -33,7 +33,54 @@ def process_pose(
    uuid: str = "",
    sample_interval: int = 3,  # Changed from 30 to match Face
    publisher: RedisPublisher = None,
+    target_frames: list = None,
 ) -> dict:
+    # Check if pose.json or pose.json.tmp already exists (from swift_face_pose)
+    # executor.rs renames output to .json.tmp before running Python script
+    tmp_path = output_path.replace('.json', '.json.tmp')
+    
+    source_path = None
+    if os.path.exists(output_path):
+        source_path = output_path
+        print(f"[Pose] Output exists from swift_face_pose: {output_path}", file=sys.stderr)
+    elif os.path.exists(tmp_path):
+        source_path = tmp_path
+        print(f"[Pose] Temp output exists from swift_face_pose: {tmp_path}", file=sys.stderr)
+    
+    if source_path:
+        with open(source_path) as f:
+            data = json.load(f)
+        
+        detected_frames = len(data.get('frames', []))
+        print(f"[Pose] Loaded {detected_frames} detected frames", file=sys.stderr)
+        
+        # When target_frames is provided (8Hz sampling), skip interpolation
+        # Swift already outputs at sample_interval=3, matching 8Hz for 24fps
+        if target_frames is not None:
+            print(f"[Pose] 8Hz mode: returning {detected_frames} frames without interpolation", file=sys.stderr)
+            if publisher:
+                publisher.progress("pose", 100, 100, f"{detected_frames} frames (8Hz, no interpolation)")
+            return data
+        
+        # Interpolate keypoints for all frames
+        interpolated_data = interpolate_pose(data, video_path)
+        
+        # Write interpolated output
+        with open(output_path, 'w') as f:
+            json.dump(interpolated_data, f)
+        
+        # Delete .json.tmp file so executor.rs won't restore it
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+            print(f"[Pose] Deleted temp file: {tmp_path}", file=sys.stderr)
+        
+        total_frames = len(interpolated_data.get('frames', []))
+        print(f"[Pose] Interpolated to {total_frames} frames", file=sys.stderr)
+        
+        if publisher:
+            publisher.progress("pose", 100, 100, f"Interpolated {total_frames} frames")
+        return interpolated_data
+
    swift_bin = SWIFT_POSE_PATH
    if not os.path.exists(swift_bin):
        swift_bin = SWIFT_POSE_ALT
@@ -81,6 +128,126 @@ def process_pose(
        return json.load(f)


+def interpolate_pose(detected_data: dict, video_path: str) -> dict:
+    """Interpolate keypoints for all frames between detected frames"""
+    import cv2
+    import numpy as np
+    
+    cap = cv2.VideoCapture(video_path)
+    total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = detected_data.get('fps', 30.0)
+    
+    detected_frames = detected_data.get('frames', [])
+    if not detected_frames:
+        cap.release()
+        return detected_data
+    
+    # Build frame index map
+    frame_map = {f['frame']: f for f in detected_frames}
+    detected_frame_nums = sorted(frame_map.keys())
+    
+    print(f"[Pose] Interpolating from {len(detected_frame_nums)} detected frames to {total_video_frames} total frames", file=sys.stderr)
+    
+    # Get all persons from detected frames (assume same person tracking)
+    all_persons = {}
+    for f in detected_frames:
+        for i, p in enumerate(f.get('persons', [])):
+            if i not in all_persons:
+                all_persons[i] = []
+            all_persons[i].append((f['frame'], p))
+    
+    # Interpolate each person's keypoints for each frame
+    interpolated_frames = []
+    
+    for frame_num in range(total_video_frames):
+        ts = frame_num / fps
+        
+        persons_in_frame = []
+        
+        for person_id, person_frames in all_persons.items():
+            # Find closest detected frames before and after
+            before = None
+            after = None
+            for fn, p in person_frames:
+                if fn <= frame_num:
+                    before = (fn, p)
+                if fn >= frame_num and after is None:
+                    after = (fn, p)
+            
+            if before is None and after is None:
+                continue
+            
+            # Interpolate keypoints
+            interpolated_keypoints = []
+            bbox = None
+            
+            if before and after and before[0] != after[0]:
+                # Linear interpolation
+                t0, t1 = before[0], after[0]
+                t = (frame_num - t0) / (t1 - t0) if t1 != t0 else 0
+                
+                kp_before = before[1].get('keypoints', [])
+                kp_after = after[1].get('keypoints', [])
+                bbox_before = before[1].get('bbox', {})
+                bbox_after = after[1].get('bbox', {})
+                
+                # Interpolate keypoints
+                for i in range(max(len(kp_before), len(kp_after))):
+                    kp0 = kp_before[i] if i < len(kp_before) else kp_after[i]
+                    kp1 = kp_after[i] if i < len(kp_after) else kp_before[i]
+                    
+                    x = kp0['x'] + t * (kp1['x'] - kp0['x'])
+                    y = kp0['y'] + t * (kp1['y'] - kp0['y'])
+                    c = kp0['confidence'] + t * (kp1['confidence'] - kp0['confidence'])
+                    
+                    interpolated_keypoints.append({
+                        'name': kp0['name'],
+                        'x': x,
+                        'y': y,
+                        'confidence': c
+                    })
+                
+                # Interpolate bbox
+                if bbox_before and bbox_after:
+                    bbox = {
+                        'x': int(bbox_before['x'] + t * (bbox_after['x'] - bbox_before['x'])),
+                        'y': int(bbox_before['y'] + t * (bbox_after['y'] - bbox_before['y'])),
+                        'width': int(bbox_before['width'] + t * (bbox_after['width'] - bbox_before['width'])),
+                        'height': int(bbox_before['height'] + t * (bbox_after['height'] - bbox_before['height']))
+                    }
+            
+            elif before:
+                # Use before frame's data
+                interpolated_keypoints = before[1].get('keypoints', [])
+                bbox = before[1].get('bbox', {})
+            
+            elif after:
+                # Use after frame's data
+                interpolated_keypoints = after[1].get('keypoints', [])
+                bbox = after[1].get('bbox', {})
+            
+            if bbox and bbox.get('width', 0) > 0 and bbox.get('height', 0) > 0:
+                persons_in_frame.append({
+                    'keypoints': interpolated_keypoints,
+                    'bbox': bbox
+                })
+        
+        if persons_in_frame:
+            interpolated_frames.append({
+                'frame': frame_num,
+                'timestamp': ts,
+                'persons': persons_in_frame
+            })
+    
+    cap.release()
+    
+    return {
+        'frame_count': len(interpolated_frames),
+        'fps': fps,
+        'frames': interpolated_frames
+    }
+
+
 def _fallback(video_path, output_path, uuid, sample_interval):
    """Fallback to YOLOv8 Pose"""
    from ultralytics import YOLO
@@ -135,14 +302,21 @@ if __name__ == "__main__":
    parser.add_argument("output_path")
    parser.add_argument("--uuid", "-u", default="")
    parser.add_argument("--sample-interval", type=int, default=3)  # Changed from 30 to match Face
+    parser.add_argument("--frames", type=str, default=None,
+                        help="Comma-separated frame numbers for 8Hz sampling")
    args = parser.parse_args()

+    target_frames = None
+    if args.frames:
+        target_frames = [int(f) for f in args.frames.split(",") if f.strip()]
+        print(f"[Pose] 8Hz target frames: {len(target_frames)} frames", file=sys.stderr)
+
    publisher = RedisPublisher(args.uuid) if args.uuid else None
    if publisher:
        publisher.info("pose", "POSE_START")

    result = process_pose(args.video_path, args.output_path, args.uuid,
-                          args.sample_interval, publisher)
+                          args.sample_interval, publisher, target_frames)
    with open(args.output_path, "w") as f:
        json.dump(result, f, indent=2)
    print(f"Pose: {len(result.get('frames', []))} frames with poses")