feat: media API (video/bbox/thumbnail), UUID unification, dot matrix text, portal fixes, API dictionary V1.3

2026-05-06 13:34:49 +08:00
parent e75c4d6f07
commit 74b6182eba
197 changed files with 17511 additions and 8759 deletions
--- a/scripts/pose_processor.py
+++ b/scripts/pose_processor.py
@@ -1,255 +1,119 @@
 #!/opt/homebrew/bin/python3.11
 """
-Pose Processor - Pose Estimation with Resume Support
-Uses YOLOv8 Pose via ultralytics (local model)
-
-Resume Feature:
- Auto-detect existing results and resume from last frame
- Auto-save at configurable intervals (default: 30 seconds)
- Ctrl+C gracefully saves and exits
-
-Note: YOLOv8 Pose uses stream mode which is optimized for video processing.
-For resume support, we need to process frames manually with OpenCV.
+Pose Processor Wrapper
+Calls Swift Vision Framework pose (swift_pose) with fallback to YOLOv8 Pose.
+Uses VNDetectHumanBodyPoseRequest with ANE acceleration.
 """

 import sys
 import json
-import argparse
 import os
-import time
+import subprocess
+import argparse

-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from resume_framework import ResumeFramework, format_time, print_progress
-
-
-KEYPOINT_NAMES = [
-    "nose",
-    "left_eye",
-    "right_eye",
-    "left_ear",
-    "right_ear",
-    "left_shoulder",
-    "right_shoulder",
-    "left_elbow",
-    "right_elbow",
-    "left_wrist",
-    "right_wrist",
-    "left_hip",
-    "right_hip",
-    "left_knee",
-    "right_knee",
-    "left_ankle",
-    "right_ankle",
-]
+# Directory holding this script; both swift_pose candidates are resolved
+# relative to it.
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# First location checked for the swift_pose executable.
+SWIFT_POSE_PATH = os.path.join(_SCRIPT_DIR, "swift_processors/.build/debug/swift_pose")
+# Second location, checked only when the first one does not exist.
+SWIFT_POSE_ALT = os.path.join(
+    _SCRIPT_DIR, "swift_processors/.build/arm64-apple-macosx/debug/swift_pose"
+)


 def process_pose(
    video_path: str,
    output_path: str,
    uuid: str = "",
-    auto_save_interval: int = 30,
-    auto_save_frames: int = 300,
-    force_restart: bool = False,
-):
-    """Process video for pose estimation using YOLOv8 Pose with resume support"""
+    sample_interval: int = 30,
+) -> dict:
+    """Run pose estimation on a video, preferring the swift_pose binary.
+
+    The binary is looked up at SWIFT_POSE_PATH and then SWIFT_POSE_ALT. When
+    it is missing, cannot be launched, times out, exits non-zero, or leaves
+    no readable JSON at ``output_path``, the work is redone by ``_fallback``.
+
+    Args:
+        video_path: Video file handed to the binary (or to the fallback).
+        output_path: JSON file the binary is expected to write; it is read
+            back and returned.
+        uuid: Forwarded verbatim to the binary as ``--uuid``.
+        sample_interval: Forwarded as ``--sample-interval``; in the fallback
+            it selects every N-th frame.
+
+    Returns:
+        The parsed contents of ``output_path``, or the fallback's result.
+    """
+    swift_bin = SWIFT_POSE_PATH
+    if not os.path.exists(swift_bin):
+        swift_bin = SWIFT_POSE_ALT

-    framework = ResumeFramework(
-        output_path=output_path,
-        processor_name="pose",
-        uuid=uuid,
-        auto_save_interval=auto_save_interval,
-        auto_save_frames=auto_save_frames,
-        force_restart=force_restart,
-    )
+    if not os.path.exists(swift_bin):
+        print("[Pose] Swift binary not found, using YOLOv8 fallback", file=sys.stderr)
+        return _fallback(video_path, output_path, uuid, sample_interval)

-    framework.publish_info("POSE_START")
+    cmd = [swift_bin, video_path, output_path,
+           "--sample-interval", str(sample_interval),
+           "--uuid", uuid]

-    try:
-        from ultralytics import YOLO
-    except ImportError:
-        framework.publish_error("ultralytics not installed")
-        result = {
-            "metadata": {"status": "error", "error": "ultralytics not installed"},
-            "frames": {},
-        }
-        with open(output_path, "w") as f:
-            json.dump(result, f, indent=2)
-        return result
+    print("[Pose] Running Swift Pose (Vision Framework)", file=sys.stderr)
+    try:
+        # 7200 s = 2 h ceiling on the external process.
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=7200)
+    except (subprocess.TimeoutExpired, OSError) as exc:
+        # A hung or unlaunchable binary must not defeat the fallback path.
+        print(f"[Pose] Swift Pose did not complete ({exc}), falling back to YOLOv8",
+              file=sys.stderr)
+        return _fallback(video_path, output_path, uuid, sample_interval)

-    framework.publish_info("POSE_LOADING_MODEL")
+    # Relay everything the binary printed, indented, on our own stderr.
+    for stream in (result.stdout, result.stderr):
+        if stream:
+            for line in stream.strip().split("\n"):
+                print(f"  {line}", file=sys.stderr)

-    model = YOLO("yolov8n-pose.pt")
+    if result.returncode != 0 or not os.path.exists(output_path):
+        print("[Pose] Swift Pose failed, falling back to YOLOv8", file=sys.stderr)
+        return _fallback(video_path, output_path, uuid, sample_interval)

+    try:
+        with open(output_path) as f:
+            return json.load(f)
+    except (OSError, ValueError) as exc:
+        # json.JSONDecodeError is a ValueError: covers truncated/invalid output.
+        print(f"[Pose] Swift Pose output unreadable ({exc}), falling back to YOLOv8",
+              file=sys.stderr)
+        return _fallback(video_path, output_path, uuid, sample_interval)
+
+
+def _fallback(video_path, output_path, uuid, sample_interval):
+    """Estimate poses with YOLOv8 Pose (yolov8n-pose.pt) on the CPU.
+
+    Every ``sample_interval``-th frame of ``video_path`` is run through the
+    model. Frames in which at least one person is found are collected as
+    ``{"frame", "timestamp", "persons"}`` entries; each person carries a
+    ``keypoints`` list (name, x, y, confidence) and a ``bbox`` spanning the
+    keypoints whose confidence exceeds 0.1.
+
+    Args:
+        video_path: Video to read with OpenCV.
+        output_path: Where the JSON result is written.
+        uuid: Accepted for signature parity with process_pose; unused here.
+        sample_interval: Frame stride; 0 is treated as 1 so the modulo
+            below cannot divide by zero.
+
+    Returns:
+        ``{"frame_count": <entries kept>, "fps": ..., "frames": [...]}``,
+        the same dict that is written to ``output_path``.
+    """
+    from ultralytics import YOLO
    import cv2
-
+    model = YOLO("yolov8n-pose.pt")
    cap = cv2.VideoCapture(video_path)
-
-    if not cap.isOpened():
-        print(f"Error: Cannot open video: {video_path}")
-        return {"metadata": {"status": "error"}, "frames": {}}
-
+    if not cap.isOpened():
+        # The read loop below is skipped entirely in this case, so the
+        # written result has no frames; say why instead of staying silent.
+        print(f"[Pose] Cannot open video: {video_path}", file=sys.stderr)
    fps = cap.get(cv2.CAP_PROP_FPS)
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    total_duration = total_frames / fps if fps > 0 else 0
-    cap.release()
-
-    framework.publish_info(f"fps={fps}, frames={total_frames}")
-
-    existing_data, last_checkpoint = framework.load_existing_data()
-    resume_mode = existing_data is not None and last_checkpoint > 0 and not force_restart
-
-    if resume_mode:
-        print(f"\nFound existing data: {output_path}")
-        print(f"Last processed frame: {last_checkpoint}")
-        print(f"Will resume from frame {last_checkpoint + 1}")
-
-    if resume_mode and existing_data:
-        pose_data = existing_data
-        frame_count = last_checkpoint
-        processed_frames = set(int(k) for k in existing_data.get("frames", {}).keys())
-        cap = cv2.VideoCapture(video_path)
-        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
-    else:
-        pose_data = {
-            "metadata": framework.init_metadata(
-                video_path=video_path,
-                fps=fps,
-                width=width,
-                height=height,
-                total_frames=total_frames,
-                total_duration=total_duration,
-                extra={"model": "yolov8n-pose"},
-            ),
-            "frames": {},
-        }
-        frame_count = 0
-        processed_frames = set()
-        cap = cv2.VideoCapture(video_path)
-
-    framework.set_data(pose_data)
-
-    start_time = time.time()
-    framework.last_save_time = start_time
-
-    print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
-    print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
-    print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
-    print()
-
-    while True:
+    frame_count = 0
+    frames = []
+    # Keypoint labels in the index order used for the model output below;
+    # built once instead of once per detected person.
+    names = ("nose", "left_eye", "right_eye", "left_ear", "right_ear",
+             "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
+             "left_wrist", "right_wrist", "left_hip", "right_hip",
+             "left_knee", "right_knee", "left_ankle", "right_ankle")
+    # Guard the stride: `frame_count % 0` would raise ZeroDivisionError.
+    step = max(1, abs(sample_interval))
+    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
-
+        if frame_count % step == 0:
+            ts = frame_count / fps if fps > 0 else 0
+            results = model(frame, verbose=False, device="cpu")
+            persons = []
+            for r in results:
+                if r.keypoints is None:
+                    continue
+                for kp_data in r.keypoints:
+                    kps = kp_data.xy[0].cpu().numpy() if hasattr(kp_data, 'xy') else []
+                    # `conf` exists as an attribute but may be None (per the
+                    # ultralytics Keypoints docs), so test the value itself.
+                    conf_tensor = getattr(kp_data, "conf", None)
+                    confs = conf_tensor[0].cpu().numpy() if conf_tensor is not None else []
+                    keypoints = []
+                    for j, name in enumerate(names):
+                        if j < len(kps):
+                            x, y = float(kps[j][0]), float(kps[j][1])
+                            c = float(confs[j]) if j < len(confs) else 0
+                            keypoints.append({"name": name, "x": x, "y": y, "confidence": c})
+                    if keypoints:
+                        xs = [k["x"] for k in keypoints if k["confidence"] > 0.1]
+                        ys = [k["y"] for k in keypoints if k["confidence"] > 0.1]
+                        if xs:
+                            bbox = {"x": int(min(xs)), "y": int(min(ys)),
+                                    "width": int(max(xs) - min(xs)),
+                                    "height": int(max(ys) - min(ys))}
+                        else:
+                            # No keypoint cleared the 0.1 threshold.
+                            bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
+                        persons.append({"keypoints": keypoints, "bbox": bbox})
+            if persons:
+                frames.append({"frame": frame_count, "timestamp": ts, "persons": persons})
        frame_count += 1
-        current_time = (frame_count - 1) / fps if fps > 0 else 0
-
-        if frame_count in processed_frames:
-            continue
-
-        results = model(frame, conf=0.5, verbose=False, pose=True)
-        result = results[0]
-
-        persons = []
-
-        if result.keypoints is not None:
-            for person in result.keypoints:
-                keypoints = []
-
-                for i, kp in enumerate(person):
-                    if len(kp) >= 3:
-                        keypoints.append(
-                            {
-                                "name": KEYPOINT_NAMES[i]
-                                if i < len(KEYPOINT_NAMES)
-                                else f"kp_{i}",
-                                "x": float(kp[0]),
-                                "y": float(kp[1]),
-                                "confidence": float(kp[2]),
-                            }
-                        )
-
-                valid_kps = [kp for kp in keypoints if kp["confidence"] > 0.3]
-                if valid_kps:
-                    xs = [kp["x"] for kp in valid_kps]
-                    ys = [kp["y"] for kp in valid_kps]
-                    bbox = {
-                        "x": int(min(xs)),
-                        "y": int(min(ys)),
-                        "width": int(max(xs) - min(xs)),
-                        "height": int(max(ys) - min(ys)),
-                    }
-                else:
-                    bbox = {"x": 0, "y": 0, "width": 0, "height": 0}
-
-                persons.append({"keypoints": keypoints, "bbox": bbox})
-
-        if persons or frame_count % 30 == 0:
-            pose_data["frames"][str(frame_count)] = {
-                "frame_number": frame_count,
-                "time_seconds": round(current_time, 3),
-                "time_formatted": format_time(current_time),
-                "persons": persons,
-            }
-            processed_frames.add(frame_count)
-
-        if frame_count % 500 == 0:
-            elapsed = time.time() - start_time
-            print_progress(frame_count, total_frames, elapsed, f"{len(persons)} persons")
-            framework.publish_progress(frame_count, total_frames, f"frame {frame_count}")
-
-        if framework.should_auto_save(frame_count):
-            framework.save_progress(frame_count, silent=True)
-
    cap.release()
-
-    total_processed = len(processed_frames)
-
-    framework.finalize(
-        total_processed=total_processed,
-        extra_metadata={"model": "yolov8n-pose"},
-    )
-
-    print(f"\nPose estimation completed: {total_processed} frames processed")
-    print(f"Frames with poses: {len([f for f in pose_data['frames'].values() if f['persons']])}")
-
-    return pose_data
+    result = {"frame_count": len(frames), "fps": fps, "frames": frames}
+    with open(output_path, "w") as f:
+        json.dump(result, f, indent=2)
+    return result


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Pose Estimation with Resume Support")
-    parser.add_argument("video_path", help="Path to video file")
-    parser.add_argument("output_path", help="Output JSON path")
-    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
-    parser.add_argument(
-        "--auto-save-interval",
-        "-a",
-        help="Auto-save interval in seconds",
-        type=int,
-        default=30,
-    )
-    parser.add_argument(
-        "--auto-save-frames",
-        "-f",
-        help="Auto-save interval in frames",
-        type=int,
-        default=300,
-    )
-    parser.add_argument(
-        "--force-restart",
-        "-r",
-        help="Force restart (ignore existing data)",
-        action="store_true",
-    )
+    # CLI: positional video and output paths, plus optional --uuid/-u and
+    # --sample-interval (default 30); all four are passed to process_pose.
+    parser = argparse.ArgumentParser(description="Pose Processor (Swift Vision)")
+    parser.add_argument("video_path")
+    parser.add_argument("output_path")
+    parser.add_argument("--uuid", "-u", default="")
+    parser.add_argument("--sample-interval", type=int, default=30)
    args = parser.parse_args()

-    process_pose(
-        args.video_path,
-        args.output_path,
-        args.uuid,
-        args.auto_save_interval,
-        args.auto_save_frames,
-        args.force_restart,
-    )
+    result = process_pose(args.video_path, args.output_path, args.uuid, args.sample_interval)
+    # NOTE(review): process_pose returns only after a JSON file exists at
+    # output_path (read back from the binary, or written by the fallback);
+    # this re-serialises the returned dict over it with indent=2 — confirm
+    # the second write is intended.
+    with open(args.output_path, "w") as f:
+        json.dump(result, f, indent=2)
+    # Summary goes to stdout; the count is the length of result["frames"].
+    print(f"Pose: {len(result.get('frames', []))} frames with poses")