feat: media API (video/bbox/thumbnail), UUID unification, dot matrix text, portal fixes, API dictionary V1.3
This commit is contained in:
@@ -1,255 +1,119 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Pose Processor - Pose Estimation with Resume Support
|
||||
Uses YOLOv8 Pose via ultralytics (local model)
|
||||
|
||||
Resume Feature:
|
||||
- Auto-detect existing results and resume from last frame
|
||||
- Auto-save at configurable intervals (default: 30 seconds)
|
||||
- Ctrl+C gracefully saves and exits
|
||||
|
||||
Note: YOLOv8 Pose uses stream mode which is optimized for video processing.
|
||||
For resume support, we need to process frames manually with OpenCV.
|
||||
Pose Processor Wrapper
|
||||
Calls Swift Vision Framework pose (swift_pose) with fallback to YOLOv8 Pose.
|
||||
Uses VNDetectHumanBodyPoseRequest with ANE acceleration.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import subprocess
|
||||
import argparse
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from resume_framework import ResumeFramework, format_time, print_progress
|
||||
|
||||
|
||||
# Keypoint labels, in the index order used when naming pose keypoints.
# Kept as a list so existing callers that index or iterate it are unaffected.
KEYPOINT_NAMES = [
    "nose",
    "left_eye",
    "right_eye",
    "left_ear",
    "right_ear",
    "left_shoulder",
    "right_shoulder",
    "left_elbow",
    "right_elbow",
    "left_wrist",
    "right_wrist",
    "left_hip",
    "right_hip",
    "left_knee",
    "right_knee",
    "left_ankle",
    "right_ankle",
]
|
||||
# Primary location of the compiled swift_pose binary, relative to this file.
SWIFT_POSE_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "swift_processors/.build/debug/swift_pose",
)

# Alternate build location (arm64-apple-macosx build directory), tried when
# the primary path does not exist.
SWIFT_POSE_ALT = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "swift_processors/.build/arm64-apple-macosx/debug/swift_pose",
)
|
||||
|
||||
|
||||
def _find_swift_binary():
    """Return the first swift_pose build path that exists, or None."""
    for candidate in (SWIFT_POSE_PATH, SWIFT_POSE_ALT):
        if os.path.exists(candidate):
            return candidate
    return None


def _relay_to_stderr(text):
    """Echo captured subprocess output to stderr, one indented line at a time."""
    if not text:
        return
    for line in text.strip().split("\n"):
        print(f" {line}", file=sys.stderr)


def process_pose(
    video_path: str,
    output_path: str,
    uuid: str = "",
    sample_interval: int = 30,
) -> dict:
    """Run pose estimation via the swift_pose binary, falling back to YOLOv8.

    The Swift binary is invoked as
    ``swift_pose <video> <output> --sample-interval N --uuid UUID`` and is
    expected to write its JSON result to ``output_path``; that file is then
    loaded and returned.

    Args:
        video_path: Path of the video to analyse.
        output_path: Path of the JSON result file.
        uuid: Job identifier forwarded to the Swift binary.
        sample_interval: Forwarded as ``--sample-interval`` (and to the
            fallback, which runs inference every N-th frame).

    Returns:
        The parsed JSON written by the Swift binary, or whatever
        ``_fallback`` returns when the Swift path is unavailable or fails.
    """
    swift_bin = _find_swift_binary()
    if swift_bin is None:
        print("[Pose] Swift binary not found, using YOLOv8 fallback", file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)

    cmd = [
        swift_bin,
        video_path,
        output_path,
        "--sample-interval",
        str(sample_interval),
        "--uuid",
        uuid,
    ]

    print("[Pose] Running Swift Pose (Vision Framework)", file=sys.stderr)
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=7200)
    except (subprocess.TimeoutExpired, OSError) as exc:
        # A hung or non-executable binary must not abort the job: the whole
        # point of this wrapper is to degrade to the YOLOv8 path.
        print(f"[Pose] Swift Pose did not complete ({exc})", file=sys.stderr)
        print("[Pose] Swift Pose failed, falling back to YOLOv8", file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)

    _relay_to_stderr(result.stdout)
    _relay_to_stderr(result.stderr)

    if result.returncode != 0 or not os.path.exists(output_path):
        print("[Pose] Swift Pose failed, falling back to YOLOv8", file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)

    try:
        with open(output_path) as f:
            return json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError: a truncated or corrupt
        # result file is treated the same as a failed Swift run.
        print(f"[Pose] Swift Pose output unreadable ({exc})", file=sys.stderr)
        print("[Pose] Swift Pose failed, falling back to YOLOv8", file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)
|
||||
|
||||
|
||||
def _bbox_from_keypoints(keypoints, min_confidence=0.1):
    """Return the integer bounding box of keypoints above ``min_confidence``."""
    confident = [k for k in keypoints if k["confidence"] > min_confidence]
    if not confident:
        return {"x": 0, "y": 0, "width": 0, "height": 0}
    xs = [k["x"] for k in confident]
    ys = [k["y"] for k in confident]
    return {
        "x": int(min(xs)),
        "y": int(min(ys)),
        "width": int(max(xs) - min(xs)),
        "height": int(max(ys) - min(ys)),
    }


def _persons_from_results(results, names):
    """Convert YOLO pose results for one frame into a list of person dicts."""
    persons = []
    for r in results:
        if r.keypoints is None:
            continue
        for kp_data in r.keypoints:
            xy = getattr(kp_data, "xy", None)
            conf = getattr(kp_data, "conf", None)
            kps = xy[0].cpu().numpy() if xy is not None else []
            # conf may be None when the model reports no per-keypoint
            # confidence; treat every keypoint as confidence 0 in that case.
            confs = conf[0].cpu().numpy() if conf is not None else []

            keypoints = []
            for j, (name, point) in enumerate(zip(names, kps)):
                keypoints.append(
                    {
                        "name": name,
                        "x": float(point[0]),
                        "y": float(point[1]),
                        "confidence": float(confs[j]) if j < len(confs) else 0,
                    }
                )

            if keypoints:
                persons.append(
                    {"keypoints": keypoints, "bbox": _bbox_from_keypoints(keypoints)}
                )
    return persons


def _fallback(video_path, output_path, uuid, sample_interval):
    """Fallback to YOLOv8 Pose.

    Reads the video with OpenCV, runs ``yolov8n-pose.pt`` on every
    ``sample_interval``-th frame (CPU), and writes the result to
    ``output_path`` as JSON.

    Args:
        video_path: Path of the video to analyse.
        output_path: Path the JSON result is written to.
        uuid: Unused here; accepted so the call matches ``process_pose``.
        sample_interval: Run inference on every N-th frame. Values below 1
            are treated as 1.

    Returns:
        ``{"frame_count", "fps", "frames"}`` where ``frames`` lists only the
        sampled frames in which at least one person was found, or
        ``{"metadata": {"status": "error"}, "frames": {}}`` if the video
        cannot be opened.
    """
    from ultralytics import YOLO
    import cv2

    names = (
        "nose", "left_eye", "right_eye", "left_ear", "right_ear",
        "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
        "left_wrist", "right_wrist", "left_hip", "right_hip",
        "left_knee", "right_knee", "left_ankle", "right_ankle",
    )
    # Guard the modulo below: a zero/negative interval would otherwise raise
    # ZeroDivisionError (or sample nothing sensible).
    step = max(1, int(sample_interval))

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Cannot open video: {video_path}")
            return {"metadata": {"status": "error"}, "frames": {}}

        # Load the model only once the video is known to be readable.
        model = YOLO("yolov8n-pose.pt")
        fps = cap.get(cv2.CAP_PROP_FPS)

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % step == 0:
                ts = frame_count / fps if fps > 0 else 0
                results = model(frame, verbose=False, device="cpu")
                persons = _persons_from_results(results, names)
                if persons:
                    frames.append(
                        {"frame": frame_count, "timestamp": ts, "persons": persons}
                    )
            frame_count += 1
    finally:
        # Release the capture even if inference raises mid-video.
        cap.release()

    result = {"frame_count": len(frames), "fps": fps, "frames": frames}
    with open(output_path, "w") as f:
        json.dump(result, f, indent=2)
    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse arguments, run pose estimation once,
    # write the result JSON and print a one-line summary.
    parser = argparse.ArgumentParser(description="Pose Processor (Swift Vision)")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    parser.add_argument("--sample-interval", type=int, default=30)

    # Flags of the earlier resume-capable CLI. They are still accepted so
    # existing invocations do not fail on unknown arguments, but they are
    # not passed to process_pose and have no effect.
    parser.add_argument(
        "--auto-save-interval",
        "-a",
        help="Auto-save interval in seconds",
        type=int,
        default=30,
    )
    parser.add_argument(
        "--auto-save-frames",
        "-f",
        help="Auto-save interval in frames",
        type=int,
        default=300,
    )
    parser.add_argument(
        "--force-restart",
        "-r",
        help="Force restart (ignore existing data)",
        action="store_true",
    )
    args = parser.parse_args()

    result = process_pose(
        args.video_path,
        args.output_path,
        args.uuid,
        args.sample_interval,
    )
    with open(args.output_path, "w") as f:
        json.dump(result, f, indent=2)
    print(f"Pose: {len(result.get('frames', []))} frames with poses")
|
||||
|
||||
Reference in New Issue
Block a user