feat: media API (video/bbox/thumbnail), UUID unification, dot matrix text, portal fixes, API dictionary V1.3

This commit is contained in:
Warren
2026-05-06 13:34:49 +08:00
parent e75c4d6f07
commit 74b6182eba
197 changed files with 17511 additions and 8759 deletions

View File

@@ -1,255 +1,119 @@
#!/opt/homebrew/bin/python3.11
"""
Pose Processor - Pose Estimation with Resume Support
Uses YOLOv8 Pose via ultralytics (local model).

Resume feature:
- Auto-detects existing results and resumes from the last frame
- Auto-saves at configurable intervals (default: 30 seconds)
- Ctrl+C saves gracefully and exits

Note: YOLOv8 Pose uses stream mode, which is optimized for video processing.
For resume support, frames need to be processed manually with OpenCV.

Pose Processor Wrapper
Calls the Swift Vision Framework pose binary (swift_pose), with a fallback to YOLOv8 Pose.
Uses VNDetectHumanBodyPoseRequest with ANE acceleration.
"""
import sys
import json
import argparse
import os
import time
import subprocess
import argparse
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from resume_framework import ResumeFramework, format_time, print_progress
# Keypoint labels by index: entry i is the name given to the i-th keypoint
# of a detected person. Grouped below by body region for readability.
KEYPOINT_NAMES = [
    # head
    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
    # arms
    "left_shoulder", "right_shoulder",
    "left_elbow", "right_elbow",
    "left_wrist", "right_wrist",
    # legs
    "left_hip", "right_hip",
    "left_knee", "right_knee",
    "left_ankle", "right_ankle",
]
# Directory that contains this script; both candidate binary paths are
# resolved relative to it.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Candidate locations of the compiled swift_pose binary. SWIFT_POSE_PATH is
# tried first; SWIFT_POSE_ALT is the second location to try.
SWIFT_POSE_PATH = os.path.join(_SCRIPT_DIR, "swift_processors/.build/debug/swift_pose")
SWIFT_POSE_ALT = os.path.join(
    _SCRIPT_DIR, "swift_processors/.build/arm64-apple-macosx/debug/swift_pose"
)
def process_pose(
    video_path: str,
    output_path: str,
    uuid: str = "",
    sample_interval: int = 30,
) -> dict:
    """Run pose estimation on a video, preferring the Swift Vision binary.

    The compiled ``swift_pose`` binary is looked up at ``SWIFT_POSE_PATH`` and
    then at ``SWIFT_POSE_ALT``. When found, it is run as a subprocess and the
    JSON file it leaves at ``output_path`` is loaded and returned. If the
    binary is missing, cannot be started, times out, exits non-zero, or leaves
    no readable JSON at ``output_path``, the work is handed to ``_fallback``
    (YOLOv8 Pose) instead.

    Args:
        video_path: Path of the video to analyse.
        output_path: Path of the JSON result file.
        uuid: Identifier passed to the Swift binary via ``--uuid`` and
            forwarded to the fallback.
        sample_interval: Passed to the Swift binary via ``--sample-interval``
            and forwarded to the fallback.

    Returns:
        The parsed JSON result, or whatever ``_fallback`` returns.
    """
    swift_timeout_s = 7200  # upper bound for one Swift run, in seconds

    swift_bin = next(
        (path for path in (SWIFT_POSE_PATH, SWIFT_POSE_ALT) if os.path.exists(path)),
        None,
    )
    if swift_bin is None:
        print("[Pose] Swift binary not found, using YOLOv8 fallback", file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)

    cmd = [
        swift_bin, video_path, output_path,
        "--sample-interval", str(sample_interval),
        "--uuid", uuid,
    ]

    print("[Pose] Running Swift Pose (Vision Framework)", file=sys.stderr)
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=swift_timeout_s
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        # A binary that cannot be launched or that hangs must not abort the
        # whole job: treat it like any other Swift failure.
        print(
            f"[Pose] Swift Pose could not complete ({exc}), falling back to YOLOv8",
            file=sys.stderr,
        )
        return _fallback(video_path, output_path, uuid, sample_interval)

    # Relay everything the binary printed (stdout first, then stderr) to our
    # own stderr so stdout stays free for this script's final summary line.
    for captured in (result.stdout, result.stderr):
        if captured:
            for line in captured.strip().split("\n"):
                print(f" {line}", file=sys.stderr)

    if result.returncode != 0 or not os.path.exists(output_path):
        print("[Pose] Swift Pose failed, falling back to YOLOv8", file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)

    try:
        with open(output_path) as f:
            return json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError: a truncated or corrupt result
        # file is handled the same way as a failed run.
        print(
            f"[Pose] Swift Pose output unreadable ({exc}), falling back to YOLOv8",
            file=sys.stderr,
        )
        return _fallback(video_path, output_path, uuid, sample_interval)
def _fallback(video_path, output_path, uuid, sample_interval):
    """Estimate poses with YOLOv8 Pose and write the result as JSON.

    Every ``sample_interval``-th frame of the video is run through the
    ``yolov8n-pose.pt`` model on the CPU. Frames in which at least one person
    was found are collected, written to ``output_path`` and returned.

    Args:
        video_path: Path of the video to analyse.
        output_path: Path the JSON result is written to.
        uuid: Accepted for signature parity with ``process_pose``; not used
            by this function.
        sample_interval: Analyse one frame out of this many. Values below 1
            are treated as 1.

    Returns:
        ``{"frame_count", "fps", "frames"}`` on success, where ``frames`` is a
        list of ``{"frame", "timestamp", "persons"}`` entries. If the video
        cannot be opened, ``{"metadata": {"status": "error"}, "frames": {}}``
        is returned and nothing is written.
    """
    from ultralytics import YOLO
    import cv2

    names = (
        "nose", "left_eye", "right_eye", "left_ear", "right_ear",
        "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
        "left_wrist", "right_wrist", "left_hip", "right_hip",
        "left_knee", "right_knee", "left_ankle", "right_ankle",
    )
    # A step of 0 would make the modulo below raise ZeroDivisionError.
    step = max(1, int(sample_interval))

    model = YOLO("yolov8n-pose.pt")
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Error: Cannot open video: {video_path}")
        return {"metadata": {"status": "error"}, "frames": {}}

    fps = cap.get(cv2.CAP_PROP_FPS)
    frames = []
    frame_count = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % step == 0:
                ts = frame_count / fps if fps > 0 else 0
                results = model(frame, verbose=False, device="cpu")
                persons = []
                for r in results:
                    if r.keypoints is None:
                        continue
                    for kp_data in r.keypoints:
                        # The attributes may exist but hold None, so test the
                        # value rather than using hasattr().
                        xy = getattr(kp_data, "xy", None)
                        conf = getattr(kp_data, "conf", None)
                        kps = xy[0].cpu().numpy() if xy is not None else []
                        confs = conf[0].cpu().numpy() if conf is not None else []
                        keypoints = _pose_keypoints(kps, confs, names)
                        if keypoints:
                            persons.append(
                                {"keypoints": keypoints, "bbox": _pose_bbox(keypoints)}
                            )
                if persons:
                    frames.append(
                        {"frame": frame_count, "timestamp": ts, "persons": persons}
                    )
            frame_count += 1
    finally:
        # Release the capture even when decoding or inference raises.
        cap.release()

    result = {"frame_count": len(frames), "fps": fps, "frames": frames}
    with open(output_path, "w") as f:
        json.dump(result, f, indent=2)
    return result


def _pose_keypoints(kps, confs, names):
    """Pair keypoint coordinates with their names and confidences.

    Returns one ``{"name", "x", "y", "confidence"}`` dict for each entry of
    *names* that has coordinates in *kps*; a keypoint without a matching entry
    in *confs* gets confidence 0.
    """
    n_confs = len(confs)
    return [
        {
            "name": name,
            "x": float(kps[j][0]),
            "y": float(kps[j][1]),
            "confidence": float(confs[j]) if j < n_confs else 0,
        }
        for j, name in enumerate(names[: len(kps)])
    ]


def _pose_bbox(keypoints, min_conf=0.1):
    """Return the integer box around keypoints with confidence above *min_conf*.

    When no keypoint qualifies, an all-zero box is returned.
    """
    confident = [k for k in keypoints if k["confidence"] > min_conf]
    if not confident:
        return {"x": 0, "y": 0, "width": 0, "height": 0}
    xs = [k["x"] for k in confident]
    ys = [k["y"] for k in confident]
    x_min, y_min = min(xs), min(ys)
    return {
        "x": int(x_min),
        "y": int(y_min),
        "width": int(max(xs) - x_min),
        "height": int(max(ys) - y_min),
    }
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, run pose estimation,
    # write the returned result to the output path and print a one-line
    # summary on stdout.
    parser = argparse.ArgumentParser(description="Pose Processor (Swift Vision)")
    parser.add_argument("video_path")
    parser.add_argument("output_path")
    parser.add_argument("--uuid", "-u", default="")
    parser.add_argument("--sample-interval", type=int, default=30)
    args = parser.parse_args()

    result = process_pose(
        args.video_path, args.output_path, args.uuid, args.sample_interval
    )

    # Written here as well so the output file also exists when process_pose
    # returned a result without writing one (e.g. the "cannot open video"
    # error result).
    with open(args.output_path, "w") as f:
        json.dump(result, f, indent=2)
    print(f"Pose: {len(result.get('frames', []))} frames with poses")