feat: update Python processors and add utility scripts

- Update ASR, face, OCR, and pose processors
- Add release pre-flight check script
- Add synonym generation and chunk processing scripts
- Add face recognition and stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/pose_processor.py
+++ b/scripts/pose_processor.py
@@ -1,114 +1,159 @@
 #!/opt/homebrew/bin/python3.11
 """
-Pose Processor - Pose Estimation
+Pose Processor - Pose Estimation with Resume Support
 Uses YOLOv8 Pose via ultralytics (local model)
+
+Resume Feature:
+- Auto-detect existing results and resume from last frame
+- Auto-save at configurable intervals (defaults: every 30 seconds or 300 frames)
+- Ctrl+C gracefully saves and exits
+
+Note: YOLOv8 Pose's stream mode is optimized for processing a whole video in one pass.
+To support resuming, frames are instead read manually with OpenCV and passed to the model one at a time.
 """

 import sys
 import json
 import argparse
 import os
-import signal
+import time
+from datetime import datetime

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from redis_publisher import RedisPublisher
+from resume_framework import ResumeFramework, format_time, print_progress


-def signal_handler(signum, frame):
-    print(f"POSE: Received signal {signum}, exiting...")
-    sys.exit(1)
+KEYPOINT_NAMES = [
+    "nose",
+    "left_eye",
+    "right_eye",
+    "left_ear",
+    "right_ear",
+    "left_shoulder",
+    "right_shoulder",
+    "left_elbow",
+    "right_elbow",
+    "left_wrist",
+    "right_wrist",
+    "left_hip",
+    "right_hip",
+    "left_knee",
+    "right_knee",
+    "left_ankle",
+    "right_ankle",
+]


-def process_pose(video_path: str, output_path: str, uuid: str = ""):
-    """Process video for pose estimation using YOLOv8 Pose"""
+def process_pose(
+    video_path: str,
+    output_path: str,
+    uuid: str = "",
+    auto_save_interval: int = 30,
+    auto_save_frames: int = 300,
+    force_restart: bool = False,
+):
+    """Process video for pose estimation using YOLOv8 Pose with resume support"""

-    # Set up signal handlers
-    signal.signal(signal.SIGTERM, signal_handler)
-    signal.signal(signal.SIGINT, signal_handler)
+    framework = ResumeFramework(
+        output_path=output_path,
+        processor_name="pose",
+        uuid=uuid,
+        auto_save_interval=auto_save_interval,
+        auto_save_frames=auto_save_frames,
+        force_restart=force_restart,
+    )

-    publisher = RedisPublisher(uuid) if uuid else None
-    if publisher:
-        publisher.info("pose", "POSE_START")
+    framework.publish_info("POSE_START")

    try:
-        from ultralytics import YOLO  # pyright: ignore
+        from ultralytics import YOLO
    except ImportError:
-        if publisher:
-            publisher.error("pose", "ultralytics not installed")
-        result = {"frame_count": 0, "fps": 0.0, "frames": []}
-        if publisher:
-            publisher.complete("pose", "0 frames")
+        framework.publish_error("ultralytics not installed")
+        result = {
+            "metadata": {"status": "error", "error": "ultralytics not installed"},
+            "frames": {},
+        }
        with open(output_path, "w") as f:
            json.dump(result, f, indent=2)
        return result

-    if publisher:
-        publisher.info("pose", "POSE_LOADING_MODEL")
+    framework.publish_info("POSE_LOADING_MODEL")

-    # Load YOLOv8 Pose model
-    # yolov8n-pose.pt = nano (fastest)
-    # yolov8s-pose.pt = small
-    # yolov8m-pose.pt = medium
    model = YOLO("yolov8n-pose.pt")

-    # Get video info
    import cv2

    cap = cv2.VideoCapture(video_path)
+
+    if not cap.isOpened():
+        print(f"Error: Cannot open video: {video_path}")
+        return {"metadata": {"status": "error"}, "frames": {}}
+
    fps = cap.get(cv2.CAP_PROP_FPS)
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    total_duration = total_frames / fps if fps > 0 else 0
    cap.release()

-    if publisher:
-        publisher.info("pose", f"fps={fps}, frames={total_frames}")
-        publisher.progress("pose", 0, total_frames, "Starting")
+    framework.publish_info(f"fps={fps}, frames={total_frames}")

-    # Process video with YOLO Pose
-    results = model(
-        video_path,
-        conf=0.5,  # confidence threshold
-        save=False,
-        stream=True,
-        verbose=False,
-        pose=True,  # Enable pose estimation
-    )
+    existing_data, last_checkpoint = framework.load_existing_data()
+    resume_mode = existing_data is not None and last_checkpoint > 0 and not force_restart

-    # COCO keypoint names
-    KEYPOINT_NAMES = [
-        "nose",
-        "left_eye",
-        "right_eye",
-        "left_ear",
-        "right_ear",
-        "left_shoulder",
-        "right_shoulder",
-        "left_elbow",
-        "right_elbow",
-        "left_wrist",
-        "right_wrist",
-        "left_hip",
-        "right_hip",
-        "left_knee",
-        "right_knee",
-        "left_ankle",
-        "right_ankle",
-    ]
+    if resume_mode:
+        print(f"\nFound existing data: {output_path}")
+        print(f"Last processed frame: {last_checkpoint}")
+        print(f"Will resume from frame {last_checkpoint + 1}")

-    frames = []
-    frame_count = 0
+    if resume_mode and existing_data:
+        pose_data = existing_data
+        frame_count = last_checkpoint
+        processed_frames = set(int(k) for k in existing_data.get("frames", {}).keys())
+        cap = cv2.VideoCapture(video_path)
+        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
+    else:
+        pose_data = {
+            "metadata": framework.init_metadata(
+                video_path=video_path,
+                fps=fps,
+                width=width,
+                height=height,
+                total_frames=total_frames,
+                total_duration=total_duration,
+                extra={"model": "yolov8n-pose"},
+            ),
+            "frames": {},
+        }
+        frame_count = 0
+        processed_frames = set()
+        cap = cv2.VideoCapture(video_path)
+
+    framework.set_data(pose_data)
+
+    start_time = time.time()
+    framework.last_save_time = start_time
+
+    print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
+    print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
+    print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
+    print()
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break

-    for result in results:
        frame_count += 1
+        current_time = (frame_count - 1) / fps if fps > 0 else 0

-        # Get frame number and timestamp
-        frame_idx = (
-            result.orig_frame_idx
-            if hasattr(result, "orig_frame_idx")
-            else frame_count - 1
-        )
-        timestamp = frame_idx / fps if fps > 0 else 0
+        if frame_count in processed_frames:
+            continue
+
+        results = model(frame, conf=0.5, verbose=False, pose=True)
+        result = results[0]

-        # Get pose keypoints
        persons = []

        if result.keypoints is not None:
@@ -128,7 +173,6 @@ def process_pose(video_path: str, output_path: str, uuid: str = ""):
                            }
                        )

-                # Get bounding box from keypoints if available
                valid_kps = [kp for kp in keypoints if kp["confidence"] > 0.3]
                if valid_kps:
                    xs = [kp["x"] for kp in valid_kps]
@@ -144,35 +188,70 @@ def process_pose(video_path: str, output_path: str, uuid: str = ""):

                persons.append({"keypoints": keypoints, "bbox": bbox})

-        # Only add frames with poses or sample periodically
        if persons or frame_count % 30 == 0:
-            frames.append(
-                {
-                    "frame": frame_idx,
-                    "timestamp": round(timestamp, 3),
-                    "persons": persons,
-                }
-            )
+            pose_data["frames"][str(frame_count)] = {
+                "frame_number": frame_count,
+                "time_seconds": round(current_time, 3),
+                "time_formatted": format_time(current_time),
+                "persons": persons,
+            }
+            processed_frames.add(frame_count)

-        if publisher:
-            publisher.progress("pose", frame_count, total_frames, f"Frame {frame_idx}")
+        if frame_count % 500 == 0:
+            elapsed = time.time() - start_time
+            print_progress(frame_count, total_frames, elapsed, f"{len(persons)} persons")
+            framework.publish_progress(frame_count, total_frames, f"frame {frame_count}")

-    result = {"frame_count": total_frames, "fps": fps, "frames": frames}
+        if framework.should_auto_save(frame_count):
+            framework.save_progress(frame_count, silent=True)

-    if publisher:
-        publisher.complete("pose", f"{len(frames)} frames with poses")
+    cap.release()

-    with open(output_path, "w") as f:
-        json.dump(result, f, indent=2)
+    total_processed = len(processed_frames)

-    return result
+    framework.finalize(
+        total_processed=total_processed,
+        extra_metadata={"model": "yolov8n-pose"},
+    )
+
+    print(f"\nPose estimation completed: {total_processed} frames processed")
+    print(f"Frames with poses: {len([f for f in pose_data['frames'].values() if f['persons']])}")
+
+    return pose_data


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Pose Estimation")
+    parser = argparse.ArgumentParser(description="Pose Estimation with Resume Support")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
+    parser.add_argument(
+        "--auto-save-interval",
+        "-a",
+        help="Auto-save interval in seconds",
+        type=int,
+        default=30,
+    )
+    parser.add_argument(
+        "--auto-save-frames",
+        "-f",
+        help="Auto-save interval in frames",
+        type=int,
+        default=300,
+    )
+    parser.add_argument(
+        "--force-restart",
+        "-r",
+        help="Force restart (ignore existing data)",
+        action="store_true",
+    )
    args = parser.parse_args()

-    process_pose(args.video_path, args.output_path, args.uuid)
+    process_pose(
+        args.video_path,
+        args.output_path,
+        args.uuid,
+        args.auto_save_interval,
+        args.auto_save_frames,
+        args.force_restart,
+    )