feat: update Python processors and add utility scripts

- Update ASR, face, OCR, and pose processors
- Add release pre-flight check script
- Add synonym generation and chunk processing scripts
- Add face recognition and stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/face_processor.py
+++ b/scripts/face_processor.py
@@ -1,25 +1,52 @@
 #!/opt/homebrew/bin/python3.11
 """
-Face Processor - Face Detection & Demographics
-Uses InsightFace for detection, age, and gender analysis.
-Falls back to OpenCV Haar Cascade if InsightFace fails.
+Face Processor - Face Detection & Demographics with Resume Support
+Uses InsightFace for detection, age, gender, and embedding extraction.
+
+IMPORTANT: InsightFace is REQUIRED. No Haar fallback.
+- InsightFace provides 512-dim ArcFace embedding for identity matching
+- Haar Cascade can only detect faces; it cannot generate embeddings
+- If InsightFace fails, processor will ERROR and exit
+
+Resume Feature:
+- Auto-detect existing results and resume from last frame
+- Auto-save at configurable intervals (default: 30 seconds)
+- Ctrl+C gracefully saves and exits
 """

 import sys
 import json
 import argparse
 import os
+import time

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from redis_publisher import RedisPublisher
+from resume_framework import ResumeFramework, format_time, print_progress
+from utils.pose_analyzer import calculate_pose_angle_v2


-def process_face(video_path: str, output_path: str, uuid: str = ""):
-    """Process video for face detection and demographics analysis"""
+def process_face(
+    video_path: str,
+    output_path: str,
+    uuid: str = "",
+    auto_save_interval: int = 30,
+    auto_save_frames: int = 300,
+    force_restart: bool = False,
+    sample_interval: int = 30,
+):
+    """Process video for face detection and demographics analysis with resume support"""

-    publisher = RedisPublisher(uuid) if uuid else None
-    if publisher:
-        publisher.info("face", "FACE_START")
+    framework = ResumeFramework(
+        output_path=output_path,
+        processor_name="face",
+        uuid=uuid,
+        auto_save_interval=auto_save_interval,
+        auto_save_frames=auto_save_frames,
+        force_restart=force_restart,
+    )
+
+    framework.publish_info("FACE_START")

    try:
        import cv2
@@ -27,78 +54,95 @@ def process_face(video_path: str, output_path: str, uuid: str = ""):
        import insightface
    except ImportError as e:
        error_msg = f"Missing dependency: {e.name}"
-        if publisher:
-            publisher.error("face", error_msg)
-        result = {"frame_count": 0, "fps": 0.0, "frames": []}
+        framework.publish_error(error_msg)
+        result = {
+            "metadata": {"status": "error", "error": error_msg},
+            "frames": {},
+        }
        with open(output_path, "w") as f:
            json.dump(result, f, indent=2)
        return result

-    # 1. Initialize InsightFace
-    use_insightface = False
    app = None
    try:
-        if publisher:
-            publisher.info("face", "LOADING_INSIGHTFACE")
-        # 'buffalo_l' is a robust model. det_size can be adjusted.
+        framework.publish_info("LOADING_INSIGHTFACE")
        app = insightface.app.FaceAnalysis(
            name="buffalo_l", providers=["CPUExecutionProvider"]
        )
        app.prepare(ctx_id=0, det_size=(320, 320))
-        use_insightface = True
-        if publisher:
-            publisher.info("face", "INSIGHTFACE_LOADED")
+        framework.publish_info("INSIGHTFACE_LOADED")
    except Exception as e:
-        print(f"[WARNING] InsightFace failed to load: {e}")
-        use_insightface = False
-
-    # 2. Fallback to Haar Cascade
-    face_cascade = None
-    if not use_insightface:
-        if publisher:
-            publisher.info("face", "LOADING_HAAR_CASCADE")
-        face_cascade = cv2.CascadeClassifier(
-            cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
-        )
-        if face_cascade.empty():
-            if publisher:
-                publisher.error("face", "Could not load Haar Cascade")
-            result = {"frame_count": 0, "fps": 0.0, "frames": []}
-            with open(output_path, "w") as f:
-                json.dump(result, f, indent=2)
-            return result
-        if publisher:
-            publisher.info("face", "HAAR_CASCADE_LOADED")
-
-    if publisher:
-        publisher.info("face", "PROCESSING_VIDEO")
-
-    cap = cv2.VideoCapture(video_path)
-    if not cap.isOpened():
-        if publisher:
-            publisher.error("face", "Could not open video")
-        result = {"frame_count": 0, "fps": 0.0, "frames": []}
+        error_msg = f"InsightFace failed to load (REQUIRED): {e}"
+        framework.publish_error(error_msg)
+        result = {
+            "metadata": {"status": "error", "error": error_msg},
+            "frames": {},
+        }
        with open(output_path, "w") as f:
            json.dump(result, f, indent=2)
        return result

+    framework.publish_info("PROCESSING_VIDEO")
+
+    cap = cv2.VideoCapture(video_path)
+
+    if not cap.isOpened():
+        print(f"Error: Cannot open video: {video_path}")
+        return {"metadata": {"status": "error"}, "frames": {}}
+
    fps = cap.get(cv2.CAP_PROP_FPS)
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    total_duration = total_frames / fps if fps > 0 else 0
+    cap.release()

-    # Optimization: Process every N frames to speed up analysis
-    # Since we just need attributes for the person identity, we don't need every single frame.
-    sample_interval = 30
-    if total_frames > 0:
-        estimated_samples = total_frames // sample_interval
+    framework.publish_info(f"fps={fps}, frames={total_frames}")
+
+    existing_data, last_checkpoint = framework.load_existing_data()
+    resume_mode = existing_data is not None and last_checkpoint > 0 and not force_restart
+
+    if resume_mode:
+        print(f"\nFound existing data: {output_path}")
+        print(f"Last processed frame: {last_checkpoint}")
+        print(f"Will resume from frame {last_checkpoint + 1}")
+
+    if resume_mode and existing_data:
+        face_data = existing_data
+        frame_count = last_checkpoint
+        processed_frames = set(int(k) for k in existing_data.get("frames", {}).keys())
+        cap = cv2.VideoCapture(video_path)
+        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
    else:
-        estimated_samples = 0
+        face_data = {
+            "metadata": framework.init_metadata(
+                video_path=video_path,
+                fps=fps,
+                width=width,
+                height=height,
+                total_frames=total_frames,
+                total_duration=total_duration,
+                extra={
+                    "sample_interval": sample_interval,
+                    "detection_method": "insightface",
+                },
+            ),
+            "frames": {},
+        }
+        frame_count = 0
+        processed_frames = set()
+        cap = cv2.VideoCapture(video_path)

-    frame_count = 0
-    processed_count = 0
-    frames_data = []
+    framework.set_data(face_data)

-    if publisher:
-        publisher.progress("face", 0, estimated_samples, "Starting")
+    start_time = time.time()
+    framework.last_save_time = start_time
+
+    print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
+    print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
+    print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
+    print(f"Detection method: InsightFace (REQUIRED)")
+    print()

    while True:
        ret, frame = cap.read()
@@ -106,105 +150,151 @@ def process_face(video_path: str, output_path: str, uuid: str = ""):
            break

        frame_count += 1
+        current_time = (frame_count - 1) / fps if fps > 0 else 0

-        # Sampling
-        if frame_count % sample_interval != 0:
+        if frame_count in processed_frames:
            continue

-        processed_count += 1
-        timestamp = (frame_count - 1) / fps if fps > 0 else 0
+        if frame_count % sample_interval != 0:
+            continue

        face_list = []

        try:
-            if use_insightface and app:
-                # InsightFace Detection & Analysis
-                faces = app.get(frame)
-                for face in faces:
-                    bbox = face.bbox.astype(int)
-                    bx, by, bw, bh = (
-                        bbox[0],
-                        bbox[1],
-                        bbox[2] - bbox[0],
-                        bbox[3] - bbox[1],
-                    )
-
-                    # Extract Attributes
-                    age = int(face.age) if hasattr(face, "age") else None
-                    gender_val = face.gender if hasattr(face, "gender") else None
-                    gender = (
-                        "female"
-                        if gender_val == 0
-                        else ("male" if gender_val == 1 else None)
-                    )
-
-                    face_list.append(
-                        {
-                            "x": int(bx),
-                            "y": int(by),
-                            "width": int(bw),
-                            "height": int(bh),
-                            "confidence": float(face.det_score)
-                            if hasattr(face, "det_score")
-                            else 0.9,
-                            "attributes": {"age": age, "gender": gender},
-                        }
-                    )
-            else:
-                # Haar Cascade Fallback (No Age/Gender)
-                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-                faces = face_cascade.detectMultiScale(
-                    gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
+            faces = app.get(frame)
+            for face in faces:
+                bbox = face.bbox.astype(int)
+                bx, by, bw, bh = (
+                    bbox[0],
+                    bbox[1],
+                    bbox[2] - bbox[0],
+                    bbox[3] - bbox[1],
                )
-                for x, y, w, h in faces:
-                    face_list.append(
-                        {
-                            "x": int(x),
-                            "y": int(y),
-                            "width": int(w),
-                            "height": int(h),
-                            "confidence": 0.8,
-                            "attributes": {"age": None, "gender": None},
+
+                age = int(face.age) if hasattr(face, "age") else None
+                gender_val = face.gender if hasattr(face, "gender") else None
+                gender = (
+                    "female"
+                    if gender_val == 0
+                    else ("male" if gender_val == 1 else None)
+                )
+
+                embedding = None
+                if hasattr(face, "embedding"):
+                    embedding = face.embedding.tolist()
+
+                landmarks = None
+                if hasattr(face, "kps"):
+                    landmarks = face.kps.tolist()
+                elif hasattr(face, "landmark_3d_68"):
+                    landmarks = face.landmark_3d_68.tolist()
+
+                pose_angle = None
+                if landmarks and len(landmarks) >= 5:
+                    try:
+                        pose_result = calculate_pose_angle_v2(landmarks)
+                        pose_angle = {
+                            "angle": pose_result.get("angle", "unknown"),
+                            "confidence": pose_result.get("confidence", 0.0),
+                            "pitch": pose_result.get("pitch", "neutral"),
+                            "features": pose_result.get("features", {}),
                        }
-                    )
+                    except Exception as e:
+                        pass
+
+                face_list.append(
+                    {
+                        "x": int(bx),
+                        "y": int(by),
+                        "width": int(bw),
+                        "height": int(bh),
+                        "confidence": float(face.det_score)
+                        if hasattr(face, "det_score")
+                        else 0.9,
+                        "embedding": embedding,
+                        "landmarks": landmarks,
+                        "pose_angle": pose_angle,
+                        "attributes": {"age": age, "gender": gender},
+                    }
+                )
        except Exception as e:
            print(f"[ERROR] Frame processing error: {e}")

        if face_list:
-            frames_data.append(
-                {
-                    "frame": frame_count - 1,
-                    "timestamp": round(timestamp, 3),
-                    "faces": face_list,
-                }
-            )
+            face_data["frames"][str(frame_count)] = {
+                "frame_number": frame_count,
+                "time_seconds": round(current_time, 3),
+                "time_formatted": format_time(current_time),
+                "faces": face_list,
+            }
+            processed_frames.add(frame_count)

-            if publisher:
-                publisher.progress(
-                    "face",
-                    processed_count,
-                    estimated_samples,
-                    f"Frame {frame_count}",
-                )
+        if frame_count % 500 == 0:
+            elapsed = time.time() - start_time
+            print_progress(frame_count, total_frames, elapsed, f"{len(face_list)} faces")
+            framework.publish_progress(frame_count, total_frames, f"frame {frame_count}")
+
+        if framework.should_auto_save(frame_count):
+            framework.save_progress(frame_count, silent=True)

    cap.release()

-    result = {"frame_count": total_frames, "fps": fps, "frames": frames_data}
+    total_processed = len(processed_frames)

-    if publisher:
-        publisher.complete("face", f"{len(frames_data)} frames processed")
+    framework.finalize(
+        total_processed=total_processed,
+        extra_metadata={
+            "sample_interval": sample_interval,
+            "detection_method": "insightface",
+        },
+    )

-    with open(output_path, "w") as f:
-        json.dump(result, f, indent=2)
+    print(f"\nFace detection completed: {total_processed} frames processed")
+    print(f"Frames with faces: {len(face_data['frames'])}")

-    return result
+    return face_data


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Face Detection & Demographics")
+    parser = argparse.ArgumentParser(description="Face Detection & Demographics with Resume Support")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
+    parser.add_argument(
+        "--auto-save-interval",
+        "-a",
+        help="Auto-save interval in seconds",
+        type=int,
+        default=30,
+    )
+    parser.add_argument(
+        "--auto-save-frames",
+        "-f",
+        help="Auto-save interval in frames",
+        type=int,
+        default=300,
+    )
+    parser.add_argument(
+        "--force-restart",
+        "-r",
+        help="Force restart (ignore existing data)",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--sample-interval",
+        "-s",
+        help="Frame sample interval",
+        type=int,
+        default=30,
+    )
    args = parser.parse_args()

-    process_face(args.video_path, args.output_path, args.uuid)
+    process_face(
+        args.video_path,
+        args.output_path,
+        args.uuid,
+        args.auto_save_interval,
+        args.auto_save_frames,
+        args.force_restart,
+        args.sample_interval,
+    )