feat: media API (video/bbox/thumbnail), UUID unification, dot matrix text, portal fixes, API dictionary V1.3

2026-05-06 13:34:49 +08:00
parent e75c4d6f07
commit 74b6182eba
197 changed files with 17511 additions and 8759 deletions
--- a/scripts/face_processor.py
+++ b/scripts/face_processor.py
@@ -1,341 +1,283 @@
 #!/opt/homebrew/bin/python3.11
 """
-Face Processor - Face Detection & Demographics with Resume Support
-Uses InsightFace for detection, age, gender, and embedding extraction.
+Face Processor V2 - Apple Vision detection + CoreML FaceNet embedding

-IMPORTANT: InsightFace is REQUIRED. No Haar fallback.
- InsightFace provides 512-dim ArcFace embedding for identity matching
- Haar Cascade cannot generate embedding, only detection
- If InsightFace fails, processor will ERROR and exit
+Flow:
+1. swift_face (Vision/ANE) → bbox + pose per frame
+2. cv2 opens video, crops faces from bbox
+3. CoreML FaceNet → 512D embedding per face
+4. Output face.json in standard format

-Resume Feature:
- Auto-detect existing results and resume from last frame
- Auto-save at configurable intervals (default: 30 seconds)
- Ctrl+C gracefully saves and exits
+Replaces the previous InsightFace-based implementation of this script (no more InsightFace CPU detection).
+Detection cost: near-zero CPU (Vision ANE)
+Embedding cost: near-zero CPU (CoreML ANE)
 """

 import sys
+import os
 import json
 import argparse
-import os
+import subprocess
 import time
+from typing import Optional, Dict
+
+import cv2
+import numpy as np
+from pathlib import Path
+
+# CoreML
+import coremltools as ct

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from resume_framework import ResumeFramework, format_time, print_progress
-from utils.pose_analyzer import calculate_pose_angle_v2

+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))  # directory that contains this script
+SWIFT_BIN = os.path.join(SCRIPT_DIR, "swift_processors", ".build", "debug", "swift_face")  # debug-configuration build output of swift_face
+FACENET_PATH = os.path.join(SCRIPT_DIR, "..", "models", "facenet512.mlpackage")  # CoreML model package, in ../models relative to this script

-def process_face(
-    video_path: str,
-    output_path: str,
-    uuid: str = "",
-    auto_save_interval: int = 30,
-    auto_save_frames: int = 300,
-    force_restart: bool = False,
-    sample_interval: int = 30,
-):
-    """Process video for face detection and demographics analysis with resume support"""
-
-    framework = ResumeFramework(
-        output_path=output_path,
-        processor_name="face",
-        uuid=uuid,
-        auto_save_interval=auto_save_interval,
-        auto_save_frames=auto_save_frames,
-        force_restart=force_restart,
-    )
-
-    framework.publish_info("FACE_START")
-
-    try:
-        import cv2
-        import numpy as np
-        import insightface
-    except ImportError as e:
-        error_msg = f"Missing dependency: {e.name}"
-        framework.publish_error(error_msg)
-        result = {
-            "metadata": {"status": "error", "error": error_msg},
-            "frames": {},
-        }
-        with open(output_path, "w") as f:
-            json.dump(result, f, indent=2)
-        return result
-
-    app = None
-    try:
-        framework.publish_info("LOADING_INSIGHTFACE")
-        app = insightface.app.FaceAnalysis(
-            name="buffalo_l", providers=["CPUExecutionProvider"]
-        )
-        app.prepare(ctx_id=0, det_size=(320, 320))
-        framework.publish_info("INSIGHTFACE_LOADED")
-    except Exception as e:
-        error_msg = f"InsightFace failed to load (REQUIRED): {e}"
-        framework.publish_error(error_msg)
-        result = {
-            "metadata": {"status": "error", "error": error_msg},
-            "frames": {},
-        }
-        with open(output_path, "w") as f:
-            json.dump(result, f, indent=2)
-        return result
-
-    framework.publish_info("PROCESSING_VIDEO")
-
-    cap = cv2.VideoCapture(video_path)
-
-    if not cap.isOpened():
-        print(f"Error: Cannot open video: {video_path}")
-        return {"metadata": {"status": "error"}, "frames": {}}
-
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    total_duration = total_frames / fps if fps > 0 else 0
-    cap.release()
-
-    framework.publish_info(f"fps={fps}, frames={total_frames}")
-
-    existing_data, last_checkpoint = framework.load_existing_data()
-    resume_mode = existing_data is not None and last_checkpoint > 0 and not force_restart
-
-    if resume_mode:
-        print(f"\nFound existing data: {output_path}")
-        print(f"Last processed frame: {last_checkpoint}")
-        print(f"Will resume from frame {last_checkpoint + 1}")
-
-    if resume_mode and existing_data:
-        face_data = existing_data
-        frame_count = last_checkpoint
-        processed_frames = set(int(k) for k in existing_data.get("frames", {}).keys())
-        cap = cv2.VideoCapture(video_path)
-        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
+# NOTE(review): the 15/30 thresholds look like degrees — confirm the units swift_face writes for roll/yaw.
+def classify_pose(roll: float, yaw: float) -> str:
+    """Return "frontal", "profile_left"/"profile_right" or "three_quarter" for a roll/yaw pair."""
+    abs_yaw = abs(yaw)
+    abs_roll = abs(roll)
+    if abs_yaw < 15 and abs_roll < 15:  # both magnitudes below 15
+        return "frontal"
+    elif abs_yaw > 30:  # strongly turned; the sign of yaw picks the side
+        return "profile_right" if yaw > 0 else "profile_left"
    else:
-        face_data = {
-            "metadata": framework.init_metadata(
-                video_path=video_path,
-                fps=fps,
-                width=width,
-                height=height,
-                total_frames=total_frames,
-                total_duration=total_duration,
-                extra={
-                    "sample_interval": sample_interval,
-                    "detection_method": "insightface",
-                },
-            ),
-            "frames": {},
-        }
-        frame_count = 0
-        processed_frames = set()
-        cap = cv2.VideoCapture(video_path)
+        return "three_quarter"

-    framework.set_data(face_data)

-    start_time = time.time()
-    framework.last_save_time = start_time
+class FaceProcessorVision:
+    """Detect faces with the swift_face binary, then embed each face crop with a CoreML model."""
+    def __init__(self, video_path: str, output_path: str, uuid: str = "",
+                 sample_interval: int = 30):
+        self.video_path = video_path
+        self.output_path = output_path
+        self.uuid = uuid  # forwarded to swift_face as --uuid when non-empty
+        self.sample_interval = sample_interval  # forwarded to swift_face as --sample-interval

-    print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
-    print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
-    print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
-    print("Detection method: InsightFace (REQUIRED)")
-    print()
+        self.coreml_model = None  # stays None when the model is missing or fails to load
+        facenet = os.path.normpath(FACENET_PATH)
+        if not os.path.exists(facenet):  # report it; otherwise every embedding is silently None
+            print(f"[FACE_V2] CoreML model not found: {facenet} (embeddings disabled)")
+        else:
+            try:
+                self.coreml_model = ct.models.MLModel(facenet)
+                print(f"[FACE_V2] CoreML FaceNet loaded: {facenet}")
+            except Exception as e:
+                print(f"[FACE_V2] CoreML load failed: {e}")

-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            break
+        self.video = None  # cv2.VideoCapture, created by open_video()
+        self.fps = 30.0  # placeholder until open_video() reads the real value
+        self.total_frames = self.width = self.height = 0

-        frame_count += 1
-        current_time = (frame_count - 1) / fps if fps > 0 else 0
-
-        if frame_count in processed_frames:
-            continue
-
-        if frame_count % sample_interval != 0:
-            continue
-
-        face_list = []
+    def open_video(self):
+        """Open self.video_path with OpenCV and cache fps, frame count and size on self; RuntimeError if it cannot be opened."""
+        self.video = capture = cv2.VideoCapture(self.video_path)
+        if not capture.isOpened():
+            raise RuntimeError(f"Cannot open: {self.video_path}")
+        self.fps = capture.get(cv2.CAP_PROP_FPS)
+        int_props = (cv2.CAP_PROP_FRAME_COUNT, cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT)
+        self.total_frames, self.width, self.height = (int(capture.get(prop)) for prop in int_props)
+        print(f"[FACE_V2] Video: {self.width}x{self.height}, {self.fps:.1f}fps, {self.total_frames}f")

+    def extract_face_embedding(self, face_img: np.ndarray) -> Optional[list]:
+        """Embed one face crop with the CoreML model; returns None when no model is loaded or inference fails."""
+        if self.coreml_model is None:
+            return None
        try:
-            faces = app.get(frame)
-            for face in faces:
-                bbox = face.bbox.astype(int)
-                bx, by, bw, bh = (
-                    bbox[0],
-                    bbox[1],
-                    bbox[2] - bbox[0],
-                    bbox[3] - bbox[1],
-                )
-
-                age = int(face.age) if hasattr(face, "age") else None
-                gender_val = face.gender if hasattr(face, "gender") else None
-                gender = (
-                    "female"
-                    if gender_val == 0
-                    else ("male" if gender_val == 1 else None)
-                )
-
-                embedding = None
-                if hasattr(face, "embedding"):
-                    embedding = face.embedding.tolist()
-
-                landmarks = None
-                if hasattr(face, "kps"):
-                    landmarks = face.kps.tolist()
-                elif hasattr(face, "landmark_3d_68"):
-                    landmarks = face.landmark_3d_68.tolist()
-
-                pose_angle = None
-                if landmarks and len(landmarks) >= 5:
-                    try:
-                        pose_result = calculate_pose_angle_v2(landmarks)
-                        pose_angle = {
-                            "angle": pose_result.get("angle", "unknown"),
-                            "confidence": pose_result.get("confidence", 0.0),
-                            "pitch": pose_result.get("pitch", "neutral"),
-                            "features": pose_result.get("features", {}),
-                        }
-                    except Exception:
-                        pass
-
-                face_list.append(
-                    {
-                        "x": int(bx),
-                        "y": int(by),
-                        "width": int(bw),
-                        "height": int(bh),
-                        "confidence": float(face.det_score)
-                        if hasattr(face, "det_score")
-                        else 0.9,
-                        "embedding": embedding,
-                        "landmarks": landmarks,
-                        "pose_angle": pose_angle,
-                        "attributes": {"age": age, "gender": gender},
-                    }
-                )
+            # Resize to 160x160; the crop is stretched, its aspect ratio is not preserved.
+            resized = cv2.resize(face_img, (160, 160))
+            # Scale to [-1, 1]. NOTE(review): channel order is passed through as-is (cv2 frames are normally BGR) — confirm the model does not expect RGB.
+            normalized = (resized.astype(np.float32) / 127.5) - 1.0
+            normalized = np.transpose(normalized, (2, 0, 1))  # HWC -> CHW
+            # Add batch dim: (1, 3, 160, 160)
+            input_array = np.expand_dims(normalized, axis=0)
+            result = self.coreml_model.predict({"input": input_array})
+            # Take the first output whose name starts with "var_"; if none exists the IndexError is handled below.
+            emb_key = [k for k in result.keys() if k.startswith("var_")][0]
+            emb = result[emb_key].flatten().tolist()
+            return emb
        except Exception as e:
-            print(f"[ERROR] Frame processing error: {e}")
+            print(f"[FACE_V2] Embedding error: {e}")
+            return None

-        if face_list:
-            face_data["frames"][str(frame_count)] = {
-                "frame_number": frame_count,
-                "time_seconds": round(current_time, 3),
-                "time_formatted": format_time(current_time),
-                "faces": face_list,
-            }
-            processed_frames.add(frame_count)
+    def process_with_swift(self) -> Dict:
+        """Step 1: run swift_face on the video and return its parsed JSON output (bbox + pose).
+        Raises subprocess.CalledProcessError if the build or the detection run exits non-zero."""
+        print(f"[FACE_V2] Step 1: Vision detection...")

-        if frame_count % 500 == 0:
-            elapsed = time.time() - start_time
-            print_progress(frame_count, total_frames, elapsed, f"{len(face_list)} faces")
-            framework.publish_progress(frame_count, total_frames, f"frame {frame_count}")
+        if not os.path.exists(SWIFT_BIN):  # build swift_face only when the binary is absent
+            build_dir = os.path.join(SCRIPT_DIR, "swift_processors")
+            print(f"[FACE_V2] Building swift_face in {build_dir}...")
+            subprocess.run(
+                ["swift", "build", "-c", "debug", "--product", "swift_face"],
+                cwd=build_dir, check=True
+            )

-        if framework.should_auto_save(frame_count):
-            framework.save_progress(frame_count, silent=True)
+        swift_out = self.output_path.replace(".json", "_detect.json")  # temp file; main() derives the same name to delete it afterwards
+        cmd = [
+            SWIFT_BIN,
+            self.video_path,
+            swift_out,
+            "--sample-interval", str(self.sample_interval),
+        ]
+        if self.uuid:
+            cmd.extend(["--uuid", self.uuid])

-    cap.release()
+        print(f"[FACE_V2] Running: {' '.join(cmd)}")
+        t0 = time.time()
+        subprocess.run(cmd, check=True)  # output is not captured; only the JSON file is read back
+        elapsed = time.time() - t0
+        print(f"[FACE_V2] Detection done in {elapsed:.1f}s")

-    total_processed = len(processed_frames)
+        with open(swift_out) as f:
+            return json.load(f)

-    framework.finalize(
-        total_processed=total_processed,
-        extra_metadata={
-            "sample_interval": sample_interval,
-            "detection_method": "insightface",
-        },
+    def embed_and_save(self, detection_data: Dict):
+        """Step 2: crop each detected face, embed it and write the result JSON.
+
+        The video is opened here and always released again, even on errors.
+        Each frame is decoded at most once, however many faces it contains.
+
+        Args:
+            detection_data: parsed swift_face output. Its "frames" list holds
+                dicts with "frame", optional "timestamp" and "faces"; a face
+                needs "bbox" (x/y/width/height) and may carry "pose",
+                "confidence" and "lips".
+
+        Writes {"frame_count", "fps", "frames"} to self.output_path. Frames
+        with no usable face are omitted; a face whose embedding could not be
+        computed is kept with "embedding": null.
+
+        Raises:
+            RuntimeError: if the video cannot be opened.
+        """
+        print(f"[FACE_V2] Step 2: CoreML embedding...")
+
+        frames = detection_data.get("frames", [])
+        self.open_video()
+
+        frames_by_number = {}  # frame number -> output entry
+        t0 = time.time()
+        embed_count = 0
+
+        try:
+            for frame_info in frames:
+                frame_num = frame_info["frame"]
+                frame = None  # decoded lazily, at most once per frame
+                faces = []
+                for face in frame_info.get("faces", []):
+                    # NOTE(review): assumes bbox holds integer pixel coordinates of the decoded
+                    # frame (they are used directly as slice indices) — confirm against swift_face.
+                    bb = face["bbox"]
+                    x, y, w, h = bb["x"], bb["y"], bb["width"], bb["height"]
+
+                    if w <= 10 or h <= 10:
+                        continue  # skip tiny faces
+
+                    if frame is None:
+                        # Seek and decode once; all faces of this frame share the image.
+                        self.video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
+                        ret, frame = self.video.read()
+                        if not ret:
+                            break  # unreadable frame: none of its faces can be cropped
+
+                    # Crop face, clamped to the frame bounds
+                    x1, y1 = max(0, x), max(0, y)
+                    x2, y2 = min(self.width, x + w), min(self.height, y + h)
+                    if x2 <= x1 or y2 <= y1:
+                        continue
+                    face_img = frame[y1:y2, x1:x2]
+                    if face_img.size == 0:
+                        continue
+
+                    # CoreML embedding
+                    emb = self.extract_face_embedding(face_img)
+                    if emb is not None:
+                        embed_count += 1
+
+                    # Pose classification ("pose" may be missing or null)
+                    pose_info = face.get("pose") or {}
+                    roll, yaw = pose_info.get("roll", 0), pose_info.get("yaw", 0)
+
+                    faces.append({
+                        "x": x, "y": y, "width": w, "height": h,
+                        "confidence": face.get("confidence", 0.5),
+                        "embedding": emb,
+                        "pose_angle": {
+                            "angle": classify_pose(roll, yaw),
+                            "roll": roll,
+                            "yaw": yaw,
+                            "pitch": pose_info.get("pitch", 0),
+                        },
+                        "lips": face.get("lips"),
+                        "landmarks": None,
+                        "attributes": None,
+                    })
+
+                if not faces:
+                    continue
+
+                # Guard the fallback: fps can be 0 when OpenCV cannot determine it.
+                fallback_ts = frame_num / self.fps if self.fps > 0 else 0.0
+                frames_by_number[int(frame_num)] = {
+                    "frame": int(frame_num),
+                    "timestamp": frame_info.get("timestamp", fallback_ts),
+                    "faces": faces,
+                }
+
+                # Report only right after a frame was stored, so one count is not logged repeatedly.
+                if len(frames_by_number) % 100 == 0:
+                    elapsed = time.time() - t0
+                    print(f"[FACE_V2] {len(frames_by_number)} frames, {embed_count} embeddings, {elapsed:.0f}s")
+        finally:
+            self.video.release()
+
+        # List sorted by frame number, the layout of the Rust FaceResult format
+        frames_list = [frames_by_number[num] for num in sorted(frames_by_number)]
+        output = {
+            "frame_count": len(frames_list),
+            "fps": self.fps,
+            "frames": frames_list,
+        }
+
+        # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding.
+        with open(self.output_path, "w", encoding="utf-8") as f:
+            json.dump(output, f, indent=2, ensure_ascii=False)
+
+        elapsed = time.time() - t0
+        print(f"[FACE_V2] Done: {len(frames_list)} frames, {embed_count} embeddings, {elapsed:.0f}s")
+
+
+def main():
+    """CLI entry point: run swift_face detection, embed the faces, then delete the temporary detection file."""
+    parser = argparse.ArgumentParser(description="Apple Vision Face Processor V2")
+    parser.add_argument("video_path", help="Video file path")
+    parser.add_argument("output_path", help="Output JSON path")
+    parser.add_argument("--uuid", "-u", default="", help="Forwarded to swift_face as --uuid")
+    parser.add_argument("--sample-interval", type=int, default=30, help="Forwarded to swift_face as --sample-interval")
+    parser.add_argument("--force", action="store_true", help="Remove an existing output file before processing")
+    args = parser.parse_args()
+
+    if args.force and os.path.exists(args.output_path):
+        os.remove(args.output_path)
+
+    processor = FaceProcessorVision(
+        args.video_path, args.output_path,
+        args.uuid, args.sample_interval
    )

-    print(f"\nFace detection completed: {total_processed} frames processed")
-    print(f"Frames with faces: {len(face_data['frames'])}")
+    detection = processor.process_with_swift()  # Step 1: Vision detection (bbox + pose via ANE)

-    return face_data
+    # Step 2: CoreML embedding + save
+    processor.embed_and_save(detection)

-
-def _convert_to_face_result(face_data: dict) -> dict:
-    """Convert ResumeFramework output to FaceResult format expected by Rust."""
-    metadata = face_data.get("metadata", {})
-    raw_frames = face_data.get("frames", {})
-    fps = metadata.get("fps", 30.0)
-    frames = []
-    for frame_key in sorted(raw_frames.keys(), key=lambda k: int(k)):
-        f = raw_frames[frame_key]
-        faces = []
-        for raw_face in f.get("faces", []):
-            pose = raw_face.get("pose_angle")
-            attributes = raw_face.get("attributes", {})
-            face = {
-                "face_id": None,
-                "x": raw_face["x"],
-                "y": raw_face["y"],
-                "width": raw_face["width"],
-                "height": raw_face["height"],
-                "confidence": raw_face.get("confidence", 0.0),
-                "embedding": raw_face.get("embedding"),
-                "landmarks": raw_face.get("landmarks"),
-                "attributes": {
-                    "age": attributes.get("age") if attributes else None,
-                    "gender": attributes.get("gender") if attributes else None,
-                },
-            }
-            faces.append(face)
-        frames.append({
-            "frame": f["frame_number"],
-            "timestamp": f["time_seconds"],
-            "faces": faces,
-        })
-    return {
-        "frame_count": len(frames),
-        "fps": fps,
-        "frames": frames,
-    }
+    # The temp name equals output_path when that path contains no ".json"; never delete the result itself.
+    swift_out = args.output_path.replace(".json", "_detect.json")
+    if swift_out != args.output_path and os.path.exists(swift_out):
+        os.remove(swift_out)


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Face Detection & Demographics with Resume Support")
-    parser.add_argument("video_path", help="Path to video file")
-    parser.add_argument("output_path", help="Output JSON path")
-    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
-    parser.add_argument(
-        "--auto-save-interval",
-        "-a",
-        help="Auto-save interval in seconds",
-        type=int,
-        default=30,
-    )
-    parser.add_argument(
-        "--auto-save-frames",
-        "-f",
-        help="Auto-save interval in frames",
-        type=int,
-        default=300,
-    )
-    parser.add_argument(
-        "--force-restart",
-        "-r",
-        help="Force restart (ignore existing data)",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--sample-interval",
-        "-s",
-        help="Frame sample interval",
-        type=int,
-        default=5,
-    )
-    args = parser.parse_args()
-
-    result = process_face(
-        args.video_path,
-        args.output_path,
-        args.uuid,
-        args.auto_save_interval,
-        args.auto_save_frames,
-        args.force_restart,
-        args.sample_interval,
-    )
-    face_result = _convert_to_face_result(result)
-    with open(args.output_path, "w") as f:
-        json.dump(face_result, f, indent=2)
+    main()