feat: media API (video/bbox/thumbnail), UUID unification, dot matrix text, portal fixes, API dictionary V1.3

This commit is contained in:
Warren
2026-05-06 13:34:49 +08:00
parent e75c4d6f07
commit 74b6182eba
197 changed files with 17511 additions and 8759 deletions

552
scripts/face_processor.py Executable file → Normal file
View File

@@ -1,341 +1,283 @@
#!/opt/homebrew/bin/python3.11
"""
Face Processor - Face Detection & Demographics with Resume Support
Uses InsightFace for detection, age, gender, and embedding extraction.
Face Processor V2 - Apple Vision detection + CoreML FaceNet embedding
IMPORTANT: InsightFace is REQUIRED. No Haar fallback.
- InsightFace provides 512-dim ArcFace embedding for identity matching
- Haar Cascade cannot generate embedding, only detection
- If InsightFace fails, processor will ERROR and exit
Flow:
1. swift_face (Vision/ANE) → bbox + pose per frame
2. cv2 opens video, crops faces from bbox
3. CoreML FaceNet → 512D embedding per face
4. Output face.json in standard format
Resume Feature:
- Auto-detect existing results and resume from last frame
- Auto-save at configurable intervals (default: 30 seconds)
- Ctrl+C gracefully saves and exits
Replaces face_processor.py (no more InsightFace CPU detection).
Detection cost: near-zero CPU (Vision ANE)
Embedding cost: near-zero CPU (CoreML ANE)
"""
import sys
import os
import json
import argparse
import os
import subprocess
import time
from typing import Optional, Dict
import cv2
import numpy as np
from pathlib import Path
# CoreML
import coremltools as ct
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from resume_framework import ResumeFramework, format_time, print_progress
from utils.pose_analyzer import calculate_pose_angle_v2
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
SWIFT_BIN = os.path.join(SCRIPT_DIR, "swift_processors", ".build", "debug", "swift_face")
FACENET_PATH = os.path.join(SCRIPT_DIR, "..", "models", "facenet512.mlpackage")
def process_face(
video_path: str,
output_path: str,
uuid: str = "",
auto_save_interval: int = 30,
auto_save_frames: int = 300,
force_restart: bool = False,
sample_interval: int = 30,
):
"""Process video for face detection and demographics analysis with resume support"""
framework = ResumeFramework(
output_path=output_path,
processor_name="face",
uuid=uuid,
auto_save_interval=auto_save_interval,
auto_save_frames=auto_save_frames,
force_restart=force_restart,
)
framework.publish_info("FACE_START")
try:
import cv2
import numpy as np
import insightface
except ImportError as e:
error_msg = f"Missing dependency: {e.name}"
framework.publish_error(error_msg)
result = {
"metadata": {"status": "error", "error": error_msg},
"frames": {},
}
with open(output_path, "w") as f:
json.dump(result, f, indent=2)
return result
app = None
try:
framework.publish_info("LOADING_INSIGHTFACE")
app = insightface.app.FaceAnalysis(
name="buffalo_l", providers=["CPUExecutionProvider"]
)
app.prepare(ctx_id=0, det_size=(320, 320))
framework.publish_info("INSIGHTFACE_LOADED")
except Exception as e:
error_msg = f"InsightFace failed to load (REQUIRED): {e}"
framework.publish_error(error_msg)
result = {
"metadata": {"status": "error", "error": error_msg},
"frames": {},
}
with open(output_path, "w") as f:
json.dump(result, f, indent=2)
return result
framework.publish_info("PROCESSING_VIDEO")
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
print(f"Error: Cannot open video: {video_path}")
return {"metadata": {"status": "error"}, "frames": {}}
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
total_duration = total_frames / fps if fps > 0 else 0
cap.release()
framework.publish_info(f"fps={fps}, frames={total_frames}")
existing_data, last_checkpoint = framework.load_existing_data()
resume_mode = existing_data is not None and last_checkpoint > 0 and not force_restart
if resume_mode:
print(f"\nFound existing data: {output_path}")
print(f"Last processed frame: {last_checkpoint}")
print(f"Will resume from frame {last_checkpoint + 1}")
if resume_mode and existing_data:
face_data = existing_data
frame_count = last_checkpoint
processed_frames = set(int(k) for k in existing_data.get("frames", {}).keys())
cap = cv2.VideoCapture(video_path)
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
# Pose angle classification from roll/yaw
def classify_pose(roll: float, yaw: float) -> str:
"""Convert roll/yaw to pose angle label"""
abs_yaw = abs(yaw)
abs_roll = abs(roll)
if abs_yaw < 15 and abs_roll < 15:
return "frontal"
elif abs_yaw > 30:
return "profile_right" if yaw > 0 else "profile_left"
else:
face_data = {
"metadata": framework.init_metadata(
video_path=video_path,
fps=fps,
width=width,
height=height,
total_frames=total_frames,
total_duration=total_duration,
extra={
"sample_interval": sample_interval,
"detection_method": "insightface",
},
),
"frames": {},
}
frame_count = 0
processed_frames = set()
cap = cv2.VideoCapture(video_path)
return "three_quarter"
framework.set_data(face_data)
start_time = time.time()
framework.last_save_time = start_time
class FaceProcessorVision:
def __init__(self, video_path: str, output_path: str, uuid: str = "",
sample_interval: int = 30):
self.video_path = video_path
self.output_path = output_path
self.uuid = uuid
self.sample_interval = sample_interval
print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
print("Detection method: InsightFace (REQUIRED)")
print()
# Load CoreML FaceNet
self.coreml_model = None
facenet = os.path.normpath(FACENET_PATH)
if os.path.exists(facenet):
try:
self.coreml_model = ct.models.MLModel(facenet)
print(f"[FACE_V2] CoreML FaceNet loaded: {facenet}")
except Exception as e:
print(f"[FACE_V2] CoreML load failed: {e}")
while True:
ret, frame = cap.read()
if not ret:
break
self.video = None
self.fps = 30.0
self.total_frames = 0
self.width = 0
self.height = 0
frame_count += 1
current_time = (frame_count - 1) / fps if fps > 0 else 0
if frame_count in processed_frames:
continue
if frame_count % sample_interval != 0:
continue
face_list = []
def open_video(self):
    """Open ``self.video_path`` with OpenCV and cache its basic properties.

    Populates ``self.video`` (the capture handle), ``self.fps``,
    ``self.total_frames``, ``self.width`` and ``self.height``.

    Raises:
        RuntimeError: if OpenCV cannot open the file.
    """
    self.video = cv2.VideoCapture(self.video_path)
    if not self.video.isOpened():
        raise RuntimeError(f"Cannot open: {self.video_path}")

    # Some containers/backends report no frame rate (0.0).  Later code
    # divides by self.fps, so fall back to the class default of 30 fps
    # instead of storing a value that would raise ZeroDivisionError.
    reported_fps = self.video.get(cv2.CAP_PROP_FPS)
    self.fps = reported_fps if reported_fps > 0 else 30.0

    self.total_frames = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))
    self.width = int(self.video.get(cv2.CAP_PROP_FRAME_WIDTH))
    self.height = int(self.video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    print(f"[FACE_V2] Video: {self.width}x{self.height}, {self.fps:.1f}fps, {self.total_frames}f")
def extract_face_embedding(self, face_img: np.ndarray) -> Optional[list]:
"""Run CoreML FaceNet on cropped face"""
if self.coreml_model is None:
return None
try:
faces = app.get(frame)
for face in faces:
bbox = face.bbox.astype(int)
bx, by, bw, bh = (
bbox[0],
bbox[1],
bbox[2] - bbox[0],
bbox[3] - bbox[1],
)
age = int(face.age) if hasattr(face, "age") else None
gender_val = face.gender if hasattr(face, "gender") else None
gender = (
"female"
if gender_val == 0
else ("male" if gender_val == 1 else None)
)
embedding = None
if hasattr(face, "embedding"):
embedding = face.embedding.tolist()
landmarks = None
if hasattr(face, "kps"):
landmarks = face.kps.tolist()
elif hasattr(face, "landmark_3d_68"):
landmarks = face.landmark_3d_68.tolist()
pose_angle = None
if landmarks and len(landmarks) >= 5:
try:
pose_result = calculate_pose_angle_v2(landmarks)
pose_angle = {
"angle": pose_result.get("angle", "unknown"),
"confidence": pose_result.get("confidence", 0.0),
"pitch": pose_result.get("pitch", "neutral"),
"features": pose_result.get("features", {}),
}
except Exception:
pass
face_list.append(
{
"x": int(bx),
"y": int(by),
"width": int(bw),
"height": int(bh),
"confidence": float(face.det_score)
if hasattr(face, "det_score")
else 0.9,
"embedding": embedding,
"landmarks": landmarks,
"pose_angle": pose_angle,
"attributes": {"age": age, "gender": gender},
}
)
# Resize to 160x160
resized = cv2.resize(face_img, (160, 160))
# Convert HWC to CHW and normalize to [-1, 1]
normalized = (resized.astype(np.float32) / 127.5) - 1.0
normalized = np.transpose(normalized, (2, 0, 1)) # HWC -> CHW
# Add batch dim: (1, 3, 160, 160)
input_array = np.expand_dims(normalized, axis=0)
result = self.coreml_model.predict({"input": input_array})
# Find output key (var_xxx)
emb_key = [k for k in result.keys() if k.startswith("var_")][0]
emb = result[emb_key].flatten().tolist()
return emb
except Exception as e:
print(f"[ERROR] Frame processing error: {e}")
print(f"[FACE_V2] Embedding error: {e}")
return None
if face_list:
face_data["frames"][str(frame_count)] = {
"frame_number": frame_count,
"time_seconds": round(current_time, 3),
"time_formatted": format_time(current_time),
"faces": face_list,
}
processed_frames.add(frame_count)
def process_with_swift(self) -> Dict:
"""Step 1: Run swift_face to get bbox + pose"""
print(f"[FACE_V2] Step 1: Vision detection...")
if frame_count % 500 == 0:
elapsed = time.time() - start_time
print_progress(frame_count, total_frames, elapsed, f"{len(face_list)} faces")
framework.publish_progress(frame_count, total_frames, f"frame {frame_count}")
# Build swift_face if needed
if not os.path.exists(SWIFT_BIN):
build_dir = os.path.join(SCRIPT_DIR, "swift_processors")
print(f"[FACE_V2] Building swift_face in {build_dir}...")
subprocess.run(
["swift", "build", "-c", "debug", "--product", "swift_face"],
cwd=build_dir, check=True
)
if framework.should_auto_save(frame_count):
framework.save_progress(frame_count, silent=True)
swift_out = self.output_path.replace(".json", "_detect.json")
cmd = [
SWIFT_BIN,
self.video_path,
swift_out,
"--sample-interval", str(self.sample_interval),
]
if self.uuid:
cmd.extend(["--uuid", self.uuid])
cap.release()
print(f"[FACE_V2] Running: {' '.join(cmd)}")
t0 = time.time()
subprocess.run(cmd, check=True)
elapsed = time.time() - t0
print(f"[FACE_V2] Detection done in {elapsed:.1f}s")
total_processed = len(processed_frames)
with open(swift_out) as f:
return json.load(f)
framework.finalize(
total_processed=total_processed,
extra_metadata={
"sample_interval": sample_interval,
"detection_method": "insightface",
},
def embed_and_save(self, detection_data: Dict):
    """Step 2: Crop faces + CoreML embedding + save face.json.

    Args:
        detection_data: Detection result dict.  Its ``"frames"`` entry is a
            list of per-frame dicts, each with ``"frame"`` (frame index),
            an optional ``"timestamp"`` and a ``"faces"`` list.  Every face
            carries a ``"bbox"`` dict (``x``, ``y``, ``width``, ``height``)
            and optionally ``"confidence"``, ``"pose"`` (``roll``/``yaw``/
            ``pitch``) and ``"lips"``.

    Side effects:
        Opens the video via ``self.open_video()``, always releases it again,
        and writes a JSON document with the keys ``frame_count``, ``fps``
        and ``frames`` (sorted by frame number) to ``self.output_path``.

    Faces whose bbox is 10 px or smaller in either dimension, or whose
    clamped crop is empty, are skipped.  Frames that cannot be decoded or
    that end up with no usable face are omitted from the output.
    """
    print("[FACE_V2] Step 2: CoreML embedding...")

    frames = detection_data.get("frames") or []
    self.open_video()

    # Guard against a capture that reported no frame rate; 30.0 mirrors the
    # class default.  Only used when a frame lacks its own timestamp.
    fps = self.fps if self.fps and self.fps > 0 else 30.0

    frames_by_number = {}
    embed_count = 0
    t0 = time.time()

    try:
        for frame_info in frames:
            frame_num = frame_info["frame"]

            # Drop tiny detections first so frames without any usable face
            # never trigger a seek/decode.
            detections = [
                face for face in frame_info.get("faces") or []
                if face["bbox"]["width"] > 10 and face["bbox"]["height"] > 10
            ]
            if not detections:
                continue

            # Seek and decode ONCE per frame; every face on this frame is
            # cropped from the same decoded image.
            self.video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = self.video.read()
            if not ret:
                continue

            faces = []
            for face in detections:
                bb = face["bbox"]
                # NOTE(review): bbox values are used directly as pixel slice
                # indices, so they are assumed to be integer pixel
                # coordinates -- confirm against the swift_face output.
                x, y, w, h = bb["x"], bb["y"], bb["width"], bb["height"]

                # Clamp the crop rectangle to the frame bounds.
                x1, y1 = max(0, x), max(0, y)
                x2, y2 = min(self.width, x + w), min(self.height, y + h)
                if x2 <= x1 or y2 <= y1:
                    continue

                face_img = frame[y1:y2, x1:x2]
                if face_img.size == 0:
                    continue

                # The embedding may be None (model missing or inference
                # failed); the face is still recorded without it.
                emb = self.extract_face_embedding(face_img)
                if emb is not None:
                    embed_count += 1

                # Tolerate both a missing and an explicit null "pose".
                pose_info = face.get("pose") or {}
                pose_angle = classify_pose(
                    pose_info.get("roll", 0),
                    pose_info.get("yaw", 0),
                )

                faces.append({
                    "x": x, "y": y, "width": w, "height": h,
                    "confidence": face.get("confidence", 0.5),
                    "embedding": emb,
                    "pose_angle": {
                        "angle": pose_angle,
                        "roll": pose_info.get("roll", 0),
                        "yaw": pose_info.get("yaw", 0),
                        "pitch": pose_info.get("pitch", 0),
                    },
                    "lips": face.get("lips"),
                    "landmarks": None,
                    "attributes": None,
                })

            if not faces:
                continue

            # Compute the fallback timestamp lazily so the division only
            # happens when the detector did not supply one.
            if "timestamp" in frame_info:
                timestamp = frame_info["timestamp"]
            else:
                timestamp = frame_num / fps

            # Keyed by frame number: a repeated frame index overwrites the
            # earlier entry, and sorting for the final list is trivial.
            frames_by_number[frame_num] = {
                "frame": frame_num,
                "timestamp": timestamp,
                "faces": faces,
            }

            if len(frames_by_number) % 100 == 0:
                elapsed = time.time() - t0
                print(f"[FACE_V2] {len(frames_by_number)} frames, {embed_count} embeddings, {elapsed:.0f}s")
    finally:
        # Release the capture even if detection data is malformed or the
        # embedding step raises.
        self.video.release()

    # Frame list in ascending frame order (FaceResult layout read by Rust).
    frames_list = [frames_by_number[num] for num in sorted(frames_by_number)]

    output = {
        "frame_count": len(frames_list),
        "fps": self.fps,
        "frames": frames_list,
    }

    # ensure_ascii=False can emit non-ASCII text, so pin the file encoding
    # instead of relying on the locale default.
    with open(self.output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    elapsed = time.time() - t0
    print(f"[FACE_V2] Done: {len(frames_list)} frames, {embed_count} embeddings, {elapsed:.0f}s")
def main():
parser = argparse.ArgumentParser(description="Apple Vision Face Processor V2")
parser.add_argument("video_path", help="Video file path")
parser.add_argument("output_path", help="Output JSON path")
parser.add_argument("--uuid", "-u", default="")
parser.add_argument("--sample-interval", type=int, default=30)
parser.add_argument("--force", action="store_true")
args = parser.parse_args()
if args.force and os.path.exists(args.output_path):
os.remove(args.output_path)
processor = FaceProcessorVision(
args.video_path, args.output_path,
args.uuid, args.sample_interval
)
print(f"\nFace detection completed: {total_processed} frames processed")
print(f"Frames with faces: {len(face_data['frames'])}")
# Step 1: Vision detection (bbox + pose via ANE)
detection = processor.process_with_swift()
return face_data
# Step 2: CoreML embedding + save
processor.embed_and_save(detection)
def _convert_to_face_result(face_data: dict) -> dict:
    """Convert ResumeFramework output to FaceResult format expected by Rust.

    Args:
        face_data: Dict with a ``"metadata"`` dict (``"fps"`` is read from
            it) and a ``"frames"`` dict keyed by frame number as a string.
            Each frame holds ``"frame_number"``, ``"time_seconds"`` and a
            ``"faces"`` list.

    Returns:
        ``{"frame_count": int, "fps": float, "frames": [...]}`` where frames
        are ordered by numeric frame key.  Each face keeps its box,
        confidence, embedding, landmarks and age/gender attributes;
        ``face_id`` is always ``None`` and pose information is not carried
        over.  ``fps`` defaults to 30.0 when metadata has none.

    Raises:
        KeyError: if a frame lacks ``frame_number``/``time_seconds`` or a
            face lacks one of its box coordinates.
    """
    # ``or {}`` / ``or []`` also covers keys that are present but null.
    metadata = face_data.get("metadata") or {}
    raw_frames = face_data.get("frames") or {}

    frames = []
    # Keys are strings; sort numerically so "10" comes after "2".
    for frame_key in sorted(raw_frames, key=int):
        raw_frame = raw_frames[frame_key]
        faces = []
        for raw_face in raw_frame.get("faces") or []:
            attributes = raw_face.get("attributes") or {}
            faces.append({
                "face_id": None,
                "x": raw_face["x"],
                "y": raw_face["y"],
                "width": raw_face["width"],
                "height": raw_face["height"],
                "confidence": raw_face.get("confidence", 0.0),
                "embedding": raw_face.get("embedding"),
                "landmarks": raw_face.get("landmarks"),
                "attributes": {
                    "age": attributes.get("age"),
                    "gender": attributes.get("gender"),
                },
            })
        frames.append({
            "frame": raw_frame["frame_number"],
            "timestamp": raw_frame["time_seconds"],
            "faces": faces,
        })

    return {
        "frame_count": len(frames),
        "fps": metadata.get("fps", 30.0),
        "frames": frames,
    }
# Clean up temp detection file
swift_out = args.output_path.replace(".json", "_detect.json")
if os.path.exists(swift_out):
os.remove(swift_out)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Face Detection & Demographics with Resume Support")
parser.add_argument("video_path", help="Path to video file")
parser.add_argument("output_path", help="Output JSON path")
parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
parser.add_argument(
"--auto-save-interval",
"-a",
help="Auto-save interval in seconds",
type=int,
default=30,
)
parser.add_argument(
"--auto-save-frames",
"-f",
help="Auto-save interval in frames",
type=int,
default=300,
)
parser.add_argument(
"--force-restart",
"-r",
help="Force restart (ignore existing data)",
action="store_true",
)
parser.add_argument(
"--sample-interval",
"-s",
help="Frame sample interval",
type=int,
default=5,
)
args = parser.parse_args()
result = process_face(
args.video_path,
args.output_path,
args.uuid,
args.auto_save_interval,
args.auto_save_frames,
args.force_restart,
args.sample_interval,
)
face_result = _convert_to_face_result(result)
with open(args.output_path, "w") as f:
json.dump(face_result, f, indent=2)
main()