feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/utils/face_tracker.py
+++ b/scripts/utils/face_tracker.py
@@ -0,0 +1,452 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Face Tracker - Track faces across frames using embedding similarity and bbox proximity
+
+Purpose:
+1. Assign unique trace_id to each face across frames
+2. Track face movement across adjacent frames
+3. Output trace statistics (duration, path, confidence)
+
+Algorithm:
+1. For first frame: assign new trace_id to each face
+2. For subsequent frames:
+   - Calculate bbox overlap with previous frame faces
+   - Calculate embedding cosine similarity
+   - Match faces if both conditions met
+   - Assign same trace_id if matched, new trace_id if not
+
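+Matching Conditions (default thresholds):
+- bbox overlap (IoU) > 0.3 AND embedding similarity > 0.7
+- OR, as a fallback, one strong signal alone: IoU > 0.5, or similarity > 0.85
+- OR, as a last resort, bbox center distance < 100 AND similarity > 0.6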
+
+Output:
+- A copy of face.json (default name: <input>_traced.json) with trace_id added to each face and a top-level "traces" summary
+- trace statistics report
+"""
+
+import sys
+import json
+import argparse
+import numpy as np
+from typing import Dict, List, Optional, Tuple
+from collections import defaultdict
+
+
+def calculate_bbox_iou(bbox1: Dict, bbox2: Dict) -> float:
+    """
+    Compute the Intersection over Union (IoU) of two axis-aligned boxes.
+
+    Args:
+        bbox1: Box as {"x", "y", "width", "height"}; it spans x..x+width
+            horizontally and y..y+height vertically.
+        bbox2: Second box, same layout as bbox1.
+
+    Returns:
+        Intersection area divided by union area. 0.0 when the boxes do not
+        overlap (boxes that merely touch count as non-overlapping) or when
+        the union area is not positive.
+    """
+    left_a, top_a = bbox1["x"], bbox1["y"]
+    width_a, height_a = bbox1["width"], bbox1["height"]
+    left_b, top_b = bbox2["x"], bbox2["y"]
+    width_b, height_b = bbox2["width"], bbox2["height"]
+
+    # Extent of the overlapping rectangle along each axis; a non-positive
+    # extent on either axis means the boxes are disjoint.
+    overlap_w = min(left_a + width_a, left_b + width_b) - max(left_a, left_b)
+    overlap_h = min(top_a + height_a, top_b + height_b) - max(top_a, top_b)
+    if overlap_w <= 0 or overlap_h <= 0:
+        return 0.0
+
+    intersection = overlap_w * overlap_h
+    union = width_a * height_a + width_b * height_b - intersection
+    if union > 0:
+        return intersection / union
+    return 0.0
+
+
+def calculate_bbox_distance(bbox1: Dict, bbox2: Dict) -> float:
+    """
+    Measure how far apart the centers of two boxes are.
+
+    Args:
+        bbox1: Box as {"x", "y", "width", "height"}.
+        bbox2: Second box, same layout as bbox1.
+
+    Returns:
+        Euclidean distance between the two box centers.
+    """
+    def center(box: Dict) -> Tuple[float, float]:
+        # Center = origin corner plus half the extent on each axis.
+        return box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
+
+    (ax, ay), (bx, by) = center(bbox1), center(bbox2)
+    dx = ax - bx
+    dy = ay - by
+    return np.sqrt(dx ** 2 + dy ** 2)
+
+
+def calculate_embedding_similarity(emb1: List[float], emb2: List[float]) -> float:
+    """
+    Compute the cosine similarity of two embedding vectors.
+
+    Args:
+        emb1: First embedding, or None when unavailable.
+        emb2: Second embedding, or None when unavailable.
+
+    Returns:
+        Cosine similarity (-1.0 - 1.0). 0.0 when either embedding is None
+        or has zero length (norm), since no direction can be compared.
+    """
+    if emb1 is None or emb2 is None:
+        return 0.0
+
+    vec_a, vec_b = np.array(emb1), np.array(emb2)
+    magnitudes = [np.linalg.norm(vec) for vec in (vec_a, vec_b)]
+
+    # A zero vector has no direction; avoid dividing by zero.
+    if any(magnitude == 0 for magnitude in magnitudes):
+        return 0.0
+
+    return np.dot(vec_a, vec_b) / (magnitudes[0] * magnitudes[1])
+
+
+def match_faces(
+    current_faces: List[Dict],
+    previous_faces: List[Dict],
+    iou_threshold: float = 0.3,
+    similarity_threshold: float = 0.7,
+    distance_threshold: float = 100.0,
+    use_embedding: bool = True,
+) -> Dict[int, int]:
+    """
+    Match current frame faces to previous frame faces, one-to-one.
+
+    Every (current, previous) pair is scored first; pairs are then accepted
+    from the highest score downward. A previous face therefore goes to the
+    current face that matches it best, not to whichever current face happens
+    to be listed first.
+
+    Scoring (first rule that applies; no rule means the pair cannot match):
+    1. IoU > iou_threshold and similarity > similarity_threshold
+       -> iou + similarity
+    2. IoU > 0.5 -> iou * 2
+    3. similarity > 0.85 -> similarity * 2
+    4. center distance < distance_threshold and similarity > 0.6
+       -> similarity - distance / 1000
+
+    Args:
+        current_faces: Faces in current frame; each needs "x", "y", "width",
+            "height" and may carry an "embedding".
+        previous_faces: Faces in previous frame, same layout.
+        iou_threshold: Minimum IoU for the combined IoU + similarity rule
+        similarity_threshold: Minimum embedding similarity for that rule
+        distance_threshold: Maximum bbox center distance for rule 4
+        use_embedding: Whether to use embedding similarity; when False the
+            similarity is treated as 0.0, so only rule 2 can match.
+
+    Returns:
+        Dict mapping current_face_index -> previous_face_index (or -1 if new),
+        with keys in current-face order.
+    """
+    matches = {curr_idx: -1 for curr_idx in range(len(current_faces))}
+    if not previous_faces:
+        return matches
+
+    # Score every pair up front so the assignment does not depend on the
+    # order in which current faces are listed.
+    candidates = []
+    for curr_idx, curr_face in enumerate(current_faces):
+        curr_emb = curr_face.get("embedding")
+
+        for prev_idx, prev_face in enumerate(previous_faces):
+            prev_emb = prev_face.get("embedding")
+
+            # The bbox helpers only read x/y/width/height, so the face dicts
+            # can be passed to them directly.
+            iou = calculate_bbox_iou(curr_face, prev_face)
+
+            similarity = 0.0
+            if use_embedding and curr_emb and prev_emb:
+                similarity = calculate_embedding_similarity(curr_emb, prev_emb)
+
+            score = 0.0
+            if iou > iou_threshold and similarity > similarity_threshold:
+                score = iou + similarity
+            elif iou > 0.5:
+                score = iou * 2
+            elif similarity > 0.85:
+                score = similarity * 2
+            elif similarity > 0.6:
+                # Distance is only needed for this last-resort rule.
+                distance = calculate_bbox_distance(curr_face, prev_face)
+                if distance < distance_threshold:
+                    score = similarity - distance / 1000
+
+            if score > 0:
+                candidates.append((score, curr_idx, prev_idx))
+
+    # Best score first; ties resolved by the lowest indices for determinism.
+    candidates.sort(key=lambda item: (-item[0], item[1], item[2]))
+
+    used_curr = set()
+    used_prev = set()
+    for _score, curr_idx, prev_idx in candidates:
+        if curr_idx in used_curr or prev_idx in used_prev:
+            continue
+        matches[curr_idx] = prev_idx
+        used_curr.add(curr_idx)
+        used_prev.add(prev_idx)
+
+    return matches
+
+
+def track_faces(
+    face_data: Dict,
+    iou_threshold: float = 0.3,
+    similarity_threshold: float = 0.7,
+    distance_threshold: float = 100.0,
+    use_embedding: bool = True,
+) -> Dict:
+    """
+    Track faces across all frames and attach per-trace summaries.
+
+    Frames are visited in ascending numeric order of their key. Faces in a
+    frame are matched against the faces of the frame visited immediately
+    before it; a frame without faces clears that history, so a trace never
+    continues across an empty frame. face_data is modified in place and
+    also returned.
+
+    Args:
+        face_data: face.json data with a "frames" dict keyed by frame number
+            (as a string); each frame may hold a "faces" list.
+        iou_threshold: IoU threshold for matching
+        similarity_threshold: Embedding similarity threshold
+        distance_threshold: Distance threshold for matching
+        use_embedding: Whether to use embedding
+
+    Returns:
+        face_data with "trace_id" set on each face, a "traces" dict keyed by
+        str(trace_id), and metadata["trace_stats"]. Returned unchanged when
+        there are no frames.
+    """
+    frames = face_data.get("frames", {})
+
+    if not frames:
+        print("No frames found in face.json")
+        return face_data
+
+    # Tolerate a missing or null "metadata" section instead of raising
+    # KeyError after all the tracking work has been done.
+    metadata = face_data.get("metadata")
+    if not isinstance(metadata, dict):
+        metadata = {}
+        face_data["metadata"] = metadata
+
+    # duration_seconds needs a positive numeric fps; otherwise report 0.0.
+    fps = metadata.get("fps")
+    if isinstance(fps, bool) or not isinstance(fps, (int, float)) or fps <= 0:
+        print("Warning: metadata.fps is missing or invalid; duration_seconds will be 0.0")
+        fps = None
+
+    sorted_frames = sorted(frames.items(), key=lambda x: int(x[0]))
+
+    next_trace_id = 0
+    traces = defaultdict(list)
+
+    prev_faces = []
+    prev_trace_ids = []
+
+    print(f"\nTracking faces across {len(sorted_frames)} frames...")
+    print(f"Parameters: iou={iou_threshold}, similarity={similarity_threshold}, distance={distance_threshold}")
+    print()
+
+    for frame_num_str, frame_data in sorted_frames:
+        frame_num = int(frame_num_str)
+        faces = frame_data.get("faces", [])
+
+        if not faces:
+            # An empty frame breaks continuity: later faces start new traces.
+            prev_faces = []
+            prev_trace_ids = []
+            continue
+
+        matches = match_faces(
+            faces,
+            prev_faces,
+            iou_threshold,
+            similarity_threshold,
+            distance_threshold,
+            use_embedding,
+        )
+
+        trace_ids = []
+        for curr_idx, prev_idx in matches.items():
+            if prev_idx >= 0:
+                # Continue the trace of the matched previous face.
+                trace_id = prev_trace_ids[prev_idx]
+            else:
+                trace_id = next_trace_id
+                next_trace_id += 1
+
+            face = faces[curr_idx]
+            # "pose_angle" may be absent or null; treat both as no pose info.
+            pose_full = face.get("pose_angle") or {}
+
+            face["trace_id"] = trace_id
+            trace_ids.append(trace_id)
+            traces[trace_id].append({
+                "frame": frame_num,
+                "face_index": curr_idx,
+                "bbox": {
+                    "x": face["x"],
+                    "y": face["y"],
+                    "width": face["width"],
+                    "height": face["height"],
+                },
+                "confidence": face.get("confidence", 0.0),
+                "pose_angle": pose_full.get("angle", "unknown"),
+                "pose_full": pose_full,  # full pose information
+            })
+
+        prev_faces = faces
+        prev_trace_ids = trace_ids
+
+        if frame_num % 100 == 0:
+            print(f"  Frame {frame_num}: {len(faces)} faces, {len(set(trace_ids))} active traces")
+
+    face_data["traces"] = {
+        str(trace_id): _summarize_trace(trace_id, path, fps)
+        for trace_id, path in traces.items()
+    }
+
+    metadata["trace_stats"] = {
+        "total_traces": next_trace_id,
+        "active_traces": len(traces),
+        "long_traces": len([t for t in traces.values() if len(t) >= 2]),
+    }
+
+    return face_data
+
+
+def _summarize_trace(trace_id: int, path: List[Dict], fps: Optional[float]) -> Dict:
+    """Build one trace's summary (span, confidence, pose stats) from its non-empty path."""
+    start_frame = path[0]["frame"]
+    end_frame = path[-1]["frame"]
+    duration_frames = end_frame - start_frame + 1
+
+    # Pose trace: full pose information per appearance
+    pose_trace = []
+    for step in path:
+        pose_info = step.get("pose_full") or {}
+        pose_trace.append({
+            "frame": step["frame"],
+            "angle": pose_info.get("angle", "unknown"),
+            "confidence": pose_info.get("confidence", 0.0),
+            "pitch": pose_info.get("pitch", "neutral"),
+            "features": pose_info.get("features", {}),
+        })
+
+    # Pose statistics: appearances and mean pose confidence per angle
+    pose_counts = defaultdict(int)
+    pose_confidence_by_angle = defaultdict(list)
+    for pose in pose_trace:
+        pose_counts[pose["angle"]] += 1
+        pose_confidence_by_angle[pose["angle"]].append(pose["confidence"])
+
+    pose_statistics = {
+        "distribution": dict(pose_counts),
+        "avg_confidence_by_angle": {
+            angle: round(sum(conf_list) / len(conf_list), 3)
+            for angle, conf_list in pose_confidence_by_angle.items()
+        },
+        "dominant_angle": max(pose_counts.items(), key=lambda x: x[1])[0] if pose_counts else "unknown",
+        "pose_count": len(pose_counts),
+    }
+
+    # Pose transitions: one event each time the angle differs from the
+    # previous appearance; transition_index is 1-based.
+    pose_transitions = []
+    for before, after in zip(pose_trace, pose_trace[1:]):
+        if after["angle"] != before["angle"]:
+            pose_transitions.append({
+                "frame": after["frame"],
+                "from_angle": before["angle"],
+                "to_angle": after["angle"],
+                "transition_index": len(pose_transitions) + 1,
+            })
+
+    return {
+        "trace_id": trace_id,
+        "start_frame": start_frame,
+        "end_frame": end_frame,
+        "duration_frames": duration_frames,
+        "duration_seconds": duration_frames / fps if fps else 0.0,
+        "total_appearances": len(path),
+        "avg_confidence": sum(step["confidence"] for step in path) / len(path),
+        "pose_angles": [step["pose_angle"] for step in path],
+        "pose_trace": pose_trace,
+        "pose_statistics": pose_statistics,
+        "pose_transitions": pose_transitions,
+        "path": path,
+    }
+
+
+def analyze_traces(face_data: Dict) -> None:
+    """
+    Analyze and print trace statistics
+
+    Prints the trace totals, the ten traces with the largest frame span
+    (with their pose distribution and up to three pose transitions), the
+    overall pose distribution, and a histogram of trace durations.
+
+    Args:
+        face_data: Tracking result holding "traces" (keyed by trace id) and,
+            optionally, metadata["trace_stats"].
+    """
+    traces = face_data.get("traces", {})
+    metadata = face_data.get("metadata") or {}
+    trace_stats = metadata.get("trace_stats", {})
+
+    # "traces" holds every trace, including single-appearance ones, so the
+    # long-trace count must come from the stats (or be recomputed), not
+    # from len(traces).
+    long_traces = trace_stats.get("long_traces")
+    if long_traces is None:
+        long_traces = sum(
+            1 for trace in traces.values() if trace.get("total_appearances", 0) >= 2
+        )
+
+    print("\n" + "=" * 60)
+    print("Face Trace Analysis")
+    print("=" * 60)
+
+    print(f"\nTotal traces: {trace_stats.get('total_traces', 0)}")
+    print(f"Long traces (>= 2 frames): {long_traces}")
+
+    if not traces:
+        return
+
+    sorted_traces = sorted(traces.values(), key=lambda x: x["duration_frames"], reverse=True)
+
+    print("\n=== Top 10 Longest Traces ===")
+    for trace in sorted_traces[:10]:
+        print(f"\nTrace {trace['trace_id']}:")
+        print(f"  Frames: {trace['start_frame']} - {trace['end_frame']} ({trace['duration_frames']} frames)")
+        print(f"  Duration: {trace['duration_seconds']:.2f} seconds")
+        print(f"  Appearances: {trace['total_appearances']}")
+        print(f"  Avg Confidence: {trace['avg_confidence']:.3f}")
+
+        # Pose statistics
+        pose_stats = trace.get("pose_statistics", {})
+        print(f"  Pose Distribution: {pose_stats.get('distribution', {})}")
+        print(f"  Dominant Angle: {pose_stats.get('dominant_angle', 'unknown')}")
+
+        # Pose transitions
+        transitions = trace.get("pose_transitions", [])
+        if transitions:
+            print(f"  Pose Transitions: {len(transitions)} events")
+            for t in transitions[:3]:  # show only the first 3
+                print(f"    - Frame {t['frame']}: {t['from_angle']} → {t['to_angle']}")
+
+    # Count every appearance of each pose angle across all traces.
+    pose_totals = defaultdict(int)
+    for trace in traces.values():
+        for pose in trace["pose_angles"]:
+            pose_totals[pose] += 1
+
+    print("\n=== Pose Distribution in Traces ===")
+    for pose, count in sorted(pose_totals.items(), key=lambda x: x[1], reverse=True):
+        print(f"  {pose}: {count}")
+
+    # Buckets are kept in short -> long order so the report reads naturally
+    # (sorting the labels alphabetically would print "long" first).
+    bucket_labels = (
+        "short (<= 30 frames)",
+        "medium (31-90 frames)",
+        "long (> 90 frames)",
+    )
+    duration_distribution = dict.fromkeys(bucket_labels, 0)
+    for trace in traces.values():
+        d = trace["duration_frames"]
+        if d <= 30:
+            duration_distribution[bucket_labels[0]] += 1
+        elif d <= 90:
+            duration_distribution[bucket_labels[1]] += 1
+        else:
+            duration_distribution[bucket_labels[2]] += 1
+
+    print("\n=== Trace Duration Distribution ===")
+    for label in bucket_labels:
+        # Only report buckets that actually contain traces.
+        if duration_distribution[label]:
+            print(f"  {label}: {duration_distribution[label]}")
+
+
+def main():
+    """
+    Command-line entry point: load face.json, track faces, report, and save.
+
+    Reads the file given by --face-json, runs track_faces() with the
+    thresholds from the command line, prints the analysis, and, unless
+    --analyze-only is set, writes the result as JSON to --output or to
+    "<input stem>_traced.json" next to the input.
+    """
+    # Local import: keeps this block self-contained without touching the
+    # module's import section.
+    from pathlib import Path
+
+    parser = argparse.ArgumentParser(description="Track faces across frames")
+    parser.add_argument("--face-json", required=True, help="Path to face.json")
+    parser.add_argument("--output", help="Output path (default: face_traced.json)")
+    parser.add_argument("--iou-threshold", type=float, default=0.3, help="IoU threshold")
+    parser.add_argument("--similarity-threshold", type=float, default=0.7, help="Embedding similarity threshold")
+    parser.add_argument("--distance-threshold", type=float, default=100.0, help="Distance threshold")
+    parser.add_argument("--no-embedding", action="store_true", help="Disable embedding matching")
+    parser.add_argument("--analyze-only", action="store_true", help="Only analyze, don't output")
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("Face Tracker")
+    print("=" * 60)
+
+    # JSON is UTF-8 by specification; do not depend on the platform default.
+    with open(args.face_json, encoding="utf-8") as f:
+        face_data = json.load(f)
+
+    print(f"\nInput: {args.face_json}")
+    print(f"Frames: {len(face_data.get('frames', {}))}")
+
+    face_data = track_faces(
+        face_data,
+        iou_threshold=args.iou_threshold,
+        similarity_threshold=args.similarity_threshold,
+        distance_threshold=args.distance_threshold,
+        use_embedding=not args.no_embedding,
+    )
+
+    analyze_traces(face_data)
+
+    if not args.analyze_only:
+        if args.output:
+            output_path = args.output
+        else:
+            # Derive the name from the file name only. A plain
+            # str.replace(".json", ...) returns the input path unchanged
+            # when it has no ".json" (overwriting the input) and also
+            # rewrites ".json" inside directory names.
+            source = Path(args.face_json)
+            base = source.stem if source.suffix == ".json" else source.name
+            output_path = str(source.with_name(f"{base}_traced.json"))
+
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(face_data, f, indent=2)
+        print(f"\n✅ Output saved to: {output_path}")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()