momentry_core/scripts/match_faces_to_tmdb.py

#!/opt/homebrew/bin/python3.11
"""
Match face_detections against TMDb identities via face embedding similarity.
Port of match_faces_against_tmdb from src/core/tmdb/face_agent.rs

Usage: python3 scripts/match_faces_to_tmdb.py <file_uuid> [--schema dev]
"""

import sys
import psycopg2
import psycopg2.extras
import numpy as np
from collections import defaultdict

# Connection string handed to psycopg2.connect() by match_faces_to_tmdb().
DATABASE_URL = "postgres://accusys@localhost:5432/momentry"
# Minimum cosine similarity for a (face, TMDb identity) pair to count as a hit.
THRESHOLD = 0.50
# A matched trace is discarded when it has fewer than this many faces left
# after near-duplicate embeddings have been removed.
QC_MIN_FACES = 4


def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors *a* and *b*.

    Both inputs are converted to float64 arrays. If either vector has zero
    length the similarity is undefined, and 0.0 is returned instead of
    dividing by zero.
    """
    vec_a = np.asarray(a, dtype=np.float64)
    vec_b = np.asarray(b, dtype=np.float64)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if not norm_a or not norm_b:
        return 0.0
    return np.dot(vec_a, vec_b) / (norm_a * norm_b)


def _schema_query(template: str, schema: str):
    """Compose *template*, replacing ``{schema}`` with a quoted identifier."""
    # Imported locally: the file imports psycopg2 but not its ``sql`` submodule.
    from psycopg2 import sql

    return sql.SQL(template).format(schema=sql.Identifier(schema))


def _load_tmdb_identities(cur, schema: str) -> list:
    """Fetch TMDb-sourced identities that have a non-empty face embedding."""
    cur.execute(_schema_query("""
        SELECT id, name, tmdb_id, face_embedding::real[] as embedding
        FROM {schema}.identities
        WHERE source = 'tmdb' AND face_embedding IS NOT NULL
    """, schema))
    return [
        {
            "id": row["id"],
            "name": row["name"],
            "tmdb_id": row["tmdb_id"],
            "embedding": row["embedding"],
        }
        for row in cur.fetchall()
        if row["embedding"]
    ]


def _load_trace_faces(cur, schema: str, file_uuid: str):
    """Fetch the file's traced detections; return (faces grouped by trace, row count)."""
    cur.execute(_schema_query("""
        SELECT id, trace_id, frame_number, embedding::real[] as embedding, confidence
        FROM {schema}.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL AND embedding IS NOT NULL
        ORDER BY trace_id, frame_number
    """, schema), (file_uuid,))
    rows = cur.fetchall()

    trace_faces = defaultdict(list)
    for row in rows:
        if row["embedding"]:
            trace_faces[row["trace_id"]].append({
                "id": row["id"],
                "embedding": row["embedding"],
                "frame": row["frame_number"],
                "confidence": row["confidence"],
            })
    return trace_faces, len(rows)


def _dedup_faces(faces: list, max_similarity: float = 0.99) -> list:
    """Drop faces whose embedding is near-identical to the previously kept one."""
    # Sorting by the first embedding component places near-identical vectors
    # next to each other, so each face is only compared with the last kept one.
    unique = []
    for face in sorted(faces, key=lambda f: f["embedding"][0]):
        if not unique or cosine_similarity(face["embedding"], unique[-1]["embedding"]) <= max_similarity:
            unique.append(face)
    return unique


def _match_traces(trace_faces: dict, tmdb_identities: list) -> dict:
    """Map each trace to its best TMDb identity as (identity_id, name, avg_similarity)."""
    matched = {}
    for tid, faces in trace_faces.items():
        # identity_id -> [(similarity, name), ...] for every hit at or above THRESHOLD
        hits_by_identity = defaultdict(list)
        for face in faces:
            for identity in tmdb_identities:
                sim = cosine_similarity(face["embedding"], identity["embedding"])
                if sim >= THRESHOLD:
                    hits_by_identity[identity["id"]].append((sim, identity["name"]))

        # Highest mean similarity wins; on a tie the identity seen first is kept.
        best = None
        for identity_id, hits in hits_by_identity.items():
            avg_sim = np.mean([sim for sim, _ in hits])
            if best is None or avg_sim > best[2]:
                best = (identity_id, hits[0][1], avg_sim)

        # Compare against None so a falsy identity id (e.g. 0) is still recorded.
        if best is not None:
            matched[tid] = best
    return matched


def _drop_short_traces(matched: dict, trace_faces: dict) -> int:
    """Remove matched traces with fewer than QC_MIN_FACES faces; return how many."""
    too_short = [tid for tid in matched if len(trace_faces[tid]) < QC_MIN_FACES]
    for tid in too_short:
        del matched[tid]
    return len(too_short)


def _resolve_temporal_collisions(matched: dict, trace_faces: dict) -> int:
    """Keep one trace per (frame, identity) and unmatch the rest; return how many."""
    # frame -> identity_id -> [(trace_id, confidence), ...], collected in a
    # single pass instead of rescanning every trace for each colliding frame.
    claims_by_frame = defaultdict(lambda: defaultdict(list))
    for tid, faces in trace_faces.items():
        if tid not in matched:
            continue
        identity_id = matched[tid][0]
        for face in faces:
            claims_by_frame[face["frame"]][identity_id].append((tid, face["confidence"]))

    removed = 0
    for claims_by_identity in claims_by_frame.values():
        for claims in claims_by_identity.values():
            if len(claims) < 2:
                continue
            # Traces already removed by an earlier collision no longer compete.
            live = [claim for claim in claims if claim[0] in matched]
            if len(live) < 2:
                continue
            # Highest confidence wins; a missing confidence ranks last rather
            # than raising TypeError during the sort.
            live.sort(
                key=lambda claim: claim[1] if claim[1] is not None else float("-inf"),
                reverse=True,
            )
            winner = live[0][0]
            for tid, _ in live[1:]:
                # A trace with two detections in one frame must not evict itself.
                if tid != winner and tid in matched:
                    del matched[tid]
                    removed += 1
    return removed


def _bind_identities(cur, schema: str, matched: dict, trace_faces: dict) -> int:
    """Set identity_id on still-unassigned detections of matched traces; return rows updated."""
    # NOTE(review): only faces that survived deduplication are updated, so
    # near-duplicate detections of a matched trace keep identity_id NULL.
    # Confirm this is intended.
    query = _schema_query("""
        UPDATE {schema}.face_detections
        SET identity_id = %s
        WHERE id = %s AND identity_id IS NULL
    """, schema)
    bound = 0
    for tid, (identity_id, _name, _score) in matched.items():
        for face in trace_faces[tid]:
            cur.execute(query, (identity_id, face["id"]))
            bound += cur.rowcount
    return bound


def match_faces_to_tmdb(file_uuid: str, schema: str = "dev") -> int:
    """Bind a file's face detections to TMDb identities by embedding similarity.

    Steps: load TMDb identities with embeddings, load the file's traced face
    detections, drop near-duplicate embeddings per trace, pick the identity
    with the highest mean similarity per trace, discard matches that fail
    quality control (too few faces, or two traces claiming the same identity
    in the same frame), then write ``identity_id`` on detections that do not
    have one yet.

    Args:
        file_uuid: UUID of the video file whose detections are matched.
        schema: Database schema holding ``identities`` and ``face_detections``.
            It is quoted as an SQL identifier, so it must match the schema's
            stored name exactly (including case).

    Returns:
        Number of ``face_detections`` rows updated; 0 when there are no TMDb
        identities with embeddings or no detections for the file.

    Raises:
        psycopg2.Error: On connection or query failure. The connection is
            closed without committing, so no partial bindings are persisted.
    """
    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # Step 1: Load TMDb identities with face embeddings
            tmdb_identities = _load_tmdb_identities(cur, schema)
            print(f"[TMDB-MATCH] Loaded {len(tmdb_identities)} TMDb identities")
            if not tmdb_identities:
                print("[TMDB-MATCH] No TMDb identities with embeddings")
                return 0

            # Step 2: Load face_detections grouped by trace_id
            raw_trace_faces, total_faces = _load_trace_faces(cur, schema, file_uuid)
            if total_faces == 0:
                print(f"[TMDB-MATCH] No face detections for {file_uuid}")
                return 0

            trace_faces = {
                tid: _dedup_faces(faces) for tid, faces in raw_trace_faces.items()
            }
            total_traces = len(trace_faces)
            print(f"[TMDB-MATCH] {total_traces} traces with {total_faces} faces")

            # Step 3: Single-pass matching against the TMDb seeds only
            print(f"[TMDB-MATCH] Matching {total_traces} traces against {len(tmdb_identities)} TMDb identities (threshold={THRESHOLD})")
            matched = _match_traces(trace_faces, tmdb_identities)

            # Step 4: Quality control - minimum faces per trace
            qc_removed = _drop_short_traces(matched, trace_faces)

            # Step 5: Temporal collision check
            qc_removed += _resolve_temporal_collisions(matched, trace_faces)
            if qc_removed > 0:
                print(f"[TMDB-MATCH] QC removed {qc_removed} traces")

            # Step 6: Update face_detections.identity_id
            bindings_created = _bind_identities(cur, schema, matched, trace_faces)
            conn.commit()
    finally:
        # Runs on early return and on error, so the connection never leaks.
        conn.close()

    print(f"[TMDB-MATCH] {bindings_created} bindings created, {len(matched)} traces matched")
    return bindings_created


if __name__ == "__main__":
    # Script entry point: parse the command line and run the matcher once.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("file_uuid", help="Video file UUID")
    cli.add_argument("--schema", default="dev", help="Database schema")
    options = cli.parse_args()

    match_faces_to_tmdb(options.file_uuid, options.schema)