momentry_core/scripts/match_identities_to_tmdb.py

#!/opt/homebrew/bin/python3.11
"""
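Match auto-generated identities to TMDB identities via centroid embedding similarity.
For each match at or above THRESHOLD, re-points the file's face_detections and the
identity_bindings at the TMDB identity, then marks the auto identity as
source='merged' and records the matched tmdb_id on it.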

Usage: python3 match_identities_to_tmdb.py <file_uuid>
"""
import sys
import psycopg2
import psycopg2.extras
import numpy as np

# Connection string passed verbatim to psycopg2.connect().
DB = "dbname=momentry user=accusys host=localhost"
# Minimum cosine similarity (inclusive) between an auto identity's centroid and
# a TMDB embedding for the auto identity to be merged into that TMDB identity.
THRESHOLD = 0.55


def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    The result is ``0.0`` when either vector has a zero Euclidean norm, so a
    degenerate embedding never triggers a division by zero.
    """
    inner = np.dot(a, b)
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
    if norm_a != 0 and norm_b != 0:
        return inner / (norm_a * norm_b)
    return 0.0


def _parse_vector_text(text):
    """Convert a bracketed, comma-separated vector literal (e.g. ``"[0.1,0.2]"``) to floats."""
    return [float(x) for x in text.strip("[]").split(",")]


def _load_tmdb_identities(cur):
    """Fetch every TMDB identity that has a face embedding, decoding the embedding."""
    cur.execute("""
        SELECT id, name, tmdb_id, face_embedding::text as emb_text
        FROM dev.identities
        WHERE source = 'tmdb' AND face_embedding IS NOT NULL
    """)
    return [
        {
            "id": row["id"],
            "name": row["name"],
            "tmdb_id": row["tmdb_id"],
            "embedding": np.array(_parse_vector_text(row["emb_text"])),
        }
        for row in cur.fetchall()
        if row["emb_text"]
    ]


def _identity_centroid(cur, uuid, identity_id):
    """Return the mean face embedding of one identity in one file, or None if it has none."""
    # At most 500 detections are sampled; with no ORDER BY, which rows are
    # picked is left to the database.
    cur.execute("""
        SELECT embedding
        FROM dev.face_detections
        WHERE file_uuid = %s AND identity_id = %s AND embedding IS NOT NULL
        LIMIT 500
    """, (uuid, identity_id))
    vectors = []
    for row in cur.fetchall():
        raw = row["embedding"]
        # The column type is not visible from this script. If the driver hands
        # the value back as a text literal (as it does for the TMDB embeddings
        # above), decode it the same way instead of failing in np.array().
        if isinstance(raw, str):
            raw = _parse_vector_text(raw)
        vectors.append(np.array(raw, dtype=np.float32))
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


def _best_tmdb_match(centroid, tmdb_identities):
    """Return ``(identity, similarity)`` for the TMDB identity most similar to ``centroid``.

    ``identity`` is None when no candidate has a similarity strictly above 0.
    """
    best_sim = 0.0
    best_tmdb = None
    for tmdb in tmdb_identities:
        sim = cosine_similarity(centroid, tmdb["embedding"])
        if sim > best_sim:
            best_sim = sim
            best_tmdb = tmdb
    return best_tmdb, best_sim


def _merge_into_tmdb(cur, uuid, auto_id, tmdb):
    """Re-point rows from the auto identity to ``tmdb`` and mark the auto identity merged."""
    # Detections are re-pointed for this file only.
    cur.execute("""
        UPDATE dev.face_detections
        SET identity_id = %s
        WHERE file_uuid = %s AND identity_id = %s
    """, (tmdb["id"], uuid, auto_id))

    # Bindings and the identity row itself are updated without a file filter.
    # NOTE(review): detections of this auto identity in other files keep
    # pointing at an identity now marked 'merged' -- confirm this is intended.
    cur.execute("""
        UPDATE dev.identity_bindings
        SET identity_id = %s
        WHERE identity_id = %s
    """, (tmdb["id"], auto_id))

    cur.execute("""
        UPDATE dev.identities
        SET source = 'merged', tmdb_id = %s
        WHERE id = %s
    """, (tmdb["tmdb_id"], auto_id))


def main():
    """Merge a file's auto identities into their best-matching TMDB identities.

    The file UUID is read from ``sys.argv[1]``; a hard-coded UUID is used when
    no argument is given. For each auto identity linked to the file, the
    centroid of its face embeddings is compared with every TMDB embedding, and
    the identity is merged when the best cosine similarity reaches THRESHOLD.

    All updates run in a single transaction: it is committed once every
    identity has been processed and rolled back if anything raises. The
    connection is always closed, including on error.
    """
    uuid = sys.argv[1] if len(sys.argv) > 1 else "aeed71342a899fe4b4c57b7d41bcb692"
    conn = psycopg2.connect(DB)
    try:
        # `with conn` commits on normal exit and rolls back on an exception;
        # the cursor context manager closes the cursor. Neither closes the
        # connection, hence the surrounding try/finally.
        with conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            tmdb_identities = _load_tmdb_identities(cur)
            print(f"Loaded {len(tmdb_identities)} TMDB identities with embeddings")

            if not tmdb_identities:
                print("No TMDB identities found. Run tmdb_embed_extractor.py first.")
                return

            # Auto identities that have at least one detection in this file.
            cur.execute("""
                SELECT DISTINCT i.id, i.name
                FROM dev.identities i
                INNER JOIN dev.face_detections fd ON fd.identity_id = i.id
                WHERE fd.file_uuid = %s AND i.source = 'auto'
            """, (uuid,))
            auto_rows = cur.fetchall()
            print(f"Auto identities for {uuid[:8]}...: {len(auto_rows)}")

            matched = 0
            for row in auto_rows:
                auto_id = row["id"]
                auto_name = row["name"]

                centroid = _identity_centroid(cur, uuid, auto_id)
                if centroid is None:
                    continue

                best_tmdb, best_sim = _best_tmdb_match(centroid, tmdb_identities)
                if best_tmdb is None or not best_sim >= THRESHOLD:
                    continue

                fm = best_tmdb["name"]
                print(f"  {auto_name} → {fm} (sim={best_sim:.3f})")
                _merge_into_tmdb(cur, uuid, auto_id, best_tmdb)
                matched += 1

        # Reached only after the transaction above has been committed.
        print(f"\nMatched {matched}/{len(auto_rows)} auto identities to TMDB")
        print(f"Threshold: {THRESHOLD}")
    finally:
        conn.close()


# Run the matcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()