#!/opt/homebrew/bin/python3.11
"""
Match auto-generated identities to TMDB identities via centroid embedding similarity.

For every 'auto' identity that has face detections in the given file, the
centroid of its face embeddings is compared (cosine similarity) against every
TMDB identity embedding. When the best similarity reaches THRESHOLD:

* the file's face_detections rows are re-pointed to the TMDB identity,
* identity_bindings rows of the auto identity are re-pointed to it as well,
* the auto identity is kept but marked source='merged' with the TMDB id.

Usage: python3 match_identities_to_tmdb.py [file_uuid]
"""
import sys

import numpy as np

DB = "dbname=momentry user=accusys host=localhost"
THRESHOLD = 0.55

# File processed when no uuid is given on the command line.
_DEFAULT_FILE_UUID = "aeed71342a899fe4b4c57b7d41bcb692"


def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a Python float.

    Returns 0.0 when either vector has zero norm, so callers never see a
    division by zero. The result is always a builtin ``float`` regardless of
    the input dtype (float32 inputs would otherwise yield ``np.float32``).
    """
    dot = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(dot / (norm_a * norm_b))


def _parse_embedding(value, dtype=np.float64):
    """Convert a database embedding value into a 1-D numpy array.

    Accepts either the text form ``"[0.1,0.2,...]"`` (what a ``::text`` cast
    produces) or any sequence/array of numbers.
    """
    if isinstance(value, str):
        value = [float(x) for x in value.strip().strip("[]").split(",")]
    return np.asarray(value, dtype=dtype)


def _load_tmdb_identities(cur):
    """Return all TMDB identities that have a face embedding, as dicts."""
    cur.execute("""
        SELECT id, name, tmdb_id, face_embedding::text as emb_text
        FROM dev.identities
        WHERE source = 'tmdb' AND face_embedding IS NOT NULL
    """)
    return [
        {
            "id": row["id"],
            "name": row["name"],
            "tmdb_id": row["tmdb_id"],
            "embedding": _parse_embedding(row["emb_text"]),
        }
        for row in cur.fetchall()
        if row["emb_text"]
    ]


def _identity_centroid(cur, file_uuid, identity_id):
    """Return the mean face embedding of an identity in a file, or None.

    At most 500 detections are sampled; the query has no ORDER BY, so which
    rows are picked is up to the database.
    """
    cur.execute("""
        SELECT embedding
        FROM dev.face_detections
        WHERE file_uuid = %s AND identity_id = %s AND embedding IS NOT NULL
        LIMIT 500
    """, (file_uuid, identity_id))
    rows = cur.fetchall()
    if not rows:
        return None
    vectors = [_parse_embedding(r["embedding"], dtype=np.float32) for r in rows]
    return np.mean(vectors, axis=0)


def _find_best_match(centroid, tmdb_identities):
    """Return ``(tmdb_identity, similarity)`` for the most similar candidate.

    Only strictly positive similarities count; ``(None, 0.0)`` is returned
    when no candidate has one.
    """
    best_sim = 0.0
    best_tmdb = None
    for candidate in tmdb_identities:
        sim = cosine_similarity(centroid, candidate["embedding"])
        if sim > best_sim:
            best_sim = sim
            best_tmdb = candidate
    return best_tmdb, best_sim


def _merge_into_tmdb(cur, file_uuid, auto_id, tmdb_identity):
    """Re-point rows from the auto identity to the TMDB identity."""
    tmdb_identity_id = tmdb_identity["id"]

    # Face detections are re-pointed for this file only.
    cur.execute("""
        UPDATE dev.face_detections
        SET identity_id = %s
        WHERE file_uuid = %s AND identity_id = %s
    """, (tmdb_identity_id, file_uuid, auto_id))

    # Bindings are re-pointed for the auto identity as a whole.
    cur.execute("""
        UPDATE dev.identity_bindings
        SET identity_id = %s
        WHERE identity_id = %s
    """, (tmdb_identity_id, auto_id))

    # The auto identity is kept (not deleted) and flagged as merged.
    cur.execute("""
        UPDATE dev.identities
        SET source = 'merged', tmdb_id = %s
        WHERE id = %s
    """, (tmdb_identity["tmdb_id"], auto_id))


def main():
    """Match the auto identities of one file against TMDB identities.

    The file uuid is taken from ``sys.argv[1]`` when present, otherwise a
    built-in default is used. All updates are committed together at the end;
    if anything fails before that, the connection is closed without a commit,
    so no partial merge is persisted.
    """
    # Imported here so the numpy-only helpers in this module stay importable
    # on machines without the PostgreSQL driver.
    import psycopg2
    import psycopg2.extras

    file_uuid = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_FILE_UUID

    conn = psycopg2.connect(DB)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            tmdb_identities = _load_tmdb_identities(cur)
            print(f"Loaded {len(tmdb_identities)} TMDB identities with embeddings")

            if not tmdb_identities:
                print("No TMDB identities found. Run tmdb_embed_extractor.py first.")
                return

            # Auto identities that have at least one detection in this file.
            cur.execute("""
                SELECT DISTINCT i.id, i.name
                FROM dev.identities i
                INNER JOIN dev.face_detections fd ON fd.identity_id = i.id
                WHERE fd.file_uuid = %s AND i.source = 'auto'
            """, (file_uuid,))
            auto_rows = cur.fetchall()
            print(f"Auto identities for {file_uuid[:8]}...: {len(auto_rows)}")

            matched = 0
            for row in auto_rows:
                auto_id = row["id"]
                auto_name = row["name"]

                centroid = _identity_centroid(cur, file_uuid, auto_id)
                if centroid is None:
                    continue

                best_tmdb, best_sim = _find_best_match(centroid, tmdb_identities)
                if best_tmdb and best_sim >= THRESHOLD:
                    tmdb_name = best_tmdb["name"]
                    print(f" {auto_name} → {tmdb_name} (sim={best_sim:.3f})")
                    _merge_into_tmdb(cur, file_uuid, auto_id, best_tmdb)
                    matched += 1

        conn.commit()
        print(f"\nMatched {matched}/{len(auto_rows)} auto identities to TMDB")
        print(f"Threshold: {THRESHOLD}")
    finally:
        # Closing without a prior commit discards the open transaction, so an
        # exception above never leaves a half-applied merge behind.
        conn.close()


if __name__ == "__main__":
    main()