#!/opt/homebrew/bin/python3.11
"""
Identity Binding: cluster face traces → identity bindings.

Uses face embeddings from face_detections, clusters per trace, creates identities.

Pipeline:
    1. Load every (trace_id, embedding) row of one file from dev.face_detections.
    2. Average the embeddings of each trace and L2-normalise the mean.
    3. Cluster the per-trace vectors (average linkage, cosine distance).
    4. Upsert one dev.identities row per cluster.
    5. Insert one dev.identity_bindings row per trace and stamp the trace's
       face_detections rows with the identity id.

The file UUID is taken from the first command-line argument; a built-in
default is used when none is given. All writes happen in one transaction
that is committed only after every trace has been processed.
"""
import json
import sys
import time
from collections import defaultdict

import numpy as np
import psycopg2
from sklearn.cluster import AgglomerativeClustering

UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
DB = "dbname=momentry user=accusys"
DISTANCE_THRESHOLD = 0.55  # Cosine distance threshold for clustering


def _to_vector(emb):
    """Return one stored embedding as a 1-D float64 array.

    Sequences of numbers are converted directly. A string is parsed as a
    JSON array first.
    NOTE(review): the string case is meant for columns the driver hands back
    as text such as "[0.1,0.2]" (e.g. a pgvector column without a registered
    adapter) — confirm the actual column type of face_detections.embedding.
    """
    if isinstance(emb, str):
        emb = json.loads(emb)
    return np.asarray(emb, dtype=np.float64)


def _load_trace_vectors(cur):
    """Load the file's embeddings and reduce them to one unit vector per trace.

    Returns:
        (trace_ids, X): trace ids in sorted order and an array whose i-th row
        is the normalised mean embedding of trace_ids[i]. Both are empty when
        the file has no detections with a trace id and an embedding.
    """
    print("Loading face trace data...")
    cur.execute(
        """
        SELECT trace_id, embedding
        FROM dev.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL AND embedding IS NOT NULL
        ORDER BY trace_id, id
        """,
        (UUID,),
    )
    rows = cur.fetchall()
    print(f"Face detections with embeddings: {len(rows)}")

    # Group by trace_id
    trace_embs = defaultdict(list)
    for trace_id, emb in rows:
        trace_embs[trace_id].append(_to_vector(emb))
    print(f"Unique traces: {len(trace_embs)}")

    # Compute mean embeddings per trace
    trace_ids = []
    trace_vectors = []
    for tid, embs in sorted(trace_embs.items()):
        mean_emb = np.mean(embs, axis=0)
        # The epsilon keeps an all-zero mean from dividing by zero.
        mean_emb = mean_emb / (np.linalg.norm(mean_emb) + 1e-10)
        trace_ids.append(tid)
        trace_vectors.append(mean_emb)

    X = np.array(trace_vectors)
    print(f"Trace vectors shape: {X.shape}")
    return trace_ids, X


def _cluster_traces(X):
    """Return one integer cluster label per row of X, as a list of Python ints."""
    print("Clustering traces...")
    if len(X) < 2:
        # Nothing to compare against: zero rows give no labels, a single row
        # forms cluster 0 on its own.
        return [0] * len(X)
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=DISTANCE_THRESHOLD,
        metric='cosine',
        linkage='average',
    )
    # tolist() turns numpy integers into plain ints for use as dict keys and
    # in identity names.
    return clustering.fit_predict(X).tolist()


def _create_identities(cur, labels):
    """Upsert one identity per distinct cluster label.

    Identities are named "stranger_<UUID>_<cluster>"; an existing row with
    that name is re-activated and reused rather than duplicated.

    Returns:
        dict mapping cluster label -> dev.identities.id.
    """
    print("Creating identity records...")
    cluster_to_identity = {}
    for cluster_id in sorted(set(labels)):
        name = f"stranger_{UUID}_{cluster_id}"
        cur.execute(
            """
            INSERT INTO dev.identities
                (name, identity_type, source, status, created_at, file_uuid)
            VALUES (%s, 'face', 'auto', 'active', NOW(), %s)
            ON CONFLICT (name) DO UPDATE
                SET status = 'active',
                    file_uuid = COALESCE(dev.identities.file_uuid, %s)
            RETURNING id
            """,
            (name, UUID, UUID),
        )
        identity_id = cur.fetchone()[0]
        cluster_to_identity[cluster_id] = identity_id
        print(f" Cluster {cluster_id}: new identity {identity_id} ({name})")
    return cluster_to_identity


def _bind_traces(cur, trace_ids, labels, cluster_to_identity):
    """Bind every trace to its cluster's identity.

    For each trace this inserts a 'trace' binding (skipped when a conflicting
    row already exists) and sets identity_id on the trace's face_detections
    rows for this file.

    Returns:
        Number of binding rows actually inserted (conflicts are not counted).
    """
    print("Creating identity bindings...")
    bindings = 0
    # Every trace id was read from face_detections under the same file_uuid
    # filter, so no per-trace existence check is needed before binding.
    for tid, label in zip(trace_ids, labels):
        identity_id = cluster_to_identity[label]

        # NOTE(review): identity_value holds only the trace id, not the file
        # UUID — confirm trace ids are unique across files.
        cur.execute(
            """
            INSERT INTO dev.identity_bindings
                (identity_id, identity_type, identity_value, confidence, created_at)
            VALUES (%s, 'trace', %s, 0.8, NOW())
            ON CONFLICT DO NOTHING
            """,
            (identity_id, str(tid)),
        )
        # rowcount is 0 when ON CONFLICT DO NOTHING skipped the row.
        if cur.rowcount > 0:
            bindings += 1

        # Also update face_detection with identity_id
        cur.execute(
            """
            UPDATE dev.face_detections
            SET identity_id = %s
            WHERE file_uuid = %s AND trace_id = %s
            """,
            (identity_id, UUID, tid),
        )
    return bindings


def _print_summary(cur):
    """Print table-wide totals of auto-generated identities and of bindings."""
    print("\n=== Summary ===")
    cur.execute("SELECT COUNT(*) FROM dev.identities WHERE source = 'auto'")
    print(f"Total auto-generated identities: {cur.fetchone()[0]}")
    cur.execute("SELECT COUNT(*) FROM dev.identity_bindings")
    print(f"Total identity bindings: {cur.fetchone()[0]}")


def main():
    """Run the whole binding pipeline for UUID against the DB database."""
    print(f"=== Identity Binding for {UUID} ===")
    conn = psycopg2.connect(DB)
    try:
        with conn.cursor() as cur:
            # Step 1: Get trace embeddings from face_detections
            trace_ids, X = _load_trace_vectors(cur)

            if not trace_ids:
                # Without traces there is nothing to cluster; do not create
                # an identity that no trace belongs to.
                print("No face traces with embeddings found; nothing to bind.")
            else:
                # Step 2: Cluster traces
                labels = _cluster_traces(X)
                n_clusters = len(set(labels))
                print(f"Clusters/identities: {n_clusters}")

                # Step 3: Get or create identity records
                cluster_to_identity = _create_identities(cur, labels)

                # Step 4: Create identity bindings
                bindings = _bind_traces(cur, trace_ids, labels, cluster_to_identity)

                conn.commit()
                print(f"Created {bindings} identity bindings for {n_clusters} identities")

            _print_summary(cur)
    finally:
        # Closing without a commit discards uncommitted work, so a failure
        # part-way through leaves the tables untouched.
        conn.close()
    print("=== Done ===")


if __name__ == "__main__":
    main()