M4 handover: coordinate fixes, detector registry, deploy v2, YOLOv8s, identity lifecycle

- Fix swift_pose/swift_ocr Y-flip bugs (BUG-003~006)
- Add heuristic_scene module + post-processing trigger (replaces Places365)
- YOLOv5nu → YOLOv8s CoreML (+33% detections, +390% scene indicators)
- Per-table SQL export (split 4.7GB single file → 478MB max per table)
- Version/build check in deploy.sh (compare /health vs file_info.json)
- Add file_uuid column to identities table + backfill
- Identity pre-clean step in deploy (avoids UNIQUE conflicts on re-deploy)
- Stranger_xxx naming fix with UUID context
- Add DETECTOR_REGISTRY.md (25 detectors), DETECTOR_SELECTION_SOP.md
- Update SPATIAL_COORDINATE_REGISTRY.md (P layer, 6-layer architecture)
- New IDENTITY_LIFECYCLE.md
- M4 response docs for deploy_script_fix and 111614 test report
2026-05-13 20:00:47 +08:00
parent d34bcae145
commit ffc30d7377
25 changed files with 2219 additions and 118 deletions
--- a/scripts/match_identities_to_tmdb.py
+++ b/scripts/match_identities_to_tmdb.py
@@ -0,0 +1,133 @@
+#!/opt/homebrew/bin/python3.11
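+"""
+Match auto-generated identities to TMDB identities via centroid embedding similarity.
+
+For each match at or above THRESHOLD, re-points the file's face_detections rows and
+the identity_bindings rows from the auto identity to the matched TMDB identity, then
+marks the auto identity as source='merged' and records the matched tmdb_id on it.
+
+Usage: python3 match_identities_to_tmdb.py <file_uuid>
+(If <file_uuid> is omitted, a hard-coded default UUID is used.)
+"""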
+import sys
+import psycopg2
+import psycopg2.extras
+import numpy as np
+
+# Connection string (keyword=value form) passed to psycopg2.connect() in main().
+DB = "dbname=momentry user=accusys host=localhost"
+# Minimum cosine similarity (inclusive) required to accept a TMDB match.
+THRESHOLD = 0.55
+
+
+def cosine_similarity(a, b):
+    """Return the cosine of the angle between vectors ``a`` and ``b``.
+
+    If either vector has zero Euclidean norm the similarity is undefined,
+    and ``0.0`` is returned instead of dividing by zero.
+    """
+    inner = np.dot(a, b)
+    norm_a = np.linalg.norm(a)
+    norm_b = np.linalg.norm(b)
+    if norm_a != 0 and norm_b != 0:
+        return inner / (norm_a * norm_b)
+    return 0.0
+
+
+def _parse_embedding(value, dtype=None):
+    """Convert a database embedding value into a 1-D numpy array.
+
+    Accepts either a sequence of numbers or the textual vector form
+    ``"[v1,v2,...]"``, so the result does not depend on whether the driver
+    returns the column as a list or as text.
+    """
+    if isinstance(value, str):
+        value = [float(x) for x in value.strip("[]").split(",")]
+    return np.array(value, dtype=dtype)
+
+
+def _load_tmdb_identities(cur):
+    """Return every TMDB identity that has a face embedding, as a list of dicts."""
+    # The embedding is cast to text in SQL and parsed client-side.
+    cur.execute("""
+        SELECT id, name, tmdb_id, face_embedding::text as emb_text
+        FROM dev.identities
+        WHERE source = 'tmdb' AND face_embedding IS NOT NULL
+    """)
+    return [
+        {
+            "id": row["id"],
+            "name": row["name"],
+            "tmdb_id": row["tmdb_id"],
+            "embedding": _parse_embedding(row["emb_text"]),
+        }
+        for row in cur.fetchall()
+        if row["emb_text"]
+    ]
+
+
+def _best_match(centroid, tmdb_identities):
+    """Return ``(identity, similarity)`` for the TMDB identity closest to ``centroid``.
+
+    ``identity`` is ``None`` when no candidate has a similarity above 0.0.
+    """
+    best_sim = 0.0
+    best_tmdb = None
+    for tmdb in tmdb_identities:
+        sim = cosine_similarity(centroid, tmdb["embedding"])
+        if sim > best_sim:
+            best_sim = sim
+            best_tmdb = tmdb
+    return best_tmdb, best_sim
+
+
+def main():
+    """Merge one file's auto identities into their best-matching TMDB identities.
+
+    The file UUID is read from ``sys.argv[1]``; a hard-coded default is used
+    when no argument is given. For each auto identity linked to the file, a
+    centroid of its face-detection embeddings is compared against every TMDB
+    embedding. When the best cosine similarity is at least ``THRESHOLD``, the
+    file's face_detections rows and the identity_bindings rows are re-pointed
+    to the TMDB identity and the auto identity is marked ``source='merged'``.
+
+    All updates are committed once at the end. If anything raises before the
+    commit, the connection is closed without committing, so nothing is
+    persisted.
+    """
+    uuid = sys.argv[1] if len(sys.argv) > 1 else "aeed71342a899fe4b4c57b7d41bcb692"
+    conn = psycopg2.connect(DB)
+    try:
+        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+            # Load TMDB identities with face_embedding (pgvector)
+            tmdb_identities = _load_tmdb_identities(cur)
+            print(f"Loaded {len(tmdb_identities)} TMDB identities with embeddings")
+
+            if not tmdb_identities:
+                print("No TMDB identities found. Run tmdb_embed_extractor.py first.")
+                return
+
+            # Get auto identities that have at least one face detection in this file
+            cur.execute("""
+                SELECT DISTINCT i.id, i.name
+                FROM dev.identities i
+                INNER JOIN dev.face_detections fd ON fd.identity_id = i.id
+                WHERE fd.file_uuid = %s AND i.source = 'auto'
+            """, (uuid,))
+            auto_rows = cur.fetchall()
+            print(f"Auto identities for {uuid[:8]}...: {len(auto_rows)}")
+
+            matched = 0
+            for row in auto_rows:
+                auto_id = row["id"]
+                auto_name = row["name"]
+
+                # Sample up to 500 embeddings for this identity. There is no
+                # ORDER BY, so which rows are sampled is up to the database.
+                cur.execute("""
+                    SELECT embedding
+                    FROM dev.face_detections
+                    WHERE file_uuid = %s AND identity_id = %s AND embedding IS NOT NULL
+                    LIMIT 500
+                """, (uuid, auto_id))
+                emb_rows = cur.fetchall()
+                if not emb_rows:
+                    continue
+
+                # Compute centroid
+                all_embs = [
+                    _parse_embedding(r["embedding"], dtype=np.float32)
+                    for r in emb_rows
+                ]
+                centroid = np.mean(all_embs, axis=0)
+
+                # Match against TMDB identities
+                best_tmdb, best_sim = _best_match(centroid, tmdb_identities)
+                if not (best_tmdb and best_sim >= THRESHOLD):
+                    continue
+
+                fm = best_tmdb["name"]
+                tmdb_identity_id = best_tmdb["id"]
+                print(f"  {auto_name} → {fm} (sim={best_sim:.3f})")
+
+                # Update face_detections to point to TMDB identity
+                cur.execute("""
+                    UPDATE dev.face_detections
+                    SET identity_id = %s
+                    WHERE file_uuid = %s AND identity_id = %s
+                """, (tmdb_identity_id, uuid, auto_id))
+
+                # Update identity_bindings to point to TMDB identity.
+                # NOTE(review): unlike the face_detections update above, this
+                # is not restricted to the current file — confirm that an auto
+                # identity never spans more than one file.
+                cur.execute("""
+                    UPDATE dev.identity_bindings
+                    SET identity_id = %s
+                    WHERE identity_id = %s
+                """, (tmdb_identity_id, auto_id))
+
+                # Mark auto identity as merged (or we could delete it)
+                cur.execute("""
+                    UPDATE dev.identities
+                    SET source = 'merged', tmdb_id = %s
+                    WHERE id = %s
+                """, (best_tmdb["tmdb_id"], auto_id))
+
+                matched += 1
+
+            conn.commit()
+            print(f"\nMatched {matched}/{len(auto_rows)} auto identities to TMDB")
+            print(f"Threshold: {THRESHOLD}")
+    finally:
+        # Always release the connection; closing without a commit discards
+        # any uncommitted changes.
+        conn.close()
+
+
+# Run the matcher only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()