Files
momentry_core/scripts/match_identities_to_tmdb.py
Accusys ffc30d7377 M4 handover: coordinate fixes, detector registry, deploy v2, YOLOv8s, identity lifecycle
- Fix swift_pose/swift_ocr Y-flip bugs (BUG-003~006)
- Add heuristic_scene module + post-processing trigger (replaces Places365)
- YOLOv5nu → YOLOv8s CoreML (+33% detections, +390% scene indicators)
- Per-table SQL export (split 4.7GB single file → 478MB max per table)
- Version/build check in deploy.sh (compare /health vs file_info.json)
- Add file_uuid column to identities table + backfill
- Identity pre-clean step in deploy (avoids UNIQUE conflicts on re-deploy)
- Stranger_xxx naming fix with UUID context
- Add DETECTOR_REGISTRY.md (25 detectors), DETECTOR_SELECTION_SOP.md
- Update SPATIAL_COORDINATE_REGISTRY.md (P layer, 6-layer architecture)
- New IDENTITY_LIFECYCLE.md
- M4 response docs for deploy_script_fix and 111614 test report
2026-05-13 20:00:47 +08:00

134 lines
4.1 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
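Match auto-generated identities to TMDB identities via centroid embedding similarity.
For each match at or above THRESHOLD, re-points the file's face_detections and the
auto identity's identity_bindings to the TMDB identity, then marks the auto identity
as source='merged' with the matched tmdb_id.
If <file_uuid> is omitted, a hard-coded default file UUID is used.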
Usage: python3 match_identities_to_tmdb.py <file_uuid>
"""
import sys
import psycopg2
import psycopg2.extras
import numpy as np
# Connection string handed to psycopg2.connect() in main().
DB = "dbname=momentry user=accusys host=localhost"
# Minimum cosine similarity (inclusive) between an auto identity's centroid and
# a TMDB embedding for the auto identity to be merged into that TMDB identity.
THRESHOLD = 0.55
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The result is the dot product divided by the product of the two
    Euclidean norms. If either vector has a norm of zero, 0.0 is returned
    instead of dividing by zero.
    """
    # The dot product is evaluated first so that incompatible shapes raise
    # before the zero-norm shortcut can return.
    numerator = np.dot(a, b)
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
    if not norm_a or not norm_b:
        return 0.0
    return numerator / (norm_a * norm_b)
def _parse_embedding(raw, dtype=np.float64):
    """Convert a stored embedding value into a 1-D NumPy array.

    Accepts either a bracketed, comma-separated text form such as
    "[0.1,0.2,...]" (what ``face_embedding::text`` yields) or an already
    decoded sequence of numbers. Returns None for NULL or empty values so
    callers can skip them.
    """
    if raw is None:
        return None
    if isinstance(raw, str):
        body = raw.strip().strip("[]").strip()
        if not body:
            return None
        return np.array([float(x) for x in body.split(",")], dtype=dtype)
    arr = np.asarray(raw, dtype=dtype)
    return arr if arr.size else None


def _load_tmdb_identities(cur):
    """Return TMDB identities (id, name, tmdb_id, embedding) that have a face embedding."""
    cur.execute("""
        SELECT id, name, tmdb_id, face_embedding::text as emb_text
        FROM dev.identities
        WHERE source = 'tmdb' AND face_embedding IS NOT NULL
    """)
    identities = []
    for row in cur.fetchall():
        emb = _parse_embedding(row["emb_text"])
        if emb is None:
            continue
        identities.append({
            "id": row["id"],
            "name": row["name"],
            "tmdb_id": row["tmdb_id"],
            "embedding": emb,
        })
    return identities


def _identity_centroid(cur, file_uuid, identity_id):
    """Return the mean face embedding of one identity within one file, or None if it has none."""
    cur.execute("""
        SELECT embedding
        FROM dev.face_detections
        WHERE file_uuid = %s AND identity_id = %s AND embedding IS NOT NULL
        LIMIT 500
    """, (file_uuid, identity_id))
    # NOTE(review): the column type of face_detections.embedding is not visible
    # here; _parse_embedding handles both a decoded list and a text form.
    parsed = (_parse_embedding(r["embedding"], dtype=np.float32) for r in cur.fetchall())
    embeddings = [e for e in parsed if e is not None]
    if not embeddings:
        return None
    return np.mean(embeddings, axis=0)


def _best_match(centroid, tmdb_identities):
    """Return (tmdb_identity, similarity) for the most similar TMDB identity.

    Only strictly positive similarities are considered; returns (None, 0.0)
    when no candidate scores above zero. Ties keep the earliest candidate.
    """
    best_sim = 0.0
    best_tmdb = None
    for tmdb in tmdb_identities:
        sim = cosine_similarity(centroid, tmdb["embedding"])
        if sim > best_sim:
            best_sim = sim
            best_tmdb = tmdb
    return best_tmdb, best_sim


def _merge_into_tmdb(cur, file_uuid, auto_id, tmdb):
    """Re-point detections and bindings from an auto identity to a TMDB identity and mark it merged."""
    # Detections are re-pointed only for the file being processed.
    cur.execute("""
        UPDATE dev.face_detections
        SET identity_id = %s
        WHERE file_uuid = %s AND identity_id = %s
    """, (tmdb["id"], file_uuid, auto_id))
    # NOTE(review): bindings are re-pointed without a file_uuid filter, unlike
    # the detections above -- confirm this asymmetry is intended.
    cur.execute("""
        UPDATE dev.identity_bindings
        SET identity_id = %s
        WHERE identity_id = %s
    """, (tmdb["id"], auto_id))
    # The auto identity row is kept and flagged as merged rather than deleted.
    cur.execute("""
        UPDATE dev.identities
        SET source = 'merged', tmdb_id = %s
        WHERE id = %s
    """, (tmdb["tmdb_id"], auto_id))


def main():
    """Merge a file's auto identities into their best-matching TMDB identities.

    Reads the file UUID from ``sys.argv[1]`` (falling back to a hard-coded
    default), computes a centroid embedding per auto identity from that
    file's face detections, and merges every identity whose best cosine
    similarity against the TMDB embeddings is at least ``THRESHOLD``.

    All updates are committed in a single transaction at the end. If an
    exception occurs, nothing is committed and the connection is still
    closed.
    """
    file_uuid = sys.argv[1] if len(sys.argv) > 1 else "aeed71342a899fe4b4c57b7d41bcb692"
    conn = psycopg2.connect(DB)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            tmdb_identities = _load_tmdb_identities(cur)
            print(f"Loaded {len(tmdb_identities)} TMDB identities with embeddings")
            if not tmdb_identities:
                print("No TMDB identities found. Run tmdb_embed_extractor.py first.")
                return

            # Auto identities that have at least one face detection in this file.
            cur.execute("""
                SELECT DISTINCT i.id, i.name
                FROM dev.identities i
                INNER JOIN dev.face_detections fd ON fd.identity_id = i.id
                WHERE fd.file_uuid = %s AND i.source = 'auto'
            """, (file_uuid,))
            auto_rows = cur.fetchall()
            print(f"Auto identities for {file_uuid[:8]}...: {len(auto_rows)}")

            matched = 0
            for row in auto_rows:
                auto_id = row["id"]
                auto_name = row["name"]
                centroid = _identity_centroid(cur, file_uuid, auto_id)
                if centroid is None:
                    continue
                best_tmdb, best_sim = _best_match(centroid, tmdb_identities)
                if best_tmdb is not None and best_sim >= THRESHOLD:
                    fm = best_tmdb["name"]
                    print(f" {auto_name} -> {fm} (sim={best_sim:.3f})")
                    _merge_into_tmdb(cur, file_uuid, auto_id, best_tmdb)
                    matched += 1

            conn.commit()
            print(f"\nMatched {matched}/{len(auto_rows)} auto identities to TMDB")
            print(f"Threshold: {THRESHOLD}")
    finally:
        # Closing without a commit discards any uncommitted updates, so a
        # failure part-way through leaves the database unchanged.
        conn.close()


if __name__ == "__main__":
    main()