- Fix swift_pose/swift_ocr Y-flip bugs (BUG-003~006) - Add heuristic_scene module + post-processing trigger (replaces Places365) - YOLOv5nu → YOLOv8s CoreML (+33% detections, +390% scene indicators) - Per-table SQL export (split 4.7GB single file → 478MB max per table) - Version/build check in deploy.sh (compare /health vs file_info.json) - Add file_uuid column to identities table + backfill - Identity pre-clean step in deploy (avoids UNIQUE conflicts on re-deploy) - Stranger_xxx naming fix with UUID context - Add DETECTOR_REGISTRY.md (25 detectors), DETECTOR_SELECTION_SOP.md - Update SPATIAL_COORDINATE_REGISTRY.md (P layer, 6-layer architecture) - New IDENTITY_LIFECYCLE.md - M4 response docs for deploy_script_fix and 111614 test report
134 lines
4.1 KiB
Python
134 lines
4.1 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Match auto-generated identities to TMDB identities via centroid embedding similarity.
|
|
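For each match at or above the threshold, repoints face_detections and identity_bindings to the TMDB identity and marks the auto identity as source='merged' with the matched tmdb_id.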
|
|
|
|
Usage: python3 match_identities_to_tmdb.py [file_uuid]  (a built-in default UUID is used when omitted)
|
|
"""
|
|
import sys
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
import numpy as np
|
|
|
|
# Connection string (key=value DSN) handed to psycopg2.connect() in main().
DB = "dbname=momentry user=accusys host=localhost"
|
|
# Minimum cosine similarity (inclusive, compared with >=) between an auto
# identity's centroid and a TMDB embedding for the pair to count as a match.
THRESHOLD = 0.55
|
|
|
|
|
|
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The result is ``dot(a, b) / (|a| * |b|)``. When either vector has a
    Euclidean norm of exactly zero the similarity is undefined, so ``0.0``
    is returned instead of dividing by zero.
    """
    inner = np.dot(a, b)
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
    # Proceed only when both norms are non-zero.
    if not (norm_a != 0 and norm_b != 0):
        return 0.0
    denominator = norm_a * norm_b
    return inner / denominator
|
|
|
|
|
|
def _parse_vector_text(text):
    """Parse a bracketed, comma-separated vector string into a float array."""
    return np.array([float(x) for x in text.strip("[]").split(",")])


def _load_tmdb_identities(cur):
    """Return TMDB identities (source = 'tmdb') that have a face embedding.

    Each entry is a dict with keys "id", "name", "tmdb_id" and "embedding",
    where "embedding" is parsed from the text cast of ``face_embedding``.
    """
    cur.execute("""
        SELECT id, name, tmdb_id, face_embedding::text as emb_text
        FROM dev.identities
        WHERE source = 'tmdb' AND face_embedding IS NOT NULL
    """)
    return [
        {
            "id": row["id"],
            "name": row["name"],
            "tmdb_id": row["tmdb_id"],
            "embedding": _parse_vector_text(row["emb_text"]),
        }
        for row in cur.fetchall()
        if row["emb_text"]  # skip rows whose text cast came back empty
    ]


def _identity_centroid(cur, uuid, auto_id):
    """Return the mean embedding of an identity's detections, or None if none."""
    # NOTE(review): LIMIT without ORDER BY means which 500 rows are sampled
    # is up to the database -- confirm that is acceptable.
    cur.execute("""
        SELECT embedding
        FROM dev.face_detections
        WHERE file_uuid = %s AND identity_id = %s AND embedding IS NOT NULL
        LIMIT 500
    """, (uuid, auto_id))
    emb_rows = cur.fetchall()
    if not emb_rows:
        return None
    # NOTE(review): assumes the driver returns `embedding` as a numeric
    # sequence (unlike face_embedding above, which is cast to text) -- confirm.
    all_embs = [np.array(r["embedding"], dtype=np.float32) for r in emb_rows]
    return np.mean(all_embs, axis=0)


def _best_tmdb_match(centroid, tmdb_identities):
    """Return ``(identity, similarity)`` for the most similar TMDB identity.

    ``identity`` is None when no candidate has similarity strictly above 0.
    """
    best_sim = 0.0
    best_tmdb = None
    for tmdb in tmdb_identities:
        sim = cosine_similarity(centroid, tmdb["embedding"])
        if sim > best_sim:
            best_sim = sim
            best_tmdb = tmdb
    return best_tmdb, best_sim


def _merge_auto_into_tmdb(cur, uuid, auto_id, tmdb):
    """Repoint rows from the auto identity to ``tmdb`` and mark it merged."""
    # Update face_detections (this file only) to point to the TMDB identity.
    cur.execute("""
        UPDATE dev.face_detections
        SET identity_id = %s
        WHERE file_uuid = %s AND identity_id = %s
    """, (tmdb["id"], uuid, auto_id))

    # Update identity_bindings to point to the TMDB identity.
    # NOTE(review): unlike the statement above, this is not restricted to the
    # current file -- confirm that rebinding everywhere is intended.
    cur.execute("""
        UPDATE dev.identity_bindings
        SET identity_id = %s
        WHERE identity_id = %s
    """, (tmdb["id"], auto_id))

    # Mark the auto identity as merged rather than deleting it.
    cur.execute("""
        UPDATE dev.identities
        SET source = 'merged', tmdb_id = %s
        WHERE id = %s
    """, (tmdb["tmdb_id"], auto_id))


def main():
    """Match a file's auto identities to TMDB identities and merge the hits.

    The file UUID is read from ``sys.argv[1]``; a built-in default is used
    when no argument is given. For every auto identity linked to the file,
    the centroid of its face embeddings is compared against each TMDB
    embedding; if the best cosine similarity is >= THRESHOLD the identity is
    merged into that TMDB identity.

    All updates are committed once at the end. If anything raises first, the
    connection is closed without committing, so no partial merge is persisted.
    """
    uuid = sys.argv[1] if len(sys.argv) > 1 else "aeed71342a899fe4b4c57b7d41bcb692"
    conn = psycopg2.connect(DB)
    try:
        # The cursor context manager closes the cursor on every exit path.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            tmdb_identities = _load_tmdb_identities(cur)
            print(f"Loaded {len(tmdb_identities)} TMDB identities with embeddings")

            if not tmdb_identities:
                print("No TMDB identities found. Run tmdb_embed_extractor.py first.")
                return

            # Auto identities that have at least one detection in this file.
            cur.execute("""
                SELECT DISTINCT i.id, i.name
                FROM dev.identities i
                INNER JOIN dev.face_detections fd ON fd.identity_id = i.id
                WHERE fd.file_uuid = %s AND i.source = 'auto'
            """, (uuid,))
            auto_rows = cur.fetchall()
            print(f"Auto identities for {uuid[:8]}...: {len(auto_rows)}")

            matched = 0
            for row in auto_rows:
                auto_id = row["id"]
                auto_name = row["name"]

                centroid = _identity_centroid(cur, uuid, auto_id)
                if centroid is None:
                    continue

                best_tmdb, best_sim = _best_tmdb_match(centroid, tmdb_identities)
                if best_tmdb and best_sim >= THRESHOLD:
                    fm = best_tmdb["name"]
                    print(f" {auto_name} → {fm} (sim={best_sim:.3f})")
                    _merge_auto_into_tmdb(cur, uuid, auto_id, best_tmdb)
                    matched += 1

            conn.commit()
            print(f"\nMatched {matched}/{len(auto_rows)} auto identities to TMDB")
            print(f"Threshold: {THRESHOLD}")
    finally:
        # Always release the connection, including on the early return and
        # on errors (the original leaked it when an exception was raised).
        conn.close()
|
|
|
|
|
|
# Run the matcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
|
|
main()
|