M4 handover: coordinate fixes, detector registry, deploy v2, YOLOv8s, identity lifecycle
- Fix swift_pose/swift_ocr Y-flip bugs (BUG-003~006)
- Add heuristic_scene module + post-processing trigger (replaces Places365)
- YOLOv5nu → YOLOv8s CoreML (+33% detections, +390% scene indicators)
- Per-table SQL export (split 4.7GB single file → 478MB max per table)
- Version/build check in deploy.sh (compare /health vs file_info.json)
- Add file_uuid column to identities table + backfill
- Identity pre-clean step in deploy (avoids UNIQUE conflicts on re-deploy)
- Stranger_xxx naming fix with UUID context
- Add DETECTOR_REGISTRY.md (25 detectors), DETECTOR_SELECTION_SOP.md
- Update SPATIAL_COORDINATE_REGISTRY.md (P layer, 6-layer architecture)
- New IDENTITY_LIFECYCLE.md
- M4 response docs for deploy_script_fix and 111614 test report
This commit is contained in:
133
scripts/match_identities_to_tmdb.py
Normal file
133
scripts/match_identities_to_tmdb.py
Normal file
@@ -0,0 +1,133 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Match auto-generated identities to TMDB identities via centroid embedding similarity.
|
||||
Updates identity name, tmdb_id, source for matches above threshold.
|
||||
|
||||
Usage: python3 match_identities_to_tmdb.py <file_uuid>
|
||||
"""
|
||||
import sys
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import numpy as np
|
||||
|
||||
# Connection string handed to psycopg2.connect() in main().
DB = "dbname=momentry user=accusys host=localhost"
|
||||
# Minimum cosine similarity between an auto identity's centroid and a TMDB
# embedding for the pair to be accepted as a match (compared with >= in main()).
THRESHOLD = 0.55
|
||||
|
||||
|
||||
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors as a builtin float.

    Args:
        a: Array-like of numbers (list, tuple or ndarray). Flattened to 1-D.
        b: Array-like of numbers with the same number of elements as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a Python ``float``.
        Returns ``0.0`` when either vector has zero norm, so callers never
        hit a division by zero.

    Raises:
        ValueError: If the two vectors do not have the same number of
            elements (raised by ``np.dot``).
    """
    # Coerce both sides to flattened float64 so that mixed float32/float64
    # inputs and plain Python sequences are all handled the same way.
    vec_a = np.asarray(a, dtype=np.float64).ravel()
    vec_b = np.asarray(b, dtype=np.float64).ravel()

    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0

    # float() keeps the return type identical on both code paths
    # (builtin float rather than a NumPy scalar).
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
|
||||
|
||||
|
||||
def _parse_embedding(raw, dtype):
    """Convert a database embedding value into a 1-D NumPy vector.

    Accepts the text form of a vector ("[0.1,0.2,...]" or "{0.1,0.2,...}")
    as well as list/array values. Returns None when the value is missing or
    has no components.
    """
    if raw is None:
        return None
    if isinstance(raw, str):
        body = raw.strip().strip("[]{}")
        if not body:
            return None
        return np.array([float(tok) for tok in body.split(",")], dtype=dtype)
    vec = np.asarray(raw, dtype=dtype).ravel()
    return vec if vec.size else None


def _load_tmdb_identities(cur):
    """Load every TMDB identity that has a face embedding, with it parsed."""
    cur.execute("""
        SELECT id, name, tmdb_id, face_embedding::text as emb_text
        FROM dev.identities
        WHERE source = 'tmdb' AND face_embedding IS NOT NULL
    """)
    identities = []
    for row in cur.fetchall():
        embedding = _parse_embedding(row["emb_text"], np.float64)
        if embedding is None:
            continue
        identities.append({
            "id": row["id"],
            "name": row["name"],
            "tmdb_id": row["tmdb_id"],
            "embedding": embedding,
        })
    return identities


def _identity_centroid(cur, file_uuid, identity_id):
    """Return the mean face embedding of one identity in one file, or None."""
    # NOTE(review): LIMIT without ORDER BY means the sampled 500 rows are not
    # guaranteed to be the same between runs - confirm that is acceptable.
    cur.execute("""
        SELECT embedding
        FROM dev.face_detections
        WHERE file_uuid = %s AND identity_id = %s AND embedding IS NOT NULL
        LIMIT 500
    """, (file_uuid, identity_id))
    vectors = [
        vec
        for vec in (
            _parse_embedding(r["embedding"], np.float32) for r in cur.fetchall()
        )
        if vec is not None
    ]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


def _best_match(centroid, tmdb_identities):
    """Return (tmdb_identity, similarity) for the most similar TMDB identity.

    The identity is None when no candidate has a similarity above 0.0.
    """
    best_sim = 0.0
    best_tmdb = None
    for tmdb in tmdb_identities:
        sim = cosine_similarity(centroid, tmdb["embedding"])
        if sim > best_sim:
            best_sim = sim
            best_tmdb = tmdb
    return best_tmdb, best_sim


def _merge_into_tmdb(cur, file_uuid, auto_id, tmdb):
    """Re-point one auto identity's rows at a TMDB identity and mark it merged."""
    # Update face_detections (this file only) to point to the TMDB identity.
    cur.execute("""
        UPDATE dev.face_detections
        SET identity_id = %s
        WHERE file_uuid = %s AND identity_id = %s
    """, (tmdb["id"], file_uuid, auto_id))

    # Update identity_bindings to point to the TMDB identity.
    # NOTE(review): unlike the statement above this is not restricted to
    # file_uuid - presumably auto identities belong to a single file; confirm.
    cur.execute("""
        UPDATE dev.identity_bindings
        SET identity_id = %s
        WHERE identity_id = %s
    """, (tmdb["id"], auto_id))

    # Keep the auto identity row but mark it as merged instead of deleting it.
    cur.execute("""
        UPDATE dev.identities
        SET source = 'merged', tmdb_id = %s
        WHERE id = %s
    """, (tmdb["tmdb_id"], auto_id))


def main():
    """Match the auto identities of one file to TMDB identities.

    Reads the file UUID from ``sys.argv[1]``. For every ``source = 'auto'``
    identity that has face detections in that file, the centroid of its face
    embeddings is compared with every TMDB identity embedding by cosine
    similarity. When the best similarity is at least ``THRESHOLD`` the
    identity's face detections and identity bindings are re-pointed at the
    TMDB identity and the auto identity is marked ``'merged'``.

    All updates run in a single transaction: they are committed together on
    success and rolled back if anything raises. The connection is always
    closed.
    """
    # NOTE(review): the usage text presents <file_uuid> as required, yet a
    # hard-coded default is used when it is omitted. Kept for backward
    # compatibility - confirm that writing to this default file is intended.
    file_uuid = sys.argv[1] if len(sys.argv) > 1 else "aeed71342a899fe4b4c57b7d41bcb692"

    conn = psycopg2.connect(DB)
    try:
        # `with conn` commits when the block exits normally and rolls back on
        # an exception; the cursor context manager closes the cursor.
        with conn, conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            tmdb_identities = _load_tmdb_identities(cur)
            print(f"Loaded {len(tmdb_identities)} TMDB identities with embeddings")

            if not tmdb_identities:
                print("No TMDB identities found. Run tmdb_embed_extractor.py first.")
                return

            # Auto identities that have at least one face detection in this file.
            cur.execute("""
                SELECT DISTINCT i.id, i.name
                FROM dev.identities i
                INNER JOIN dev.face_detections fd ON fd.identity_id = i.id
                WHERE fd.file_uuid = %s AND i.source = 'auto'
            """, (file_uuid,))
            auto_rows = cur.fetchall()
            print(f"Auto identities for {file_uuid[:8]}...: {len(auto_rows)}")

            matched = 0
            for row in auto_rows:
                auto_id = row["id"]
                auto_name = row["name"]

                centroid = _identity_centroid(cur, file_uuid, auto_id)
                if centroid is None:
                    continue

                best_tmdb, best_sim = _best_match(centroid, tmdb_identities)
                if best_tmdb is None or best_sim < THRESHOLD:
                    continue

                fm = best_tmdb["name"]
                print(f" {auto_name} → {fm} (sim={best_sim:.3f})")
                _merge_into_tmdb(cur, file_uuid, auto_id, best_tmdb)
                matched += 1

        print(f"\nMatched {matched}/{len(auto_rows)} auto identities to TMDB")
        print(f"Threshold: {THRESHOLD}")
    finally:
        conn.close()
|
||||
|
||||
|
||||
# Run the matcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user