feat: trace quality agent selection report, identity clustering runner_v2 DB write, age/gender CoreML selection, updated experiment config UUID
This commit is contained in:
196
scripts/tmdb_embed_extractor.py
Normal file
196
scripts/tmdb_embed_extractor.py
Normal file
@@ -0,0 +1,196 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
TMDb Face Embedding Extractor V2.0
|
||||
|
||||
Reads TMDb-sourced identities from DB, downloads profile photos,
|
||||
extracts embeddings using CoreML FaceNet (same model as Face V2.0 pipeline).
|
||||
|
||||
V2.0 change: replaced InsightFace embedding with CoreML FaceNet
|
||||
for embedding space compatibility with face_processor.py V2.0.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
import tempfile
|
||||
import argparse
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
# Put this script's own directory first on the import path.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, _SCRIPT_DIR)

# Base URL for TMDb-hosted images (the trailing segment selects the "w185" size).
TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w185"
|
||||
|
||||
|
||||
def get_db_connection(schema="dev"):
    """Open an autocommit PostgreSQL connection.

    The connection string is taken from the ``DATABASE_URL`` environment
    variable; when it is unset, a local ``momentry`` database is used.

    Args:
        schema: Accepted but not used by this function; no search path or
            schema is configured on the connection.

    Returns:
        A ``psycopg2`` connection with ``autocommit`` set to ``True``.
    """
    # Imported at call time, so importing this module does not need psycopg2.
    import psycopg2

    dsn = os.environ.get(
        "DATABASE_URL", "postgres://accusys@localhost:5432/momentry"
    )
    connection = psycopg2.connect(dsn)
    connection.autocommit = True
    return connection
|
||||
|
||||
|
||||
def load_coreml_facenet():
    """Load the CoreML FaceNet model used for embedding extraction.

    The model package is looked up at ``../models/facenet512.mlpackage``
    relative to the directory containing this script.

    Returns:
        The loaded ``coremltools`` ``MLModel``, or ``None`` when
        ``coremltools`` cannot be imported, the model package does not exist,
        or loading raises. Every failure is reported on stderr.
    """
    try:
        import coremltools as ct
    except ImportError:
        print("[TMDB-EMBED] coremltools not installed, cannot extract embeddings", file=sys.stderr)
        return None

    script_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = os.path.normpath(
        os.path.join(script_dir, "..", "models", "facenet512.mlpackage")
    )

    model_present = os.path.exists(model_path)
    if not model_present:
        print(f"[TMDB-EMBED] CoreML model not found: {model_path}", file=sys.stderr)
        return None

    try:
        facenet = ct.models.MLModel(model_path)
        print(f"[TMDB-EMBED] CoreML FaceNet loaded: {model_path}", file=sys.stderr)
        return facenet
    except Exception as e:
        print(f"[TMDB-EMBED] Failed to load CoreML model: {e}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def load_insightface_detector():
    """Load InsightFace for face detection only (not embedding).

    Returns:
        A prepared ``FaceAnalysis`` app (``buffalo_l`` model pack, CPU
        execution provider, detection threshold 0.5), or ``None`` when
        ``insightface`` cannot be imported or the detector fails to
        initialise. Every failure is reported on stderr.
    """
    try:
        from insightface.app import FaceAnalysis
    except ImportError:
        print("[TMDB-EMBED] insightface not installed, cannot detect faces", file=sys.stderr)
        return None

    # Construction and preparation can raise at runtime. Report the error and
    # return None (the same contract as load_coreml_facenet) so the caller's
    # `is None` check covers this failure instead of an uncaught traceback.
    try:
        app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
        app.prepare(ctx_id=0, det_thresh=0.5)
    except Exception as e:
        print(f"[TMDB-EMBED] Failed to load InsightFace detector: {e}", file=sys.stderr)
        return None
    return app
|
||||
|
||||
|
||||
def download_image(url):
    """Fetch *url* with ``urllib`` and return the raw response body.

    The request carries a browser-like ``User-Agent`` header and uses a
    15-second timeout.

    Args:
        url: Address of the resource to download.

    Returns:
        The response body as ``bytes``, or ``None`` if building or performing
        the request raised (the error is reported on stderr).
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        request = urllib.request.Request(url, headers=request_headers)
        response = urllib.request.urlopen(request, timeout=15)
        with response:
            payload = response.read()
    except Exception as e:
        print(f"[TMDB-EMBED] Failed to download {url}: {e}", file=sys.stderr)
        return None
    return payload
|
||||
|
||||
|
||||
def extract_embedding_v2(detector, coreml_model, image_bytes):
    """Detect the largest face in an encoded image and embed it with CoreML FaceNet.

    Args:
        detector: Object whose ``get(img)`` returns the detected faces, each
            exposing a ``bbox`` of ``(x1, y1, x2, y2)``.
        coreml_model: Object whose ``predict({"input": array})`` returns a
            mapping containing an output key that starts with ``"var_"``.
        image_bytes: Encoded image data to decode with OpenCV.

    Returns:
        The embedding as a flat list, or ``None`` when the image cannot be
        decoded, no face is detected, the clipped face box is empty, or any
        step raises (the error is reported on stderr).
    """

    def box_area(face):
        # Area of the detection's bounding box, used to rank faces by size.
        return (face.bbox[2] - face.bbox[0]) * (face.bbox[3] - face.bbox[1])

    try:
        encoded = np.frombuffer(image_bytes, np.uint8)
        frame = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
        if frame is None:
            return None

        # Step 1: face detection with InsightFace.
        detections = detector.get(frame)
        if not detections:
            return None

        # Keep only the largest face and clip its box to the image bounds.
        largest = max(detections, key=box_area)
        left, top, right, bottom = (int(coord) for coord in largest.bbox)
        frame_height, frame_width = frame.shape[0], frame.shape[1]
        left = max(0, left)
        top = max(0, top)
        right = min(frame_width, right)
        bottom = min(frame_height, bottom)

        if right <= left or bottom <= top:
            return None

        # Step 2: crop the face and resize it to the 160x160 model input.
        crop = cv2.resize(frame[top:bottom, left:right], (160, 160))

        # Step 3: scale pixels to [-1, 1], reorder HWC -> CHW, add a batch axis.
        # NOTE(review): cv2.imdecode yields BGR channel order and no colour
        # conversion happens before inference — confirm this matches what the
        # model (and the Face V2.0 pipeline) expects.
        scaled = crop.astype(np.float32) / 127.5 - 1.0
        batch = scaled.transpose(2, 0, 1)[np.newaxis, ...]

        outputs = coreml_model.predict({"input": batch})
        output_names = [name for name in outputs.keys() if name.startswith("var_")]
        return outputs[output_names[0]].flatten().tolist()

    except Exception as e:
        print(f"[TMDB-EMBED] Face extraction failed: {e}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: embed TMDb profile photos and store the results.

    Flow:
      1. Parse ``--schema`` (default ``dev``) and optional ``--identity-ids``.
      2. Load the CoreML FaceNet model and the InsightFace detector; exit with
         status 1 if either is unavailable.
      3. Select identities with a non-NULL ``tmdb_profile``: the requested
         ids, or every row with ``source = 'tmdb'`` when no ids are given.
      4. For each identity, download the photo, extract an embedding and
         write it to ``face_embedding``. Identities whose download or
         extraction fails are skipped.

    Progress and the final ``success/total`` summary are written to stderr.
    The database cursor and connection are closed on every exit path.

    Raises:
        SystemExit: status 1 when a model cannot be loaded, status 2 (via
            argparse) when ``--schema`` is not a plain identifier.
    """
    import re
    from contextlib import closing

    parser = argparse.ArgumentParser(description="Extract TMDB face embeddings V2.0")
    parser.add_argument("--schema", default="dev", help="DB schema")
    parser.add_argument("--identity-ids", nargs="*", type=int)
    args = parser.parse_args()

    # The schema name is spliced into the SQL text (identifiers cannot be
    # passed as bound parameters), so accept plain identifiers only.
    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", args.schema):
        parser.error(f"invalid schema name: {args.schema!r}")
    table = f"{args.schema}.identities"

    # Load models
    coreml = load_coreml_facenet()
    if coreml is None:
        sys.exit(1)

    detector = load_insightface_detector()
    if detector is None:
        sys.exit(1)

    success = 0
    # closing() guarantees cursor and connection are released on the early
    # return below and when an exception propagates.
    with closing(get_db_connection(args.schema)) as conn:
        with closing(conn.cursor()) as cur:
            # Fetch identities
            if args.identity_ids:
                placeholders = ",".join(["%s"] * len(args.identity_ids))
                cur.execute(
                    f"SELECT id, name, tmdb_profile FROM {table} "
                    f"WHERE id IN ({placeholders}) AND tmdb_profile IS NOT NULL",
                    args.identity_ids,
                )
            else:
                cur.execute(
                    f"SELECT id, name, tmdb_profile FROM {table} "
                    "WHERE source = 'tmdb' AND tmdb_profile IS NOT NULL "
                )

            rows = cur.fetchall()
            if not rows:
                print("[TMDB-EMBED] No identities to process", file=sys.stderr)
                return

            print(f"[TMDB-EMBED] Processing {len(rows)} identities with CoreML FaceNet V2.0", file=sys.stderr)

            for identity_id, name, profile_url in rows:
                if not profile_url:
                    continue

                # A value without a URL scheme can never be fetched by urllib;
                # resolve it against TMDB_IMAGE_BASE instead. Assumes such
                # values are TMDb-style relative paths like "/abc.jpg" —
                # TODO confirm against how tmdb_profile is populated.
                if "://" in profile_url:
                    photo_url = profile_url
                else:
                    photo_url = f"{TMDB_IMAGE_BASE}/{profile_url.lstrip('/')}"

                print(f"[TMDB-EMBED] Processing: {name} (id={identity_id})", file=sys.stderr)
                image_bytes = download_image(photo_url)
                if image_bytes is None:
                    continue

                embedding = extract_embedding_v2(detector, coreml, image_bytes)
                if embedding is None:
                    print(f"[TMDB-EMBED] No face detected for: {name}", file=sys.stderr)
                    continue

                cur.execute(
                    f"UPDATE {table} SET face_embedding = %s WHERE id = %s",
                    (embedding, identity_id),
                )
                success += 1
                print(f"[TMDB-EMBED] ✓ Updated {name} (CoreML FaceNet, dim={len(embedding)})", file=sys.stderr)

    print(f"[TMDB-EMBED] Complete: {success}/{len(rows)} embeddings extracted (CoreML FaceNet V2.0)", file=sys.stderr)
|
||||
|
||||
|
||||
# Script entry point. main() must run exactly once per invocation: a second
# guard would repeat every download, inference and database update.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user