# momentry_core/scripts/tmdb_embed_extractor.py

#!/opt/homebrew/bin/python3.11
"""
TMDb Face Embedding Extractor V2.0

Reads TMDb-sourced identities from DB, downloads profile photos,
extracts embeddings using CoreML FaceNet (same model as Face V2.0 pipeline).

V2.0 change: replaced InsightFace embedding with CoreML FaceNet
for embedding space compatibility with face_processor.py V2.0.
"""

import json
import os
import sys
import urllib.request
import tempfile
import argparse
import numpy as np
import cv2

# Put this script's own directory first on sys.path so that modules located
# next to it can be imported by name.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Base URL for TMDb-hosted images; the "w185" path segment presumably selects
# a 185px-wide rendition — confirm against the TMDb image API.
# NOTE(review): this constant is not referenced anywhere in the visible code —
# confirm whether profile paths are meant to be joined onto it before download.
TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w185"


def get_db_connection(schema="dev"):
    """Open an autocommit PostgreSQL connection.

    The connection string is read from the ``DATABASE_URL`` environment
    variable; when it is unset, a local ``momentry`` database is used.

    Args:
        schema: Accepted for call-site compatibility. It is not used when
            opening the connection.

    Returns:
        A psycopg2 connection with ``autocommit`` switched on.
    """
    # Imported lazily so the module can be loaded without psycopg2 installed.
    from psycopg2 import connect

    dsn = os.environ.get("DATABASE_URL", "postgres://accusys@localhost:5432/momentry")
    connection = connect(dsn)
    connection.autocommit = True
    return connection


def load_coreml_facenet():
    """Load the CoreML FaceNet model (same as Face V2.0 pipeline).

    The model package is looked up at ``../models/facenet512.mlpackage``
    relative to the directory containing this file.

    Returns:
        The loaded coremltools ``MLModel``, or ``None`` when coremltools is
        not installed, the model package does not exist, or loading raises.
        Every failure is reported on stderr.
    """
    try:
        import coremltools
    except ImportError:
        print("[TMDB-EMBED] coremltools not installed, cannot extract embeddings", file=sys.stderr)
        return None

    script_dir = os.path.dirname(os.path.abspath(__file__))
    package_path = os.path.normpath(
        os.path.join(script_dir, "..", "models", "facenet512.mlpackage")
    )

    if not os.path.exists(package_path):
        print(f"[TMDB-EMBED] CoreML model not found: {package_path}", file=sys.stderr)
        return None

    try:
        facenet = coremltools.models.MLModel(package_path)
        print(f"[TMDB-EMBED] CoreML FaceNet loaded: {package_path}", file=sys.stderr)
        return facenet
    except Exception as err:
        print(f"[TMDB-EMBED] Failed to load CoreML model: {err}", file=sys.stderr)
        return None


def load_insightface_detector():
    """Load InsightFace for face detection only (not embedding).

    Returns:
        A prepared ``FaceAnalysis`` app (``buffalo_l`` model pack, CPU
        execution provider, detection threshold 0.5), or ``None`` when
        insightface is not installed or the detector fails to initialise.
        Every failure is reported on stderr.
    """
    try:
        from insightface.app import FaceAnalysis
    except ImportError:
        print("[TMDB-EMBED] insightface not installed, cannot detect faces", file=sys.stderr)
        return None

    # Constructing or preparing the detector can raise as well; report it and
    # return None so callers get the same "None means unavailable" contract
    # that load_coreml_facenet() provides, instead of an unhandled traceback.
    try:
        app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
        app.prepare(ctx_id=0, det_thresh=0.5)
    except Exception as e:
        print(f"[TMDB-EMBED] Failed to load InsightFace detector: {e}", file=sys.stderr)
        return None
    return app


def download_image(url):
    """Fetch the raw bytes served at *url*.

    The request carries a browser-like ``User-Agent`` header and uses a
    15-second timeout.

    Args:
        url: Address of the resource to download.

    Returns:
        The response body as ``bytes``, or ``None`` when building the request,
        opening it, or reading it raises (the failure is logged to stderr).
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request, timeout=15) as response:
            payload = response.read()
        return payload
    except Exception as err:
        print(f"[TMDB-EMBED] Failed to download {url}: {err}", file=sys.stderr)
        return None


def extract_embedding_v2(detector, coreml_model, image_bytes):
    """Detect a face with InsightFace and embed it with CoreML FaceNet.

    The image is decoded, the largest detected face is cropped (with its box
    clamped to the image bounds), resized to 160x160, scaled to [-1, 1],
    rearranged from HWC to CHW with a leading batch axis, and passed to the
    model.

    Args:
        detector: Object whose ``get(img)`` returns the detected faces, each
            exposing a ``bbox`` of four coordinates.
        coreml_model: Object whose ``predict({"input": array})`` returns a
            dict holding the embedding under a key starting with ``var_``.
        image_bytes: Encoded image data.

    Returns:
        The embedding as a flat list, or ``None`` when the image cannot be
        decoded, no face is found, the clamped box is empty, or any step
        raises (the error is logged to stderr).
    """

    def _bbox_area(face):
        # Width times height of the detection box.
        return (face.bbox[2] - face.bbox[0]) * (face.bbox[3] - face.bbox[1])

    try:
        raw = np.frombuffer(image_bytes, np.uint8)
        frame = cv2.imdecode(raw, cv2.IMREAD_COLOR)
        if frame is None:
            return None

        # Step 1: face detection (InsightFace)
        detections = detector.get(frame)
        if not detections:
            return None

        # Keep only the biggest face and clamp its box to the frame
        largest = max(detections, key=_bbox_area)
        coords = [int(c) for c in largest.bbox]
        left, top, right, bottom = coords
        left = max(0, left)
        top = max(0, top)
        right = min(frame.shape[1], right)
        bottom = min(frame.shape[0], bottom)

        if right <= left or bottom <= top:
            return None

        # Step 2: crop and resize to the 160x160 model input
        crop = cv2.resize(frame[top:bottom, left:right], (160, 160))

        # Step 3: scale to [-1, 1], HWC -> CHW, add the batch axis
        scaled = (crop.astype(np.float32) / 127.5) - 1.0
        batch = np.expand_dims(np.transpose(scaled, (2, 0, 1)), axis=0)

        outputs = coreml_model.predict({"input": batch})
        var_keys = [key for key in outputs.keys() if key.startswith("var_")]
        return outputs[var_keys[0]].flatten().tolist()

    except Exception as err:
        print(f"[TMDB-EMBED] Face extraction failed: {err}", file=sys.stderr)
        return None


def _embed_identities(cur, schema, identity_ids, detector, coreml):
    """Fetch the selected identities and store an embedding for each one.

    Args:
        cur: Open DB cursor used for the SELECT and the UPDATEs.
        schema: Name of the schema holding the ``identities`` table.
        identity_ids: Ids to restrict processing to; when empty or ``None``,
            every row with ``source = 'tmdb'`` is selected instead.
        detector: Face detector passed through to ``extract_embedding_v2``.
        coreml: CoreML model passed through to ``extract_embedding_v2``.

    Returns:
        A ``(success, total)`` tuple of counts, or ``None`` when the query
        matched no identities.
    """
    # NOTE(review): `schema` is interpolated into the SQL text as an
    # identifier (identifiers cannot be bound as query parameters), so it
    # must come from a trusted caller.
    table = f"{schema}.identities"

    # Fetch identities
    if identity_ids:
        placeholders = ",".join(["%s"] * len(identity_ids))
        cur.execute(
            f"SELECT id, name, tmdb_profile FROM {table} "
            f"WHERE id IN ({placeholders}) AND tmdb_profile IS NOT NULL",
            identity_ids
        )
    else:
        cur.execute(
            f"SELECT id, name, tmdb_profile FROM {table} "
            f"WHERE source = 'tmdb' AND tmdb_profile IS NOT NULL "
        )

    rows = cur.fetchall()
    if not rows:
        print("[TMDB-EMBED] No identities to process", file=sys.stderr)
        return None

    print(f"[TMDB-EMBED] Processing {len(rows)} identities with CoreML FaceNet V2.0", file=sys.stderr)
    success = 0

    for identity_id, name, profile_url in rows:
        if not profile_url:
            continue

        print(f"[TMDB-EMBED] Processing: {name} (id={identity_id})", file=sys.stderr)
        image_bytes = download_image(profile_url)
        if image_bytes is None:
            continue

        embedding = extract_embedding_v2(detector, coreml, image_bytes)
        if embedding is None:
            print(f"[TMDB-EMBED] No face detected for: {name}", file=sys.stderr)
            continue

        cur.execute(
            f"UPDATE {table} SET face_embedding = %s WHERE id = %s",
            (embedding, identity_id)
        )
        success += 1
        print(f"[TMDB-EMBED] ✓ Updated {name} (CoreML FaceNet, dim={len(embedding)})", file=sys.stderr)

    return success, len(rows)


def main():
    """Extract FaceNet embeddings for TMDb identities and store them in the DB.

    Command-line options:
        --schema: DB schema holding the ``identities`` table (default ``dev``).
        --identity-ids: Optional identity ids to process. When omitted or
            empty, every identity with ``source = 'tmdb'`` and a non-null
            ``tmdb_profile`` is processed.

    Exits with status 1 when either the CoreML model or the InsightFace
    detector cannot be loaded.
    """
    parser = argparse.ArgumentParser(description="Extract TMDB face embeddings V2.0")
    parser.add_argument("--schema", default="dev", help="DB schema")
    parser.add_argument("--identity-ids", nargs="*", type=int)
    args = parser.parse_args()

    # Load models
    coreml = load_coreml_facenet()
    if coreml is None:
        sys.exit(1)

    detector = load_insightface_detector()
    if detector is None:
        sys.exit(1)

    conn = get_db_connection(args.schema)
    # try/finally guarantees the cursor and connection are released on every
    # path, including the "no identities" early exit and any exception raised
    # while processing.
    try:
        cur = conn.cursor()
        try:
            counts = _embed_identities(cur, args.schema, args.identity_ids, detector, coreml)
        finally:
            cur.close()
    finally:
        conn.close()

    if counts is None:
        return

    success, total = counts
    print(f"[TMDB-EMBED] Complete: {success}/{total} embeddings extracted (CoreML FaceNet V2.0)", file=sys.stderr)


# Script entry point. The guard appears exactly once so the pipeline runs a
# single time per invocation.
if __name__ == "__main__":
    main()