#!/opt/homebrew/bin/python3.11
"""
TMDb Face Embedding Extractor V2.0

Reads TMDb-sourced identities from DB, downloads profile photos,
extracts embeddings using CoreML FaceNet (same model as Face V2.0 pipeline).

V2.0 change: replaced InsightFace embedding with CoreML FaceNet
for embedding space compatibility with face_processor.py V2.0.
"""

import json
import os
import re
import sys
import urllib.request
import tempfile
import argparse

import numpy as np

try:
    import cv2
except ImportError:
    # Reported by main() with the same "[TMDB-EMBED] ... not installed" style
    # used for the other optional dependencies, instead of a raw traceback.
    cv2 = None

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w185"

# Unquoted SQL identifier: the schema name is interpolated into SQL text
# (identifiers cannot be bound as query parameters), so it must be validated.
_SCHEMA_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_$]*")


def get_db_connection(schema="dev"):
    """Open an autocommit psycopg2 connection.

    The connection URL is read from the ``DATABASE_URL`` environment variable,
    falling back to the local development database.

    Args:
        schema: Accepted for interface compatibility; it is not used when
            connecting (callers schema-qualify their table names).

    Returns:
        A psycopg2 connection with ``autocommit`` enabled.
    """
    import psycopg2

    db_url = os.getenv("DATABASE_URL", "postgres://accusys@localhost:5432/momentry")
    conn = psycopg2.connect(db_url)
    conn.autocommit = True
    return conn


def load_coreml_facenet():
    """Load CoreML FaceNet model (same as Face V2.0 pipeline).

    Returns:
        The loaded ``MLModel``, or ``None`` when coremltools is missing, the
        model package does not exist, or loading fails (the reason is logged
        to stderr).
    """
    try:
        import coremltools as ct
    except ImportError:
        print("[TMDB-EMBED] coremltools not installed, cannot extract embeddings", file=sys.stderr)
        return None

    model_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..", "models", "facenet512.mlpackage"
    )
    model_path = os.path.normpath(model_path)

    if not os.path.exists(model_path):
        print(f"[TMDB-EMBED] CoreML model not found: {model_path}", file=sys.stderr)
        return None

    try:
        model = ct.models.MLModel(model_path)
        print(f"[TMDB-EMBED] CoreML FaceNet loaded: {model_path}", file=sys.stderr)
        return model
    except Exception as e:
        print(f"[TMDB-EMBED] Failed to load CoreML model: {e}", file=sys.stderr)
        return None


def load_insightface_detector():
    """Load InsightFace for face detection only (not embedding).

    Returns:
        A prepared ``FaceAnalysis`` app, or ``None`` when insightface is not
        installed.
    """
    try:
        import insightface  # noqa: F401  (availability check)
        from insightface.app import FaceAnalysis
    except ImportError:
        print("[TMDB-EMBED] insightface not installed, cannot detect faces", file=sys.stderr)
        return None

    app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
    app.prepare(ctx_id=0, det_thresh=0.5)
    return app


def download_image(url):
    """Fetch ``url`` and return the response body as bytes.

    Returns:
        The raw bytes, or ``None`` if the request fails for any reason (the
        error is logged to stderr; this is a best-effort download).
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=15) as resp:
            return resp.read()
    except Exception as e:
        print(f"[TMDB-EMBED] Failed to download {url}: {e}", file=sys.stderr)
        return None


def extract_embedding_v2(detector, coreml_model, image_bytes):
    """Detect face with InsightFace, embed with CoreML FaceNet.

    Args:
        detector: Object whose ``get(img)`` returns detected faces exposing a
            ``bbox`` of ``(x1, y1, x2, y2)``.
        coreml_model: Object whose ``predict({"input": array})`` returns a dict
            containing the embedding under a key starting with ``var_``.
        image_bytes: Encoded image data.

    Returns:
        The embedding as a flat list of floats, or ``None`` when the image
        cannot be decoded, no usable face is found, or any step fails.
    """
    try:
        nparr = np.frombuffer(image_bytes, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        if img is None:
            return None

        # Step 1: Detect face using InsightFace
        faces = detector.get(img)
        if not faces:
            return None

        # Use the largest face
        best = max(faces, key=lambda f: (f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1]))
        x1, y1, x2, y2 = [int(v) for v in best.bbox]
        # Clamp the box to the image; detectors can report coordinates
        # slightly outside the frame.
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(img.shape[1], x2), min(img.shape[0], y2)

        if x2 <= x1 or y2 <= y1:
            return None

        # Step 2: Crop face, resize to 160x160
        face_img = img[y1:y2, x1:x2]
        face_img = cv2.resize(face_img, (160, 160))

        # Step 3: CoreML FaceNet embedding
        # Normalize to [-1, 1], HWC -> CHW
        # NOTE(review): cv2 decodes to BGR and the channel order is passed
        # through unchanged -- confirm this matches face_processor.py V2.0.
        normalized = (face_img.astype(np.float32) / 127.5) - 1.0
        normalized = np.transpose(normalized, (2, 0, 1))
        input_array = np.expand_dims(normalized, axis=0)

        result = coreml_model.predict({"input": input_array})
        emb_key = next((k for k in result if k.startswith("var_")), None)
        if emb_key is None:
            print(
                f"[TMDB-EMBED] Face extraction failed: no 'var_*' output in {sorted(result)}",
                file=sys.stderr,
            )
            return None
        return result[emb_key].flatten().tolist()
    except Exception as e:
        print(f"[TMDB-EMBED] Face extraction failed: {e}", file=sys.stderr)
        return None


def _schema_name(value):
    """argparse ``type=`` callable: accept only a plain SQL identifier."""
    if not _SCHEMA_RE.fullmatch(value):
        raise argparse.ArgumentTypeError(f"invalid schema name: {value!r}")
    return value


def _resolve_profile_url(profile):
    """Return a downloadable URL for a stored ``tmdb_profile`` value.

    Values that already carry a URL scheme are returned unchanged.
    NOTE(review): anything else is assumed to be a TMDb profile path such as
    ``/abc.jpg`` and is joined onto TMDB_IMAGE_BASE -- confirm against the code
    that writes ``tmdb_profile``.
    """
    if "://" in profile:
        return profile
    return f"{TMDB_IMAGE_BASE}/{profile.lstrip('/')}"


def _fetch_identities(cur, schema, identity_ids):
    """Select ``(id, name, tmdb_profile)`` rows to process.

    With ``identity_ids`` only those ids are selected; otherwise every
    identity whose source is 'tmdb'. ``schema`` must already be validated.
    """
    if identity_ids:
        placeholders = ",".join(["%s"] * len(identity_ids))
        cur.execute(
            f"SELECT id, name, tmdb_profile FROM {schema}.identities "
            f"WHERE id IN ({placeholders}) AND tmdb_profile IS NOT NULL",
            identity_ids
        )
    else:
        cur.execute(
            f"SELECT id, name, tmdb_profile FROM {schema}.identities "
            f"WHERE source = 'tmdb' AND tmdb_profile IS NOT NULL "
        )
    return cur.fetchall()


def _update_embeddings(cur, schema, rows, detector, coreml):
    """Download, embed and store each row's face; return the success count."""
    success = 0
    for identity_id, name, profile_url in rows:
        if not profile_url:
            continue

        print(f"[TMDB-EMBED] Processing: {name} (id={identity_id})", file=sys.stderr)

        image_bytes = download_image(_resolve_profile_url(profile_url))
        if image_bytes is None:
            continue

        embedding = extract_embedding_v2(detector, coreml, image_bytes)
        if embedding is None:
            print(f"[TMDB-EMBED] No face detected for: {name}", file=sys.stderr)
            continue

        cur.execute(
            f"UPDATE {schema}.identities SET face_embedding = %s WHERE id = %s",
            (embedding, identity_id)
        )
        success += 1
        print(f"[TMDB-EMBED] ✓ Updated {name} (CoreML FaceNet, dim={len(embedding)})", file=sys.stderr)
    return success


def main():
    """CLI entry point: embed TMDb profile photos and store them in the DB.

    Exits with status 1 when a required model or dependency is unavailable,
    and with status 2 (argparse) on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Extract TMDB face embeddings V2.0")
    parser.add_argument("--schema", default="dev", type=_schema_name, help="DB schema")
    parser.add_argument("--identity-ids", nargs="*", type=int)
    args = parser.parse_args()

    if cv2 is None:
        print("[TMDB-EMBED] opencv-python (cv2) not installed, cannot decode images", file=sys.stderr)
        sys.exit(1)

    # Load models
    coreml = load_coreml_facenet()
    if coreml is None:
        sys.exit(1)

    detector = load_insightface_detector()
    if detector is None:
        sys.exit(1)

    conn = get_db_connection(args.schema)
    try:
        cur = conn.cursor()
        try:
            rows = _fetch_identities(cur, args.schema, args.identity_ids)
            if not rows:
                print("[TMDB-EMBED] No identities to process", file=sys.stderr)
                return

            print(f"[TMDB-EMBED] Processing {len(rows)} identities with CoreML FaceNet V2.0", file=sys.stderr)
            success = _update_embeddings(cur, args.schema, rows, detector, coreml)
        finally:
            # Release DB resources on every exit path, including the early
            # return above and any exception.
            cur.close()
    finally:
        conn.close()

    print(f"[TMDB-EMBED] Complete: {success}/{len(rows)} embeddings extracted (CoreML FaceNet V2.0)", file=sys.stderr)


if __name__ == "__main__":
    main()