#!/opt/homebrew/bin/python3.11
"""
TMDb Face Embedding Extractor V2.0

Reads TMDb-sourced identities from the database, downloads their profile
photos, and extracts embeddings using CoreML FaceNet (the same model as the
Face V2.0 pipeline). InsightFace is still used, but for face detection only.

V2.0 change: replaced the InsightFace embedding with CoreML FaceNet
for embedding-space compatibility with face_processor.py V2.0.
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import urllib.request
|
|
import tempfile
|
|
import argparse
|
|
import numpy as np
|
|
import cv2
|
|
|
|
# Put this script's own directory first on the import path so that modules
# sitting next to it take precedence when imported.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Base URL for TMDb images in the "w185" size variant.
# NOTE(review): not referenced anywhere in the visible part of this file.
TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w185"
|
|
|
|
|
|
def get_db_connection(schema="dev"):
    """Open an autocommit PostgreSQL connection.

    The connection string is read from the ``DATABASE_URL`` environment
    variable and falls back to a local ``momentry`` database when the
    variable is unset.

    Args:
        schema: Accepted for call-site compatibility but currently unused.
            The connection is not bound to any schema, so callers must
            schema-qualify table names themselves.

    Returns:
        An open ``psycopg2`` connection with ``autocommit`` enabled. The
        caller is responsible for closing it.

    Raises:
        ImportError: If ``psycopg2`` is not installed.
        psycopg2.Error: If the connection cannot be established.
    """
    # Imported lazily so that importing this module does not require psycopg2.
    import psycopg2

    db_url = os.getenv("DATABASE_URL", "postgres://accusys@localhost:5432/momentry")
    conn = psycopg2.connect(db_url)
    # Every statement is committed immediately; no explicit commit() is needed.
    conn.autocommit = True
    return conn
|
|
|
|
|
|
def load_coreml_facenet(model_path=None):
    """Load the CoreML FaceNet model (same as the Face V2.0 pipeline).

    Args:
        model_path: Optional path to the ``.mlpackage`` to load. When omitted,
            ``../models/facenet512.mlpackage`` relative to this script's
            directory is used.

    Returns:
        The loaded ``coremltools`` ``MLModel``, or ``None`` when
        ``coremltools`` is not installed, the model path does not exist, or
        loading fails. Every failure is reported on stderr.
    """
    # Imported lazily: coremltools is optional and heavy.
    try:
        import coremltools as ct
    except ImportError:
        print("[TMDB-EMBED] coremltools not installed, cannot extract embeddings", file=sys.stderr)
        return None

    if model_path is None:
        model_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..", "models", "facenet512.mlpackage",
        )
    # Collapse the ".." segment so log messages show a clean path.
    model_path = os.path.normpath(model_path)

    if not os.path.exists(model_path):
        print(f"[TMDB-EMBED] CoreML model not found: {model_path}", file=sys.stderr)
        return None

    try:
        model = ct.models.MLModel(model_path)
    except Exception as e:
        # Loader boundary: any load failure is reported and turned into None.
        print(f"[TMDB-EMBED] Failed to load CoreML model: {e}", file=sys.stderr)
        return None

    print(f"[TMDB-EMBED] CoreML FaceNet loaded: {model_path}", file=sys.stderr)
    return model
|
|
|
|
|
|
def load_insightface_detector():
    """Load InsightFace for face detection only (not embedding).

    Builds a ``FaceAnalysis`` app from the ``buffalo_l`` model pack on the
    CPU execution provider and prepares it with a detection threshold of 0.5.

    Returns:
        The prepared ``FaceAnalysis`` instance, or ``None`` when
        ``insightface`` is not installed or the app cannot be constructed or
        prepared. Every failure is reported on stderr.
    """
    # Imported lazily: insightface is optional and heavy.
    try:
        from insightface.app import FaceAnalysis
    except ImportError:
        print("[TMDB-EMBED] insightface not installed, cannot detect faces", file=sys.stderr)
        return None

    try:
        app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
        app.prepare(ctx_id=0, det_thresh=0.5)
    except Exception as e:
        # Loader boundary: mirror load_coreml_facenet() and report the failure
        # as None instead of letting a traceback escape.
        print(f"[TMDB-EMBED] Failed to load InsightFace detector: {e}", file=sys.stderr)
        return None
    return app
|
|
|
|
|
|
def download_image(url, timeout=15):
    """Fetch *url* and return the raw response body.

    Args:
        url: Absolute URL of the image to download. ``urlopen`` also accepts
            non-HTTP schemes such as ``file://``, so the value must come from
            a trusted source.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body as ``bytes``, or ``None`` when *url* is empty or
        the download fails for any reason. Failures are reported on stderr.
    """
    if not url:
        print("[TMDB-EMBED] Failed to download: empty URL", file=sys.stderr)
        return None

    try:
        # A browser-like User-Agent is sent instead of urllib's default one.
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except Exception as e:
        # Best-effort download: one bad URL must not abort the whole batch.
        print(f"[TMDB-EMBED] Failed to download {url}: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def extract_embedding_v2(detector, coreml_model, image_bytes):
    """Detect a face with InsightFace and embed it with CoreML FaceNet.

    Args:
        detector: Object whose ``get(img)`` returns detected faces, each
            exposing a ``bbox`` of ``(x1, y1, x2, y2)``.
        coreml_model: Object whose ``predict({"input": array})`` returns a
            mapping of output names to arrays.
        image_bytes: Encoded image data (any format ``cv2.imdecode`` reads).

    Returns:
        The embedding as a flat list of floats, or ``None`` when the input is
        empty, the image cannot be decoded, no usable face is found, the
        model output cannot be identified, or any step raises. Errors are
        reported on stderr.
    """
    try:
        if image_bytes is None or len(image_bytes) == 0:
            return None

        pixels = np.frombuffer(image_bytes, np.uint8)
        img = cv2.imdecode(pixels, cv2.IMREAD_COLOR)
        if img is None:
            return None

        # Step 1: detect faces using InsightFace.
        faces = detector.get(img)
        if not faces:
            return None

        # Use the face with the largest bounding-box area.
        best = max(faces, key=lambda f: (f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1]))
        x1, y1, x2, y2 = (int(v) for v in best.bbox)

        # Clamp the box to the image so the crop never indexes out of bounds.
        height, width = img.shape[:2]
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(width, x2), min(height, y2)
        if x2 <= x1 or y2 <= y1:
            return None

        # Step 2: crop the face and resize to 160x160.
        face_img = cv2.resize(img[y1:y2, x1:x2], (160, 160))

        # Step 3: CoreML FaceNet embedding.
        # Scale pixel values to [-1, 1], reorder HWC -> CHW, and add a batch
        # axis, giving a (1, 3, 160, 160) float32 array.
        # NOTE(review): cv2.imdecode yields BGR channel order and no BGR->RGB
        # conversion is done here -- confirm this matches the preprocessing
        # in face_processor.py so both share one embedding space.
        normalized = (face_img.astype(np.float32) / 127.5) - 1.0
        input_array = np.expand_dims(np.transpose(normalized, (2, 0, 1)), axis=0)

        result = coreml_model.predict({"input": input_array})

        # The embedding output is looked up by its "var_" name prefix; fall
        # back to the sole output when the model exposes exactly one.
        emb_key = next((k for k in result if k.startswith("var_")), None)
        if emb_key is None:
            if len(result) != 1:
                print(
                    f"[TMDB-EMBED] Face extraction failed: no 'var_*' output among {sorted(result)}",
                    file=sys.stderr,
                )
                return None
            emb_key = next(iter(result))

        return result[emb_key].flatten().tolist()

    except Exception as e:
        # Best-effort per image: report and let the caller skip this identity.
        print(f"[TMDB-EMBED] Face extraction failed: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def main():
    """Command-line entry point: compute and store TMDb face embeddings.

    Parses ``--schema`` and ``--identity-ids``, loads the CoreML FaceNet
    model and the InsightFace detector, then, for each selected row of
    ``<schema>.identities``, downloads ``tmdb_profile``, extracts an
    embedding and writes it to ``face_embedding``.

    Without ``--identity-ids`` every row with ``source = 'tmdb'`` and a
    non-NULL ``tmdb_profile`` is processed. Rows whose download or face
    extraction fails are skipped.

    Exits with status 2 on an invalid ``--schema`` and status 1 when either
    model cannot be loaded.
    """
    # Local import: `re` is not among this file's top-level imports.
    import re

    parser = argparse.ArgumentParser(description="Extract TMDB face embeddings V2.0")
    parser.add_argument("--schema", default="dev", help="DB schema")
    parser.add_argument(
        "--identity-ids", nargs="*", type=int,
        help="Identity ids to process (default: all TMDb-sourced identities)",
    )
    args = parser.parse_args()

    # The schema name is interpolated into SQL text below (identifiers cannot
    # be bound as query parameters), so only a plain unquoted identifier is
    # accepted. Checked before the expensive model loading.
    if not re.fullmatch(r"[^\W\d][\w$]*", args.schema):
        parser.error(f"invalid schema name: {args.schema!r}")

    # Load models
    coreml = load_coreml_facenet()
    if coreml is None:
        sys.exit(1)

    detector = load_insightface_detector()
    if detector is None:
        sys.exit(1)

    conn = get_db_connection(args.schema)
    try:
        with conn.cursor() as cur:
            # Fetch identities
            if args.identity_ids:
                placeholders = ",".join(["%s"] * len(args.identity_ids))
                cur.execute(
                    f"SELECT id, name, tmdb_profile FROM {args.schema}.identities "
                    f"WHERE id IN ({placeholders}) AND tmdb_profile IS NOT NULL",
                    args.identity_ids,
                )
            else:
                cur.execute(
                    f"SELECT id, name, tmdb_profile FROM {args.schema}.identities "
                    f"WHERE source = 'tmdb' AND tmdb_profile IS NOT NULL "
                )

            rows = cur.fetchall()
            if not rows:
                print("[TMDB-EMBED] No identities to process", file=sys.stderr)
                return

            print(f"[TMDB-EMBED] Processing {len(rows)} identities with CoreML FaceNet V2.0", file=sys.stderr)
            success = 0

            for identity_id, name, profile_url in rows:
                # NULLs are filtered in SQL; this also skips empty strings.
                if not profile_url:
                    continue

                print(f"[TMDB-EMBED] Processing: {name} (id={identity_id})", file=sys.stderr)
                image_bytes = download_image(profile_url)
                if image_bytes is None:
                    continue

                embedding = extract_embedding_v2(detector, coreml, image_bytes)
                if embedding is None:
                    print(f"[TMDB-EMBED] No face detected for: {name}", file=sys.stderr)
                    continue

                cur.execute(
                    f"UPDATE {args.schema}.identities SET face_embedding = %s WHERE id = %s",
                    (embedding, identity_id),
                )
                success += 1
                print(f"[TMDB-EMBED] ✓ Updated {name} (CoreML FaceNet, dim={len(embedding)})", file=sys.stderr)
    finally:
        # Release the connection on every path, including the early return
        # above and any exception raised while processing.
        conn.close()

    print(f"[TMDB-EMBED] Complete: {success}/{len(rows)} embeddings extracted (CoreML FaceNet V2.0)", file=sys.stderr)
|
|
|
|
|
|
# Run the extractor only when executed as a script, and only once: a second
# identical guard would execute the whole pipeline a second time.
if __name__ == "__main__":
    main()
|