# momentry_core/scripts/face_clustering_processor.py

#!/opt/homebrew/bin/python3.11
"""
Face Clustering Processor
Responsibility: aggregate short-lived Face IDs into persistent Person IDs and automatically bind Speakers.
"""

import cv2
import json
import numpy as np
import os
import sys
import psycopg2
from sklearn.cluster import AgglomerativeClustering

# Put this script's own directory first on the import path.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Use FaceNet embeddings from face.json instead of DeepFace
# NOTE(review): HAS_DEEPFACE is not read by any code shown in this file, and
# main() loads the vectors from Qdrant (keyed by face_id) rather than from
# face.json -- confirm whether this flag and message are still accurate.
HAS_DEEPFACE = False
print("[FACE_CLUSTER] Using FaceNet embeddings from face.json (DeepFace not required)")

# Settings
# All paths are derived from two environment variables:
#   UUID                -- identifier of the video being processed
#   MOMENTRY_OUTPUT_DIR -- root directory holding one sub-directory per UUID
UUID = os.getenv("UUID", "quick_preview")
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
# NOTE(review): VIDEO_PATH is not referenced by the functions shown in this file.
VIDEO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.mp4")
# Input: per-frame face detections (read by main()).
FACE_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face.json")
# Output: the same structure with a "person_id" added to clustered faces.
OUTPUT_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face_clustered.json")
# Input: speech segments with speaker ids (read by auto_bind_speakers()).
ASRX_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.asrx.json")
# PostgreSQL connection string used when writing speaker bindings.
DB_URL = os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")


def optimized_clustering(
    embeddings,
    sample_size=5000,
    distance_threshold=0.4,
    batch_size=5000,
    random_state=None,
):
    """
    Approximate clustering for large datasets (e.g. 25k faces).

    Strategy: Sample -> Agglomerative -> Centroid Assignment. A random subset
    is clustered with average-linkage agglomerative clustering on cosine
    distance; the mean embedding of every resulting cluster becomes a
    centroid, and every face (sampled or not) is then assigned to the
    centroid with the smallest cosine distance.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        sample_size: Maximum number of rows used to discover the clusters.
        distance_threshold: Cosine-distance cut-off passed to
            ``AgglomerativeClustering``.
        batch_size: Number of rows assigned to centroids per step; bounds the
            size of the temporary distance matrix.
        random_state: Optional integer seed for reproducible sampling. When
            ``None`` the global NumPy random state is used.

    Returns:
        Integer ``numpy.ndarray`` of length ``len(embeddings)``. Values are
        centroid indices in ``0 .. n_clusters - 1``.
    """
    # Accept plain lists too: the fancy indexing below needs an ndarray.
    embeddings = np.asarray(embeddings)
    n_faces = len(embeddings)
    print(f"   🚀 Starting optimized clustering for {n_faces} faces...")

    # AgglomerativeClustering rejects inputs with fewer than two samples, so
    # zero or one face trivially forms (at most) a single cluster.
    if n_faces < 2:
        return np.zeros(n_faces, dtype=int)

    # Imported lazily so the trivial path above needs only NumPy.
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics.pairwise import cosine_distances

    # 1. Sampling (never fewer than the two rows the clusterer requires).
    sample_size = max(2, min(int(sample_size), n_faces))
    if n_faces > sample_size:
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        sample_idx = rng.choice(n_faces, sample_size, replace=False)
        sample_embeddings = embeddings[sample_idx]
    else:
        sample_embeddings = embeddings

    print(f"   📊 Sampling {len(sample_embeddings)} faces for clustering structure...")

    # 2. Agglomerative clustering on the sample.
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        metric="cosine",
        linkage="average",
    )
    sample_labels = clustering.fit_predict(sample_embeddings)

    # Sorted, so centroid index i always corresponds to the i-th smallest
    # sample label regardless of set/hash ordering.
    cluster_ids = np.unique(sample_labels)
    n_clusters = len(cluster_ids)
    print(f"   🔍 Found {n_clusters} unique clusters in sample.")

    # 3. One centroid (mean embedding) per cluster -> shape (n_clusters, dim).
    centroids = np.array(
        [sample_embeddings[sample_labels == cid].mean(axis=0) for cid in cluster_ids]
    )

    # 4. Assign all faces to the nearest centroid, in batches to cap memory.
    print(f"   🏃 Assigning {n_faces} faces to {n_clusters} clusters...")
    all_labels = np.zeros(n_faces, dtype=int)
    batch_size = max(1, int(batch_size))
    for start in range(0, n_faces, batch_size):
        end = min(start + batch_size, n_faces)
        dists = cosine_distances(embeddings[start:end], centroids)
        all_labels[start:end] = np.argmin(dists, axis=1)

    return all_labels


def _load_embeddings_from_qdrant(
    file_uuid,
    qdrant_url="http://localhost:6333",
    collection="_faces",
    page_size=10000,
    timeout=30,
):
    """Return a ``face_id -> vector`` map of all Qdrant points for *file_uuid*.

    Pages through the scroll endpoint until ``next_page_offset`` is absent, so
    files with more than ``page_size`` faces are not truncated. Loading is
    best-effort: on any failure (missing ``requests``, network error, non-200
    response, malformed body) a message is printed and ``{}`` is returned.
    """
    print(f"[FACE_CLUSTER] Loading embeddings from Qdrant for {file_uuid}...")
    embedding_map = {}
    try:
        import requests

        scroll_url = f"{qdrant_url}/collections/{collection}/points/scroll"
        offset = None
        n_points = 0
        while True:
            body = {
                "filter": {
                    "must": [{"key": "file_uuid", "match": {"value": file_uuid}}]
                },
                "limit": page_size,
                "with_vector": True,
            }
            if offset is not None:
                body["offset"] = offset

            # A timeout keeps an unresponsive Qdrant from hanging the job.
            response = requests.post(scroll_url, json=body, timeout=timeout)
            if response.status_code != 200:
                print(f"[FACE_CLUSTER] Qdrant query failed: {response.status_code}")
                return {}

            result = response.json().get("result", {})
            points = result.get("points", [])
            n_points += len(points)
            for point in points:
                face_id = (point.get("payload") or {}).get("face_id")
                vector = point.get("vector")
                # `is not None` so that a legitimate face_id of 0 is kept.
                if face_id is not None and vector:
                    embedding_map[face_id] = vector

            offset = result.get("next_page_offset")
            if offset is None or not points:
                break

        print(f"[FACE_CLUSTER] Loaded {n_points} embeddings from Qdrant")
    except Exception as e:  # best-effort boundary: report and fall back to empty
        print(f"[FACE_CLUSTER] Failed to load embeddings from Qdrant: {e}")
        return {}
    return embedding_map


def _cluster_embeddings(embeddings, direct_limit=10000):
    """Return one integer cluster label per row of *embeddings*.

    Up to ``direct_limit`` rows are clustered directly; larger inputs go
    through ``optimized_clustering`` (sample + nearest-centroid assignment).
    """
    n_faces = len(embeddings)
    if n_faces == 1:
        # AgglomerativeClustering requires at least two samples.
        return np.zeros(1, dtype=int)
    if n_faces > direct_limit:
        return optimized_clustering(embeddings)

    clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=0.4, metric="cosine", linkage="average"
    )
    return clustering.fit_predict(embeddings)


def main():
    """Cluster the faces of the configured video into persons and save the result.

    Steps:
      1. Load the per-frame detections from ``FACE_JSON_PATH``.
      2. Fetch the embedding of every face from Qdrant (matched by ``face_id``).
      3. Cluster the embeddings; each cluster becomes ``Person_<n>``.
      4. Write the detections, with ``person_id`` added to every face that had
         an embedding, to ``OUTPUT_JSON_PATH``.
      5. Call ``auto_bind_speakers()``.

    Returns ``None``; problems are reported on stdout and end the run early.
    """
    if not os.path.exists(FACE_JSON_PATH):
        print("❌ Face JSON not found.")
        return

    # Explicit UTF-8: the clustered output below is written as UTF-8 too.
    with open(FACE_JSON_PATH, encoding="utf-8") as f:
        face_data = json.load(f)

    frames_list = face_data.get("frames", [])
    if not frames_list:
        print("❌ No frames in JSON.")
        return

    embedding_map = _load_embeddings_from_qdrant(UUID)

    # 1. Pair every detected face with its stored embedding.
    print(f"🔍 Collecting face embeddings for {UUID}...")
    embeddings = []
    matched_faces = []  # face dicts, aligned index-for-index with `embeddings`
    for frame_obj in frames_list:
        for face in frame_obj.get("faces") or []:
            face_id = face.get("face_id")
            if face_id is not None and face_id in embedding_map:
                embeddings.append(embedding_map[face_id])
                matched_faces.append(face)

    if not embeddings:
        print("❌ No embeddings found in Qdrant.")
        return

    embeddings = np.array(embeddings)
    print(f"✅ Collected {len(embeddings)} face embeddings from Qdrant.")

    # 2. Clustering
    print(f"🧠 Clustering {len(embeddings)} faces...")
    labels = _cluster_embeddings(embeddings)

    # Sorted so that Person numbering is deterministic for a given labelling.
    unique_labels = sorted({int(label) for label in labels})
    label_to_person = {l: f"Person_{i}" for i, l in enumerate(unique_labels)}
    print(
        f"👥 Detected {len(unique_labels)} unique persons: {[label_to_person[l] for l in unique_labels]}"
    )

    # 3. Update the JSON in place (face dicts are shared with `face_data`).
    for face, label in zip(matched_faces, labels):
        face["person_id"] = label_to_person[int(label)]

    # Save
    with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(face_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved clustered data to {OUTPUT_JSON_PATH}")

    # 4. Automatically bind speakers to the detected persons.
    auto_bind_speakers()


def _vote_speaker_persons(face_clustered, asrx_data):
    """Return ``{speaker_id: {person_id: segments_won}}`` from time overlap.

    For each speech segment, the person seen most often in frames whose
    timestamp lies within ``[start, end]`` wins one vote for that speaker.
    """
    from bisect import bisect_left, bisect_right
    from collections import Counter

    # (timestamp, person_id) for every clustered face, ordered by time so each
    # segment can be answered with two binary searches instead of a full scan.
    sightings = []
    for frame_obj in face_clustered.get("frames", []):
        ts = frame_obj.get("timestamp")
        if ts is None:
            continue
        for face in frame_obj.get("faces") or []:
            person_id = face.get("person_id")
            if person_id:
                sightings.append((ts, person_id))
    sightings.sort(key=lambda sighting: sighting[0])  # stable: keeps frame order on ties
    timestamps = [ts for ts, _ in sightings]

    speaker_person_counts = {}
    for seg in asrx_data.get("segments", []):
        speaker = seg.get("speaker_id")
        start = seg.get("start")
        end = seg.get("end")
        # Segments without a speaker or without both bounds cannot be matched
        # (comparing None with a number would raise TypeError).
        if not speaker or start is None or end is None:
            continue

        lo = bisect_left(timestamps, start)
        hi = bisect_right(timestamps, end)
        if lo >= hi:
            continue

        on_screen = Counter(pid for _, pid in sightings[lo:hi])
        best_person = max(on_screen, key=on_screen.get)
        per_speaker = speaker_person_counts.setdefault(speaker, {})
        per_speaker[best_person] = per_speaker.get(best_person, 0) + 1

    return speaker_person_counts


def _persist_speaker_bindings(speaker_person_counts):
    """Upsert one talent + speaker binding per speaker, in one transaction."""
    conn = None
    try:
        conn = psycopg2.connect(DB_URL)
        # `with conn` commits on success and rolls back on error; the cursor
        # context closes the cursor. Neither closes the connection (see finally).
        with conn, conn.cursor() as cur:
            for speaker, persons in speaker_person_counts.items():
                if not persons:
                    continue
                best_person = max(persons, key=persons.get)
                print(
                    f"   🎤 {speaker} is likely {best_person} ({persons[best_person]} votes)"
                )

                # 1. Find or create the Talent
                cur.execute(
                    "SELECT id FROM talents WHERE real_name = %s", (best_person,)
                )
                row = cur.fetchone()

                if row:
                    talent_id = row[0]
                else:
                    cur.execute(
                        "INSERT INTO talents (real_name) VALUES (%s) RETURNING id",
                        (best_person,),
                    )
                    talent_id = cur.fetchone()[0]
                    print(f"   ✨ Created Talent #{talent_id} ({best_person})")

                # 2. Bind the Speaker
                cur.execute(
                    """
                INSERT INTO identity_bindings (talent_id, binding_type, binding_value, source, confidence)
                VALUES (%s, 'speaker', %s, 'auto_cluster', 0.8)
                ON CONFLICT (binding_type, binding_value) DO UPDATE SET talent_id = EXCLUDED.talent_id
            """,
                    (talent_id, speaker),
                )
                print(f"   ✅ Bound {speaker} -> {best_person}")
    except Exception as e:  # best-effort: binding failures must not abort the run
        print(f"   ❌ DB Error: {e}")
    finally:
        if conn is not None:
            conn.close()


def auto_bind_speakers():
    """Bind each ASR speaker to the person most often on screen while they talk.

    Reads the clustered faces (``OUTPUT_JSON_PATH``) and the speech segments
    (``ASRX_JSON_PATH``), votes per segment, then writes the winning person of
    every speaker to the ``talents`` / ``identity_bindings`` tables.

    Returns ``None``. Missing input files and database errors are reported on
    stdout rather than raised.
    """
    if not os.path.exists(OUTPUT_JSON_PATH) or not os.path.exists(ASRX_JSON_PATH):
        print("⚠️ Missing data for speaker binding.")
        return

    # The clustered file is written as UTF-8 with non-ASCII characters kept,
    # so it must be read back as UTF-8 regardless of the locale.
    with open(OUTPUT_JSON_PATH, encoding="utf-8") as f:
        face_clustered = json.load(f)
    with open(ASRX_JSON_PATH, encoding="utf-8") as f:
        asrx_data = json.load(f)

    print("🔗 Auto-binding Speakers to Persons...")

    speaker_person_counts = _vote_speaker_persons(face_clustered, asrx_data)
    if not speaker_person_counts:
        # Nothing to write: skip opening a database connection altogether.
        print("   ℹ️ No speaker/person overlaps found; nothing to bind.")
        return

    # Write to the database
    _persist_speaker_bindings(speaker_person_counts)


# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()