Files
momentry_core/scripts/face_clustering_processor.py
Accusys 3eabd45882 fix: ASRX duplication, TKG edges, trace ingest, and add pipeline progress publishing
- ASRX handler no longer stores duplicate 'asr' pre_chunks
- Pre_chunks storage made idempotent (delete-before-insert)
- Rule 1 + trace_ingest changed to query 'asrx' not 'asr'
- Trace chunks removed (dynamic from TKG/Qdrant)
- TKG scroll_face_points fixed: trace_id >= 1 (not == 1)
- TKG AsrxSegmentEntry: start/end -> start_time/end_time (match ASRX JSON)
- Unregister error handling: log instead of silent discard
- Add publish_pipeline_progress calls at each pipeline stage
  (processors, rule1, face_trace, identity_agent, TKG, rule2, completion)
2026-07-02 10:43:46 +08:00

295 lines
9.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/opt/homebrew/bin/python3.11
"""
Face Clustering Processor
Responsibility: aggregate short-lived Face IDs into persistent Person IDs and automatically bind Speakers.
"""
import cv2
import json
import numpy as np
import os
import sys
import psycopg2
from sklearn.cluster import AgglomerativeClustering
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# DeepFace is not used by this processor; precomputed FaceNet embeddings are
# used instead, so the flag is permanently off.
HAS_DEEPFACE = False
print("[FACE_CLUSTER] Using FaceNet embeddings from face.json (DeepFace not required)")

# Configuration: every value comes from the environment, with a local fallback.
UUID = os.environ.get("UUID", "quick_preview")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")

# All per-video artifacts live in <OUTPUT_DIR>/<UUID>/ and are named <UUID>.<suffix>.
_RUN_DIR = os.path.join(OUTPUT_DIR, UUID)
VIDEO_PATH = os.path.join(_RUN_DIR, UUID + ".mp4")
FACE_JSON_PATH = os.path.join(_RUN_DIR, UUID + ".face.json")
OUTPUT_JSON_PATH = os.path.join(_RUN_DIR, UUID + ".face_clustered.json")
ASRX_JSON_PATH = os.path.join(_RUN_DIR, UUID + ".asrx.json")

DB_URL = os.environ.get(
    "DATABASE_URL", "postgresql://accusys@localhost:5432/momentry"
)
def optimized_clustering(
    embeddings, sample_size=5000, distance_threshold=0.4, batch_size=5000
):
    """
    Cluster a large set of face embeddings (e.g. 25k faces).

    Strategy: sample -> agglomerative clustering on the sample -> assign
    every face to the nearest cluster centroid.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        sample_size: Maximum number of faces used to discover the cluster
            structure. Values below 2 are raised to 2 because agglomerative
            clustering needs at least two samples.
        distance_threshold: Cosine-distance threshold at which clusters stop
            being merged.
        batch_size: Number of faces assigned to centroids per batch.

    Returns:
        A 1-D integer ``np.ndarray`` with one entry per input face. Each
        entry is the index of the nearest centroid; centroids are ordered by
        ascending sample cluster label. Zero or one input faces yield an
        all-zero array of matching length without running any clustering.
    """
    import numpy as np

    # Accept plain lists as well as arrays: fancy indexing below needs ndarray.
    embeddings = np.asarray(embeddings)
    n_faces = len(embeddings)

    # Agglomerative clustering rejects fewer than 2 samples; the answer is
    # trivial in that case, so return before the heavy sklearn import.
    if n_faces < 2:
        return np.zeros(n_faces, dtype=int)

    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics.pairwise import cosine_distances

    print(f" 🚀 Starting optimized clustering for {n_faces} faces...")

    # 1. Sampling
    sample_size = max(2, min(sample_size, n_faces))
    if n_faces > sample_size:
        indices = np.random.choice(n_faces, sample_size, replace=False)
        sample_embeddings = embeddings[indices]
    else:
        sample_embeddings = embeddings
    print(f" 📊 Sampling {len(sample_embeddings)} faces for clustering structure...")

    # 2. Agglomerative clustering on the sample
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        metric="cosine",
        linkage="average",
    )
    sample_labels = clustering.fit_predict(sample_embeddings)
    # np.unique returns sorted labels, giving a deterministic centroid order
    # (iterating a Python set does not guarantee one).
    unique_labels = np.unique(sample_labels)
    n_clusters = len(unique_labels)
    print(f" 🔍 Found {n_clusters} unique clusters in sample.")

    # 3. One centroid (mean embedding) per sample cluster
    centroids = np.array(
        [sample_embeddings[sample_labels == label].mean(axis=0) for label in unique_labels]
    )

    # 4. Assign all faces to the nearest centroid, in batches to bound the
    #    size of the distance matrix held in memory at once.
    print(f" 🏃 Assigning {n_faces} faces to {n_clusters} clusters...")
    all_labels = np.zeros(n_faces, dtype=int)
    batch_size = max(1, batch_size)
    for start in range(0, n_faces, batch_size):
        end = min(start + batch_size, n_faces)
        dists = cosine_distances(embeddings[start:end], centroids)
        all_labels[start:end] = np.argmin(dists, axis=1)
    return all_labels
def _fetch_qdrant_embeddings(file_uuid, page_size=10000, timeout=30):
    """Return a ``{face_id: vector}`` map of the Qdrant face points of one file.

    Pages through the ``_faces`` collection with the scroll endpoint until the
    response carries no ``next_page_offset``, so files with more than
    ``page_size`` faces are read completely. Any HTTP or network failure is
    logged and yields an empty map (all-or-nothing), never a partial one.
    """
    embedding_map = {}
    total_points = 0
    try:
        import requests

        qdrant_url = "http://localhost:6333"
        collection = "_faces"
        scroll_url = f"{qdrant_url}/collections/{collection}/points/scroll"
        offset = None
        while True:
            body = {
                "filter": {
                    "must": [{"key": "file_uuid", "match": {"value": file_uuid}}]
                },
                "limit": page_size,
                "with_vector": True,
            }
            if offset is not None:
                body["offset"] = offset
            # A timeout keeps the pipeline from hanging on an unresponsive Qdrant.
            response = requests.post(scroll_url, json=body, timeout=timeout)
            if response.status_code != 200:
                print(f"[FACE_CLUSTER] Qdrant query failed: {response.status_code}")
                return {}
            result = response.json().get("result") or {}
            points = result.get("points") or []
            total_points += len(points)
            for point in points:
                face_id = (point.get("payload") or {}).get("face_id")
                vector = point.get("vector")
                # Only plain list vectors can be stacked into a matrix later.
                if face_id and isinstance(vector, list) and vector:
                    embedding_map[face_id] = vector
            offset = result.get("next_page_offset")
            if offset is None:
                break
    except Exception as e:
        print(f"[FACE_CLUSTER] Failed to load embeddings from Qdrant: {e}")
        return {}
    print(f"[FACE_CLUSTER] Loaded {total_points} embeddings from Qdrant")
    return embedding_map


def main():
    """Cluster the faces of the configured video into persons and bind speakers.

    Steps:
      1. Load the frame/face list from FACE_JSON_PATH.
      2. Fetch the embedding of every face of this UUID from Qdrant.
      3. Cluster the embeddings (agglomerative, cosine distance, threshold 0.4)
         and tag each clustered face with a ``person_id`` of the form
         ``Person_<n>``.
      4. Write the annotated data to OUTPUT_JSON_PATH.
      5. Call auto_bind_speakers().

    Returns early, after printing a message, when the face JSON is missing,
    has no frames, or no face has an embedding in Qdrant. Faces without an
    embedding are left without a ``person_id``.
    """
    if not os.path.exists(FACE_JSON_PATH):
        print("❌ Face JSON not found.")
        return
    # Read as UTF-8 to match how the clustered JSON is written below.
    with open(FACE_JSON_PATH, encoding="utf-8") as f:
        face_data = json.load(f)
    frames_list = face_data.get("frames", [])
    if not frames_list:
        print("❌ No frames in JSON.")
        return

    # 1. Get embeddings from Qdrant
    print(f"[FACE_CLUSTER] Loading embeddings from Qdrant for {UUID}...")
    embedding_map = _fetch_qdrant_embeddings(UUID)

    # Pair each face dict with its embedding, in frame order.
    embeddings = []
    clustered_faces = []
    print(f"🔍 Collecting face embeddings for {UUID}...")
    for frame_obj in frames_list:
        for face in frame_obj.get("faces") or []:
            face_id = face.get("face_id")
            if face_id and face_id in embedding_map:
                embeddings.append(embedding_map[face_id])
                clustered_faces.append(face)
    if not embeddings:
        print("❌ No embeddings found in Qdrant.")
        return
    embeddings = np.array(embeddings)
    print(f"✅ Collected {len(embeddings)} face embeddings from Qdrant.")

    # 2. Clustering
    print(f"🧠 Clustering {len(embeddings)} faces...")
    if len(embeddings) < 2:
        # Agglomerative clustering needs at least two samples; a single face
        # is trivially its own person.
        labels = [0] * len(embeddings)
    else:
        clustering = AgglomerativeClustering(
            n_clusters=None, distance_threshold=0.4, metric="cosine", linkage="average"
        )
        labels = [int(label) for label in clustering.fit_predict(embeddings)]
    # Sort so that Person_<n> numbering does not depend on set iteration order.
    unique_labels = sorted(set(labels))
    label_to_person = {label: f"Person_{i}" for i, label in enumerate(unique_labels)}
    print(
        f"👥 Detected {len(unique_labels)} unique persons: {[label_to_person[l] for l in unique_labels]}"
    )

    # 3. Update JSON (the face dicts are the same objects held by face_data)
    for face, label in zip(clustered_faces, labels):
        face["person_id"] = label_to_person[label]

    # Save
    with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(face_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved clustered data to {OUTPUT_JSON_PATH}")

    # 4. Auto-bind speakers
    auto_bind_speakers()
def auto_bind_speakers():
    """Bind each ASRX speaker to the person most often on screen while they talk.

    Reads the clustered face JSON (OUTPUT_JSON_PATH) and the ASRX JSON
    (ASRX_JSON_PATH). For every speech segment, the person with the most face
    detections inside the segment's time range gets one vote for that
    segment's speaker. Each speaker is then bound to its top-voted person:
    a ``talents`` row is looked up (or created) by name, and an
    ``identity_bindings`` row of type 'speaker' is upserted.

    Segment times are read from ``start_time``/``end_time``, falling back to
    ``start``/``end``; segments without a speaker or without both times are
    skipped. Returns early when either input file is missing or when no
    speaker received a vote. Database errors are printed, not raised.
    """
    if not os.path.exists(OUTPUT_JSON_PATH) or not os.path.exists(ASRX_JSON_PATH):
        print("⚠️ Missing data for speaker binding.")
        return
    with open(OUTPUT_JSON_PATH, encoding="utf-8") as f:
        face_clustered = json.load(f)
    with open(ASRX_JSON_PATH, encoding="utf-8") as f:
        asrx_data = json.load(f)
    print("🔗 Auto-binding Speakers to Persons...")

    # Build the list of (timestamp, person) face sightings.
    face_spans = []
    for frame_obj in face_clustered.get("frames", []):
        ts = frame_obj.get("timestamp")
        if ts is None:
            continue
        for face in frame_obj.get("faces", []):
            person_id = face.get("person_id")
            if person_id:
                face_spans.append({"ts": ts, "person_id": person_id})

    # speaker -> {person_id: number of segments won}
    speaker_person_counts = {}
    # For each speech segment, find who is on screen.
    for seg in asrx_data.get("segments", []):
        # Accept both key spellings for segment times; a missing value would
        # otherwise make the range comparison below raise TypeError.
        start = seg.get("start_time", seg.get("start"))
        end = seg.get("end_time", seg.get("end"))
        speaker = seg.get("speaker_id")
        if not speaker or start is None or end is None:
            continue

        # Vote: count sightings per person within the segment's time range.
        person_counts = {}
        for span in face_spans:
            if start <= span["ts"] <= end:
                pid = span["person_id"]
                person_counts[pid] = person_counts.get(pid, 0) + 1
        if not person_counts:
            continue
        best_person = max(person_counts, key=person_counts.get)
        votes = speaker_person_counts.setdefault(speaker, {})
        votes[best_person] = votes.get(best_person, 0) + 1

    if not speaker_person_counts:
        # Nothing to write, so do not open a database connection at all.
        return

    # Write to the database
    conn = None
    try:
        conn = psycopg2.connect(DB_URL)
        cur = conn.cursor()
        for speaker, persons in speaker_person_counts.items():
            if not persons:
                continue
            best_person = max(persons, key=persons.get)
            print(
                f" 🎤 {speaker} is likely {best_person} ({persons[best_person]} votes)"
            )
            # 1. Find or create the Talent
            cur.execute("SELECT id FROM talents WHERE real_name = %s", (best_person,))
            row = cur.fetchone()
            if row:
                talent_id = row[0]
            else:
                cur.execute(
                    "INSERT INTO talents (real_name) VALUES (%s) RETURNING id",
                    (best_person,),
                )
                talent_id = cur.fetchone()[0]
                print(f" ✨ Created Talent #{talent_id} ({best_person})")
            # 2. Bind the Speaker
            cur.execute(
                """
                INSERT INTO identity_bindings (talent_id, binding_type, binding_value, source, confidence)
                VALUES (%s, 'speaker', %s, 'auto_cluster', 0.8)
                ON CONFLICT (binding_type, binding_value) DO UPDATE SET talent_id = EXCLUDED.talent_id
                """,
                (talent_id, speaker),
            )
            print(f" ✅ Bound {speaker} -> {best_person}")
        conn.commit()
        cur.close()
    except Exception as e:
        print(f" ❌ DB Error: {e}")
    finally:
        # Always release the connection; closing without commit discards any
        # uncommitted work from a failed run.
        if conn is not None:
            conn.close()
# Run the clustering pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()