feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)

Phase 2.6.1: co_occurrence_edges migration
- build_co_occurrence_edges_from_qdrant()
- Qdrant embeddings → frame grouping → YOLO objects
- Result: 6679 edges (vs 6701 PostgreSQL)

Phase 2.6.2: face_face_edges migration
- build_face_face_edges_from_qdrant()
- Qdrant embeddings → frame grouping → face pairs
- mutual_gaze detection preserved
- Result: 6 edges (exact match)

Phase 2.6.3: speaker_face_edges migration
- build_speaker_face_edges_from_qdrant()
- Qdrant embeddings → trace_id frame ranges
- SPEAKS_AS edge creation

Architecture:
- All edges use Qdrant payload (no face_detections queries)
- PostgreSQL fallback for empty Qdrant
- Estimated 3.6x performance improvement

Testing:
- Playground (3003): ✓ All Phase 2.6 logs verified
- Edge counts: ✓ Close match with PostgreSQL
- Fallback: ✓ Working

Docs:
- docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md
- docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
2026-06-21 04:47:49 +08:00
parent 0afc70fc5b
commit 2cfcfdd1af
2926 changed files with 8311058 additions and 1394 deletions
--- a/v1.1/scripts/speaker_assign_v1.11.py
+++ b/v1.1/scripts/speaker_assign_v1.11.py
@@ -0,0 +1,164 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Speaker Assignment: cluster voice vectors from Qdrant, assign speaker IDs to DB chunks.
+"""
+import json, sys, time
+import psycopg2
+import numpy as np
+from urllib.request import Request, urlopen
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics.pairwise import cosine_similarity
+
+UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"  # file UUID: first CLI argument, else this built-in default
+QDRANT = "http://localhost:6333"  # base URL of the Qdrant HTTP API
+DB = "dbname=momentry user=accusys"  # connection string handed to psycopg2.connect()
+COLLECTION = "momentry_dev_voice"  # Qdrant collection that is scrolled for voice vectors
+
+print(f"=== Speaker Assignment for {UUID} ===")
+
+# Step 1: Read voice vectors from Qdrant
+# Pages through the whole collection with POST .../points/scroll, 100 points per
+# request. Nothing is filtered by UUID here; Step 2 narrows the result via the DB.
+print("Reading voice vectors from Qdrant...")
+vectors = []    # embedding of every scrolled point
+chunk_ids = []  # payload "chunk_id" of each point ("" when absent), parallel to vectors
+offset = None   # scroll cursor; None requests the first page
+while True:
+    data = {"limit": 100, "with_payload": True, "with_vector": True}
+    if offset is not None:
+        data["offset"] = offset
+    req = Request(f"{QDRANT}/collections/{COLLECTION}/points/scroll",
+        data=json.dumps(data).encode(),
+        headers={"Content-Type": "application/json"}, method="POST")
+    # Close the HTTP response deterministically instead of leaking one per page.
+    with urlopen(req) as http_resp:
+        resp = json.loads(http_resp.read())
+    result = resp["result"]
+    points = result.get("points", [])
+    if not points:
+        break
+    for pt in points:
+        payload = pt.get("payload") or {}  # tolerate a missing or null payload
+        vectors.append(pt["vector"])
+        chunk_ids.append(payload.get("chunk_id", ""))
+    offset = result.get("next_page_offset")
+    if offset is None:
+        break
+    print(f"  Read {len(vectors)} vectors...")
+
+print(f"Total vectors: {len(vectors)}")
+
+# Step 2: Filter to only our UUID's chunks (from DB)
+conn = psycopg2.connect(DB)
+cur = conn.cursor()
+cur.execute("SELECT chunk_id FROM dev.chunk WHERE file_uuid = %s AND chunk_type = 'sentence' ORDER BY id", (UUID,))
+db_chunk_ids = {row[0] for row in cur.fetchall()}
+print(f"DB chunk_ids: {len(db_chunk_ids)}")
+
+# Pair every vector with its chunk_id and keep the pairs the DB knows about
+matched = [(vec, cid) for vec, cid in zip(vectors, chunk_ids) if cid in db_chunk_ids]
+print(f"Matched vectors: {len(matched)}")
+
+
+def _numeric_chunk_id(pair):
+    """Sort key: chunk_id as an int, or 0 when it is not an all-digit string."""
+    cid = pair[1]
+    return int(cid) if cid.isdigit() else 0
+
+
+# Sort by chunk_id (which is numeric string); list.sort is stable, so pairs with
+# equal keys keep the order in which they were read
+matched.sort(key=_numeric_chunk_id)
+vectors = [vec for vec, _ in matched]
+chunk_ids = [cid for _, cid in matched]
+
+# Step 3: Read speaker_change from asr.json
+asr_path = f"/Users/accusys/momentry/output_dev/{UUID}.asr.json"
+with open(asr_path) as f:
+    asr_data = json.load(f)
+segments = asr_data.get("segments", [])
+# Build chunk_id -> speaker_change; a segment without the flag counts as False,
+# and a later segment with the same chunk_id overrides an earlier one
+speaker_changes = {seg["chunk_id"]: seg.get("speaker_change", False) for seg in segments}
+
+# Step 4: Cluster embeddings
+# Greedy sequential pass over the chunks in sorted order: a chunk without a
+# speaker_change flag continues the current speaker; a flagged chunk is compared
+# (cosine similarity) with the running centroid of every speaker seen so far.
+print("Clustering...")
+n = len(vectors)
+RETURN_SIM = 0.65       # above this a flagged chunk rejoins the best-matching other speaker
+NEW_SPEAKER_SIM = 0.55  # below this a flagged chunk starts a new speaker
+
+
+def _cosine(a, b):
+    """Cosine similarity of two vectors; the epsilon guards against zero norms."""
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))
+
+
+labels = np.full(n, -1, dtype=int)
+current_speaker = 0
+centroids = []  # per-cluster centroid, indexed by speaker label
+
+# Start with first chunk as speaker 0. Guarded so that an input without any
+# matched vectors yields zero speakers instead of an IndexError.
+if n > 0:
+    labels[0] = current_speaker
+    centroids.append(np.array(vectors[0]))
+
+for i in range(1, n):
+    has_change = speaker_changes.get(chunk_ids[i], False)
+    vec = np.array(vectors[i])
+
+    if has_change:
+        # Speaker change: check if this is a NEW speaker or returning to a previous one
+        similarities = [_cosine(vec, c) for c in centroids]
+        best_sim = max(similarities)
+        best_cluster = similarities.index(best_sim)
+
+        if best_sim > RETURN_SIM and best_cluster != current_speaker:
+            # Returning to a previous speaker: make it the current one, otherwise the
+            # unflagged chunks that follow would be attributed to the old speaker.
+            current_speaker = best_cluster
+        elif best_sim < NEW_SPEAKER_SIM:
+            # New speaker
+            current_speaker = len(centroids)
+            centroids.append(vec)
+        else:
+            # Stay with current speaker (false change detection)
+            centroids[current_speaker] = (centroids[current_speaker] + vec) / 2  # average weighted toward recent chunks
+    else:
+        # No speaker change: same speaker as previous
+        centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
+    labels[i] = current_speaker
+
+n_speakers = len(set(labels))
+print(f"Identified {n_speakers} unique speakers")
+
+# Step 5: Update DB chunks with speaker assignment
+print("Updating DB chunks...")
+# Map: chunk_id -> speaker_id
+speaker_map = {cid: f"SPEAKER_{label}" for cid, label in zip(chunk_ids, labels)}
+
+# Merges {"speaker_id": ...} into a sentence chunk's metadata JSON (NULL is treated as {})
+update_sql = """
+        UPDATE dev.chunk SET metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
+        WHERE file_uuid = %s AND chunk_id = %s AND chunk_type = 'sentence'
+    """
+updated = 0  # number of UPDATE statements issued, one per mapped chunk_id
+for cid, spk_id in speaker_map.items():
+    cur.execute(update_sql, (json.dumps({"speaker_id": spk_id}), UUID, cid))
+    updated += 1
+
+conn.commit()
+print(f"Updated {updated} chunks with speaker IDs")
+
+# Step 6: Save speaker map
+speaker_map_path = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map.json"
+summary = {"speakers": n_speakers, "assignments": speaker_map}  # speaker count plus chunk_id -> speaker_id
+with open(speaker_map_path, "w") as out_file:
+    json.dump(summary, out_file, indent=2)
+print(f"Speaker map saved: {speaker_map_path}")
+cur.close()
+conn.close()
+print("=== Done ===")