#!/opt/homebrew/bin/python3.11
"""
Speaker Assignment: cluster voice vectors from Qdrant, assign speaker IDs to DB chunks.

Usage: pass the file UUID as the first command-line argument; a built-in
default UUID is used when it is omitted.

Steps performed at module level, in order:
  1. Scroll every point (vector + payload) out of the Qdrant collection.
  2. Keep only the points whose payload ``chunk_id`` is a 'sentence' chunk of
     the file in ``dev.chunk``, ordered by numeric chunk_id.
  3. Read the per-chunk ``speaker_change`` flags from ``{UUID}.asr.json``.
  4. Walk the chunks in order and give each one a speaker label.
  5. Merge ``{"speaker_id": "SPEAKER_n"}`` into each chunk's ``metadata``.
  6. Write ``{UUID}.speaker_map.json``.

Exits with a non-zero status (and a message on stderr) when no voice vector
matches the file's chunks.
"""
import json
import sys
import time
from urllib.request import Request, urlopen

import numpy as np
import psycopg2
# NOTE(review): `time` and the two sklearn names are not used by the code
# below; the imports are kept untouched.
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
QDRANT = "http://localhost:6333"
DB = "dbname=momentry user=accusys"
COLLECTION = "momentry_dev_voice"

# Directory holding the ASR input and receiving the speaker map output.
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
# Number of points requested per Qdrant scroll page.
SCROLL_PAGE_SIZE = 100
# At a flagged speaker change: best centroid similarity above this value (and
# belonging to a different cluster) means "an earlier speaker is back".
RETURNING_SPEAKER_SIM = 0.65
# At a flagged speaker change: best centroid similarity below this value means
# "nobody heard so far" and opens a new cluster.
NEW_SPEAKER_SIM = 0.55


def _scroll_voice_vectors():
    """Return ``(vectors, chunk_ids)`` for every point in the collection.

    Pages through the Qdrant scroll endpoint until it returns no points or no
    ``next_page_offset``. ``chunk_ids[i]`` is the payload ``chunk_id`` of
    ``vectors[i]`` (``""`` when the payload has none).
    """
    # NOTE(review): points are later matched to the file purely by payload
    # chunk_id. If chunk_ids are not unique across files in this collection,
    # vectors of other files would be picked up too -- confirm, and prefer a
    # server-side filter if the payload carries a file identifier.
    url = f"{QDRANT}/collections/{COLLECTION}/points/scroll"
    vectors = []
    chunk_ids = []
    offset = None
    while True:
        body = {"limit": SCROLL_PAGE_SIZE, "with_payload": True, "with_vector": True}
        if offset is not None:
            body["offset"] = offset
        req = Request(
            url,
            data=json.dumps(body).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        # Context manager so the HTTP response is closed after every page.
        with urlopen(req) as resp:
            result = json.load(resp)["result"]

        points = result.get("points", [])
        if not points:
            break
        for pt in points:
            # The payload may be present but null; treat that like "no payload".
            payload = pt.get("payload") or {}
            vectors.append(pt["vector"])
            chunk_ids.append(payload.get("chunk_id", ""))

        offset = result.get("next_page_offset")
        if offset is None:
            break
        print(f" Read {len(vectors)} vectors...")
    return vectors, chunk_ids


def _chunk_order(cid):
    """Sort key: numeric value of a digit-only chunk_id, 0 for anything else."""
    text = str(cid)
    return int(text) if text.isdigit() else 0


def _cosine(a, b):
    """Cosine similarity of two 1-D arrays; the epsilon guards zero vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))


def _assign_speakers(vectors, chunk_ids, speaker_changes):
    """Label each chunk with a speaker index, walking the chunks in order.

    Args:
        vectors: one embedding per chunk, already in playback order.
        chunk_ids: chunk id of each vector (same length as ``vectors``).
        speaker_changes: mapping chunk_id -> bool, True where the ASR output
            flagged a speaker change at that chunk.

    Returns:
        A list of int labels, one per chunk; empty when ``vectors`` is empty.

    The first chunk is speaker 0. A chunk without a change flag inherits the
    current speaker. At a flagged change the chunk is compared with every
    cluster centroid: a strong match with a different cluster switches back to
    that speaker, a weak best match opens a new speaker, and anything in
    between is treated as a false change detection.
    """
    if not vectors:
        return []

    centroids = [np.asarray(vectors[0])]
    current = 0
    labels = [current]

    for cid, raw in zip(chunk_ids[1:], vectors[1:]):
        vec = np.asarray(raw)
        opened_new_cluster = False

        if speaker_changes.get(cid, False):
            sims = [_cosine(vec, c) for c in centroids]
            best_sim = max(sims)
            best_cluster = sims.index(best_sim)

            if best_sim > RETURNING_SPEAKER_SIM and best_cluster != current:
                # Returning to a previous speaker. The switch must persist:
                # without updating `current`, the following unflagged chunks
                # would fall back to the speaker who just stopped talking.
                current = best_cluster
            elif best_sim < NEW_SPEAKER_SIM:
                # Unlike every known speaker: open a new cluster seeded with
                # this vector.
                current = len(centroids)
                centroids.append(vec)
                opened_new_cluster = True
            # else: ambiguous similarity -> keep the current speaker.

        if not opened_new_cluster:
            # Fold the chunk into its speaker's centroid. Averaging with the
            # previous centroid weights recent chunks more than a true mean.
            centroids[current] = (centroids[current] + vec) / 2
        labels.append(current)

    return labels


print(f"=== Speaker Assignment for {UUID} ===")

# Step 1: Read voice vectors from Qdrant
print("Reading voice vectors from Qdrant...")
vectors, chunk_ids = _scroll_voice_vectors()
print(f"Total vectors: {len(vectors)}")

conn = psycopg2.connect(DB)
cur = conn.cursor()
try:
    # Step 2: Filter to only our UUID's chunks (from DB)
    cur.execute(
        "SELECT chunk_id FROM dev.chunk WHERE file_uuid = %s AND chunk_type = 'sentence' ORDER BY id",
        (UUID,),
    )
    db_chunk_ids = {row[0] for row in cur.fetchall()}
    print(f"DB chunk_ids: {len(db_chunk_ids)}")

    matched = [(v, cid) for v, cid in zip(vectors, chunk_ids) if cid in db_chunk_ids]
    print(f"Matched vectors: {len(matched)}")
    if not matched:
        # Nothing to cluster: fail with a clear message and a non-zero status
        # rather than an opaque traceback further down.
        sys.exit(f"No voice vectors matched the DB chunks for {UUID}; nothing to assign.")

    # Sort by chunk_id (which is a numeric string); the sort is stable, so
    # non-numeric ids keep their relative order at the front.
    matched.sort(key=lambda pair: _chunk_order(pair[1]))
    vectors = [v for v, _ in matched]
    chunk_ids = [cid for _, cid in matched]

    # Step 3: Read speaker_change from asr.json
    asr_path = f"{OUTPUT_DIR}/{UUID}.asr.json"
    with open(asr_path, encoding="utf-8") as f:
        asr_data = json.load(f)
    speaker_changes = {
        seg["chunk_id"]: seg.get("speaker_change", False)
        for seg in asr_data.get("segments", [])
    }

    # Step 4: Cluster embeddings
    print("Clustering...")
    labels = _assign_speakers(vectors, chunk_ids, speaker_changes)
    n_speakers = len(set(labels))
    print(f"Identified {n_speakers} unique speakers")

    # Step 5: Update DB chunks with speaker assignment
    print("Updating DB chunks...")
    # Map: chunk_id -> speaker_id (a repeated chunk_id keeps its last label)
    speaker_map = {cid: f"SPEAKER_{label}" for cid, label in zip(chunk_ids, labels)}

    updated = 0
    for cid, spk_id in speaker_map.items():
        cur.execute(
            """
            UPDATE dev.chunk
            SET metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
            WHERE file_uuid = %s AND chunk_id = %s AND chunk_type = 'sentence'
            """,
            (json.dumps({"speaker_id": spk_id}), UUID, cid),
        )
        # Count rows actually changed, not statements issued.
        updated += cur.rowcount
    conn.commit()
    print(f"Updated {updated} chunks with speaker IDs")

    # Step 6: Save speaker map
    speaker_map_path = f"{OUTPUT_DIR}/{UUID}.speaker_map.json"
    with open(speaker_map_path, "w", encoding="utf-8") as f:
        json.dump({"speakers": n_speakers, "assignments": speaker_map}, f, indent=2)
    print(f"Speaker map saved: {speaker_map_path}")
finally:
    # Release the DB resources on every exit path, including errors and the
    # early sys.exit above.
    cur.close()
    conn.close()

print("=== Done ===")