# momentry_core/scripts/speaker_assign.py

#!/opt/homebrew/bin/python3.11
"""
Speaker Assignment: cluster voice vectors from Qdrant, assign speaker IDs to DB chunks.
"""
import json, sys, time
import psycopg2
import numpy as np
from urllib.request import Request, urlopen
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

# --- Run configuration -------------------------------------------------------
# File UUID to process: the first command-line argument when one is given,
# otherwise a built-in fallback UUID.
UUID = (sys.argv[1:2] or ["23b1c872379d4ec06479e5ed39eef4c5"])[0]

# Base URL of the Qdrant HTTP API and the collection holding the voice vectors.
QDRANT = "http://localhost:6333"
COLLECTION = "momentry_dev_voice"

# Connection string handed to psycopg2.connect().
DB = "dbname=momentry user=accusys"

print(f"=== Speaker Assignment for {UUID} ===")

# --- Pipeline tunables -------------------------------------------------------
# Directory that holds "<uuid>.asr.json" and receives "<uuid>.speaker_map.json".
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
# Number of points requested per Qdrant scroll page.
QDRANT_PAGE_SIZE = 100
# Seconds to wait on a single Qdrant HTTP request before giving up.
QDRANT_TIMEOUT_S = 60
# On a flagged speaker change: cosine similarity above which the chunk is
# treated as a previously seen speaker coming back ...
RETURNING_SPEAKER_SIM = 0.65
# ... and below which a brand-new speaker cluster is opened. Anything in
# between is treated as a false change detection.
NEW_SPEAKER_SIM = 0.55


def fetch_voice_vectors(qdrant_url, collection,
                        page_size=QDRANT_PAGE_SIZE, timeout=QDRANT_TIMEOUT_S):
    """Scroll through every point of a Qdrant collection.

    Args:
        qdrant_url: Base URL of the Qdrant HTTP API.
        collection: Name of the collection to scroll.
        page_size: Points requested per scroll request.
        timeout: Per-request timeout in seconds.

    Returns:
        A ``(vectors, chunk_ids)`` pair of parallel lists; ``chunk_ids[i]`` is
        the ``chunk_id`` payload field of point ``i`` ("" when absent).

    Raises:
        urllib.error.URLError: if Qdrant cannot be reached or times out.
    """
    # NOTE(review): this reads the WHOLE collection and relies on the caller
    # to filter by chunk_id afterwards. If the payload carries a file
    # identifier, a server-side scroll filter would be cheaper and would rule
    # out chunk_id collisions between files -- confirm the payload schema.
    url = f"{qdrant_url}/collections/{collection}/points/scroll"
    vectors = []
    chunk_ids = []
    offset = None
    while True:
        body = {"limit": page_size, "with_payload": True, "with_vector": True}
        if offset is not None:
            body["offset"] = offset
        req = Request(url, data=json.dumps(body).encode(),
                      headers={"Content-Type": "application/json"}, method="POST")
        # Context manager closes the HTTP response even if parsing fails.
        with urlopen(req, timeout=timeout) as resp:
            result = json.loads(resp.read())["result"]
        points = result.get("points", [])
        if not points:
            break
        for pt in points:
            payload = pt.get("payload") or {}  # tolerate a null payload
            vectors.append(pt["vector"])
            chunk_ids.append(payload.get("chunk_id", ""))
        offset = result.get("next_page_offset")
        if offset is None:
            break
        print(f"  Read {len(vectors)} vectors...")
    return vectors, chunk_ids


def chunk_sort_key(cid):
    """Return the numeric value of a digit-only chunk id, else 0."""
    text = str(cid)
    return int(text) if text.isdigit() else 0


def select_file_vectors(vectors, chunk_ids, wanted_ids):
    """Keep the vectors whose chunk id is in *wanted_ids*, sorted by chunk id.

    Args:
        vectors: Vectors parallel to *chunk_ids*.
        chunk_ids: Chunk id of each vector.
        wanted_ids: Set of chunk ids to keep.

    Returns:
        A ``(vectors, chunk_ids)`` pair of parallel lists ordered by
        ``chunk_sort_key`` (stable, so ties keep their input order).
    """
    kept = [(cid, vec) for vec, cid in zip(vectors, chunk_ids) if cid in wanted_ids]
    kept.sort(key=lambda pair: chunk_sort_key(pair[0]))
    return [vec for _, vec in kept], [cid for cid, _ in kept]


def load_speaker_changes(asr_path):
    """Read the ASR JSON file and map each segment's chunk_id to its speaker_change flag.

    Segments without a ``speaker_change`` key map to ``False``.
    """
    with open(asr_path, encoding="utf-8") as f:
        asr_data = json.load(f)
    return {
        seg["chunk_id"]: seg.get("speaker_change", False)
        for seg in asr_data.get("segments", [])
    }


def cosine_sim(a, b):
    """Cosine similarity of two vectors; the epsilon guards against zero norms."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))


def assign_speakers(vectors, chunk_ids, speaker_changes,
                    returning_sim=RETURNING_SPEAKER_SIM, new_sim=NEW_SPEAKER_SIM):
    """Assign a speaker label to each chunk in a single sequential pass.

    The first chunk is speaker 0. A chunk without a speaker-change flag keeps
    the current speaker. On a flagged change the chunk is compared with every
    cluster centroid: a strong match with a different cluster switches back to
    that speaker, a weak best match opens a new speaker, and anything else is
    treated as a false change detection.

    Args:
        vectors: Embeddings ordered like *chunk_ids*.
        chunk_ids: Chunk id of each embedding.
        speaker_changes: Mapping chunk_id -> bool; missing ids count as False.
        returning_sim: Similarity above which a different cluster is re-joined.
        new_sim: Similarity below which a new cluster is created.

    Returns:
        An int ``numpy`` array of cluster indices, one per input vector
        (empty when there are no vectors).
    """
    n = len(vectors)
    labels = np.full(n, -1, dtype=int)
    if n == 0:
        return labels

    current = 0
    labels[0] = current
    # Each centroid is a running blend: (old_centroid + new_vector) / 2.
    centroids = [np.asarray(vectors[0], dtype=float)]

    for i in range(1, n):
        vec = np.asarray(vectors[i], dtype=float)
        opened_new_cluster = False

        if speaker_changes.get(chunk_ids[i], False):
            sims = [cosine_sim(vec, c) for c in centroids]
            best_sim = max(sims)
            best_cluster = sims.index(best_sim)

            if best_sim > returning_sim and best_cluster != current:
                # Returning to a previous speaker. The current speaker must
                # switch too, otherwise the following unflagged chunks would
                # still be attributed to the speaker who just stopped talking.
                current = best_cluster
            elif best_sim < new_sim:
                centroids.append(vec)
                current = len(centroids) - 1
                opened_new_cluster = True
            # Otherwise: false change detection, stay with the current speaker.

        labels[i] = current
        if not opened_new_cluster:
            centroids[current] = (centroids[current] + vec) / 2

    return labels


def write_speaker_ids(cur, file_uuid, speaker_map):
    """Merge ``{"speaker_id": ...}`` into the metadata of each sentence chunk.

    Args:
        cur: Open DB cursor; the caller owns commit/rollback.
        file_uuid: File whose chunks are updated.
        speaker_map: Mapping chunk_id -> speaker id string.

    Returns:
        Number of rows the database reports as updated.
    """
    updated = 0
    for cid, spk_id in speaker_map.items():
        cur.execute("""
            UPDATE dev.chunk SET metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
            WHERE file_uuid = %s AND chunk_id = %s AND chunk_type = 'sentence'
        """, (json.dumps({"speaker_id": spk_id}), file_uuid, cid))
        # Count rows actually touched rather than statements issued.
        updated += cur.rowcount
    return updated


def main():
    """Run the whole pipeline for the configured UUID."""
    # Step 1: Read voice vectors from Qdrant
    print("Reading voice vectors from Qdrant...")
    vectors, chunk_ids = fetch_voice_vectors(QDRANT, COLLECTION)
    print(f"Total vectors: {len(vectors)}")

    conn = psycopg2.connect(DB)
    try:
        with conn.cursor() as cur:
            # Step 2: Filter to only our UUID's chunks (from DB)
            cur.execute("SELECT chunk_id FROM dev.chunk WHERE file_uuid = %s AND chunk_type = 'sentence' ORDER BY id", (UUID,))
            db_chunk_ids = {row[0] for row in cur.fetchall()}
            print(f"DB chunk_ids: {len(db_chunk_ids)}")

            vectors, chunk_ids = select_file_vectors(vectors, chunk_ids, db_chunk_ids)
            print(f"Matched vectors: {len(vectors)}")
            if not vectors:
                # Nothing to cluster: fail with a clear message and non-zero
                # exit status instead of an opaque error further down.
                sys.exit(f"No voice vectors matched file {UUID}; nothing to assign")

            # Step 3: Read speaker_change from asr.json
            speaker_changes = load_speaker_changes(f"{OUTPUT_DIR}/{UUID}.asr.json")

            # Step 4: Cluster embeddings
            print("Clustering...")
            labels = assign_speakers(vectors, chunk_ids, speaker_changes).tolist()
            n_speakers = len(set(labels))
            print(f"Identified {n_speakers} unique speakers")

            # Step 5: Update DB chunks with speaker assignment
            print("Updating DB chunks...")
            speaker_map = {cid: f"SPEAKER_{label}" for cid, label in zip(chunk_ids, labels)}
            updated = write_speaker_ids(cur, UUID, speaker_map)

        conn.commit()
        print(f"Updated {updated} chunks with speaker IDs")
    finally:
        # Always release the connection; on an error path nothing was committed.
        conn.close()

    # Step 6: Save speaker map
    speaker_map_path = f"{OUTPUT_DIR}/{UUID}.speaker_map.json"
    with open(speaker_map_path, "w") as f:
        json.dump({"speakers": n_speakers, "assignments": speaker_map}, f, indent=2)
    print(f"Speaker map saved: {speaker_map_path}")

    print("=== Done ===")


if __name__ == "__main__":
    main()