- Add SERVICE_INVENTORY_V1.0.0.md (25 source-verified tools, 3.7GB)
- Add ERP_SELECTION_REPORT.md (Odoo CE vs ERPNext comparison)
- Add SFTPGO_ODOO_REPLACEMENT.md (SFTPGo migration plan)
- Add SERVICE_GO_GITEA_BUILD.md (Go compiler + Gitea build report)
- Add release visualize command (face trace heatmap + identity filter)
- Add sqlite-vec integration (160MB SQLite with vec0 vector tables)
- Add export_identities.py, export_sqlite.py, render_face_heatmap.py
- Add Go, Gitea, Rust/Cargo, Swift, yt-dlp, SQLite, sqlite-vec to service CLI
- Fix package to include identities and identity_bindings in data.sql
- Update release list to show all deployed video stats
- Add V1.0.0 YAML frontmatter to all docs (DOCS_STANDARD compliant)
165 lines
5.6 KiB
Python
165 lines
5.6 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Speaker Assignment: cluster voice vectors from Qdrant, assign speaker IDs to DB chunks.
|
|
"""
|
|
import json, sys, time
|
|
import psycopg2
|
|
import numpy as np
|
|
from urllib.request import Request, urlopen
|
|
from sklearn.cluster import AgglomerativeClustering
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
# File UUID to process; falls back to a fixed development UUID when no CLI arg is given.
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
QDRANT = "http://localhost:6333"
DB = "dbname=momentry user=accusys"
COLLECTION = "momentry_dev_voice"


def fetch_voice_vectors(qdrant_url=QDRANT, collection=COLLECTION,
                        page_size=100, timeout=60):
    """Scroll through every point of a Qdrant collection.

    Args:
        qdrant_url: Base URL of the Qdrant HTTP API.
        collection: Name of the collection to scroll.
        page_size: Number of points requested per scroll call.
        timeout: Per-request socket timeout in seconds.

    Returns:
        A ``(vectors, chunk_ids)`` pair of parallel lists. ``chunk_ids[i]`` is
        the ``chunk_id`` payload field of the point that carries
        ``vectors[i]`` (``""`` when the point has no such field).

    Raises:
        urllib.error.URLError: If Qdrant cannot be reached or times out.
    """
    url = f"{qdrant_url}/collections/{collection}/points/scroll"
    vectors = []
    chunk_ids = []
    offset = None
    while True:
        body = {"limit": page_size, "with_payload": True, "with_vector": True}
        if offset is not None:
            body["offset"] = offset
        req = Request(
            url,
            data=json.dumps(body).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        # Context manager so the HTTP response is always closed.
        with urlopen(req, timeout=timeout) as http_resp:
            result = json.loads(http_resp.read())["result"]

        points = result.get("points", [])
        if not points:
            break
        for pt in points:
            # The payload may be present but null; treat that like "no payload".
            payload = pt.get("payload") or {}
            vectors.append(pt["vector"])
            chunk_ids.append(payload.get("chunk_id", ""))

        offset = result.get("next_page_offset")
        if offset is None:
            break
        print(f"  Read {len(vectors)} vectors...")
    return vectors, chunk_ids


def load_db_chunk_ids(cur, file_uuid):
    """Return the set of sentence-chunk ids stored in the DB for ``file_uuid``."""
    cur.execute(
        "SELECT chunk_id FROM dev.chunk "
        "WHERE file_uuid = %s AND chunk_type = 'sentence' ORDER BY id",
        (file_uuid,),
    )
    return {row[0] for row in cur.fetchall()}


def _chunk_sort_key(cid):
    """Numeric sort key for a chunk id; non-numeric ids sort as 0."""
    text = str(cid)
    return int(text) if text.isdigit() else 0


def filter_and_sort(vectors, chunk_ids, allowed_chunk_ids):
    """Keep only vectors whose chunk id is allowed, ordered by numeric chunk id.

    Args:
        vectors: Sequence of embedding vectors.
        chunk_ids: Chunk ids parallel to ``vectors``.
        allowed_chunk_ids: Container of chunk ids to keep.

    Returns:
        A ``(vectors, chunk_ids)`` pair of parallel lists. The sort is stable,
        so entries with equal keys keep their incoming order.
    """
    # NOTE(review): matching is by chunk_id only. If chunk ids repeat across
    # files in the shared collection, vectors of other files would be kept
    # too - confirm whether the payload carries a file identifier to filter on.
    matched = [
        (vec, cid)
        for vec, cid in zip(vectors, chunk_ids)
        if cid in allowed_chunk_ids
    ]
    matched.sort(key=lambda pair: _chunk_sort_key(pair[1]))
    return [vec for vec, _ in matched], [cid for _, cid in matched]


def load_speaker_changes(asr_path):
    """Read an ASR JSON file and map each segment's chunk_id to its speaker_change flag.

    Segments without a ``speaker_change`` key map to ``False``.
    """
    with open(asr_path, encoding="utf-8") as f:
        asr_data = json.load(f)
    return {
        seg["chunk_id"]: seg.get("speaker_change", False)
        for seg in asr_data.get("segments", [])
    }


def _cosine(a, b):
    """Cosine similarity of two vectors; the epsilon guards against zero norms."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))


def assign_speakers(vectors, chunk_ids, speaker_changes,
                    returning_sim=0.65, new_speaker_sim=0.55):
    """Assign an integer speaker label to every chunk, in order.

    The first chunk starts speaker 0. A chunk without a speaker-change flag
    inherits the current speaker. A flagged chunk is compared (cosine
    similarity) with every known speaker centroid:

    * best similarity above ``returning_sim`` and belonging to another
      speaker: switch back to that speaker;
    * best similarity below ``new_speaker_sim``: open a new speaker;
    * otherwise: treat the flag as a false positive and keep the current
      speaker.

    Args:
        vectors: Embeddings ordered by chunk.
        chunk_ids: Chunk ids parallel to ``vectors``.
        speaker_changes: Mapping chunk_id -> bool speaker-change flag.
        returning_sim: Similarity above which a flagged chunk rejoins a
            previously seen speaker.
        new_speaker_sim: Similarity below which a flagged chunk opens a new
            speaker.

    Returns:
        ``numpy.ndarray`` of int labels, one per input vector (empty for
        empty input).
    """
    n = len(vectors)
    labels = np.full(n, -1, dtype=int)
    if n == 0:
        return labels

    current = 0
    labels[0] = current
    centroids = [np.asarray(vectors[0], dtype=float)]

    for i in range(1, n):
        vec = np.asarray(vectors[i], dtype=float)

        if speaker_changes.get(chunk_ids[i], False):
            sims = [_cosine(vec, c) for c in centroids]
            best_sim = max(sims)
            best_cluster = sims.index(best_sim)

            if best_sim > returning_sim and best_cluster != current:
                # Returning to a previous speaker. The current speaker must
                # follow, otherwise the next unflagged chunks would still be
                # attributed to the speaker we just left.
                current = best_cluster
                labels[i] = current
                continue
            if best_sim < new_speaker_sim:
                # Unlike every known speaker: open a new one seeded with
                # this vector.
                current = len(centroids)
                labels[i] = current
                centroids.append(vec)
                continue
            # Otherwise: false change detection, fall through and stay.

        labels[i] = current
        # Halving average: recent chunks weigh more than older ones.
        centroids[current] = (centroids[current] + vec) / 2

    return labels


def write_speaker_ids(cur, file_uuid, speaker_map):
    """Merge ``{"speaker_id": ...}`` into the metadata of each mapped chunk.

    Returns:
        The number of rows the database reports as updated.
    """
    updated = 0
    for cid, spk_id in speaker_map.items():
        cur.execute(
            """
            UPDATE dev.chunk SET metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
            WHERE file_uuid = %s AND chunk_id = %s AND chunk_type = 'sentence'
            """,
            (json.dumps({"speaker_id": spk_id}), file_uuid, cid),
        )
        # Count rows actually changed, not statements issued.
        updated += cur.rowcount
    return updated


def main():
    """Run the full speaker-assignment pipeline for ``UUID``."""
    print(f"=== Speaker Assignment for {UUID} ===")

    # Step 1: Read voice vectors from Qdrant
    print("Reading voice vectors from Qdrant...")
    vectors, chunk_ids = fetch_voice_vectors()
    print(f"Total vectors: {len(vectors)}")

    conn = psycopg2.connect(DB)
    try:
        cur = conn.cursor()
        try:
            # Step 2: Filter to only our UUID's chunks (from DB)
            db_chunk_ids = load_db_chunk_ids(cur, UUID)
            print(f"DB chunk_ids: {len(db_chunk_ids)}")
            vectors, chunk_ids = filter_and_sort(vectors, chunk_ids, db_chunk_ids)
            print(f"Matched vectors: {len(vectors)}")
            if not vectors:
                # Nothing to cluster: stop before touching the DB or the
                # speaker-map file.
                sys.exit(f"No voice vectors matched DB chunks for {UUID}")

            # Step 3: Read speaker_change from asr.json
            asr_path = f"/Users/accusys/momentry/output_dev/{UUID}.asr.json"
            speaker_changes = load_speaker_changes(asr_path)

            # Step 4: Cluster embeddings
            print("Clustering...")
            labels = assign_speakers(vectors, chunk_ids, speaker_changes)
            n_speakers = len(set(labels.tolist()))
            print(f"Identified {n_speakers} unique speakers")

            # Step 5: Update DB chunks with speaker assignment
            print("Updating DB chunks...")
            speaker_map = {
                cid: f"SPEAKER_{label}" for cid, label in zip(chunk_ids, labels)
            }
            updated = write_speaker_ids(cur, UUID, speaker_map)
            conn.commit()
            print(f"Updated {updated} chunks with speaker IDs")

            # Step 6: Save speaker map
            speaker_map_path = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map.json"
            with open(speaker_map_path, "w", encoding="utf-8") as f:
                json.dump({"speakers": n_speakers, "assignments": speaker_map}, f, indent=2)
            print(f"Speaker map saved: {speaker_map_path}")
        finally:
            cur.close()
    finally:
        # Closing without a commit discards any uncommitted updates.
        conn.close()

    print("=== Done ===")


if __name__ == "__main__":
    main()