Files
momentry_core/scripts/speaker_assign.py
Accusys 2992a0e650 feat: service inventory, ERP reports, sqlite-vec integration, visualize tool
- Add SERVICE_INVENTORY_V1.0.0.md (25 source-verified tools, 3.7GB)
- Add ERP_SELECTION_REPORT.md (Odoo CE vs ERPNext comparison)
- Add SFTPGO_ODOO_REPLACEMENT.md (SFTPGo migration plan)
- Add SERVICE_GO_GITEA_BUILD.md (Go compiler + Gitea build report)
- Add release visualize command (face trace heatmap + identity filter)
- Add sqlite-vec integration (160MB SQLite with vec0 vector tables)
- Add export_identities.py, export_sqlite.py, render_face_heatmap.py
- Add Go, Gitea, Rust/Cargo, Swift, yt-dlp, SQLite, sqlite-vec to service CLI
- Fix package to include identities and identity_bindings in data.sql
- Update release list to show all deployed video stats
- Add V1.0.0 YAML frontmatter to all docs (DOCS_STANDARD compliant)
2026-05-13 02:37:45 +08:00

165 lines
5.6 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Speaker Assignment: cluster voice vectors from Qdrant, assign speaker IDs to DB chunks.
"""
import json, sys, time
import psycopg2
import numpy as np
from urllib.request import Request, urlopen
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
# File UUID whose chunks are processed: first command-line argument, or the
# hard-coded fallback below when the script is run without arguments.
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
# Base URL of the Qdrant HTTP API the voice vectors are read from.
QDRANT = "http://localhost:6333"
# Connection string handed to psycopg2.connect().
DB = "dbname=momentry user=accusys"
# Name of the Qdrant collection that is scrolled for voice vectors.
COLLECTION = "momentry_dev_voice"
print(f"=== Speaker Assignment for {UUID} ===")

# Step 1: Read voice vectors from Qdrant
#
# The whole collection is scrolled page by page; every point's vector and its
# payload "chunk_id" are collected into two parallel lists. Restricting the
# result to this UUID's chunks happens afterwards against the DB.
print("Reading voice vectors from Qdrant...")
vectors = []
chunk_ids = []
scroll_url = f"{QDRANT}/collections/{COLLECTION}/points/scroll"
# Upper bound (seconds) for one page request, so a stalled server cannot hang
# the script indefinitely.
scroll_timeout_s = 60
offset = None
while True:
    data = {"limit": 100, "with_payload": True, "with_vector": True}
    if offset is not None:
        # Resume where the previous page ended.
        data["offset"] = offset
    req = Request(
        scroll_url,
        data=json.dumps(data).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # The context manager closes the HTTP response (and its socket) as soon as
    # the body has been read instead of leaving it to the garbage collector.
    with urlopen(req, timeout=scroll_timeout_s) as http_resp:
        resp = json.loads(http_resp.read())
    result = resp["result"]
    points = result.get("points", [])
    if not points:
        break
    for pt in points:
        # "payload" may be present but null; treat that like an empty payload.
        payload = pt.get("payload") or {}
        cid = payload.get("chunk_id", "")
        vectors.append(pt["vector"])
        chunk_ids.append(cid)
    offset = result.get("next_page_offset")
    if offset is None:
        # No further page.
        break
    print(f" Read {len(vectors)} vectors...")
print(f"Total vectors: {len(vectors)}")
# Step 2: Keep only the vectors whose chunk_id belongs to this UUID in the DB
conn = psycopg2.connect(DB)
cur = conn.cursor()
cur.execute("SELECT chunk_id FROM dev.chunk WHERE file_uuid = %s AND chunk_type = 'sentence' ORDER BY id", (UUID,))
db_chunk_ids = {row[0] for row in cur.fetchall()}
print(f"DB chunk_ids: {len(db_chunk_ids)}")

# Pair every vector with its chunk_id and drop the pairs unknown to the DB.
matched = [(cid, v) for v, cid in zip(vectors, chunk_ids) if cid in db_chunk_ids]
print(f"Matched vectors: {len(matched)}")

# Order by chunk_id, which is expected to be a numeric string; anything that is
# not all digits sorts with key 0. The sort is stable, so ties keep the order
# in which the points were read.
matched.sort(key=lambda pair: int(pair[0]) if pair[0].isdigit() else 0)
vectors = [v for _, v in matched]
chunk_ids = [cid for cid, _ in matched]
# Step 3: Load the per-segment speaker_change flags from the ASR output
asr_path = f"/Users/accusys/momentry/output_dev/{UUID}.asr.json"
with open(asr_path) as asr_file:
    asr_data = json.load(asr_file)
segments = asr_data.get("segments", [])
# chunk_id -> speaker_change flag; a segment without the flag counts as False.
speaker_changes = {
    seg["chunk_id"]: seg.get("speaker_change", False)
    for seg in segments
}
# Step 4: Cluster embeddings
#
# Chunks are walked in order and labelled sequentially. A chunk without a
# speaker_change flag inherits the active speaker. A flagged chunk is compared
# (cosine similarity) against the running centroid of every speaker seen so
# far and is either matched back to an earlier speaker, opened as a new
# speaker, or treated as a false alarm and kept with the active speaker.
def _cosine(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a float.

    A small epsilon in the denominator avoids a division by zero when either
    vector has zero norm.
    """
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))


def _assign_speakers(vectors, chunk_ids, speaker_changes,
                     rejoin_threshold=0.65, new_speaker_threshold=0.55):
    """Assign an integer speaker label to every vector, in order.

    Args:
        vectors: sequence of embedding vectors, one per chunk.
        chunk_ids: chunk identifiers parallel to *vectors*.
        speaker_changes: mapping chunk_id -> bool; chunks missing from the
            mapping are treated as "no change".
        rejoin_threshold: a flagged chunk whose best-matching speaker is not
            the active one switches to that speaker when the similarity is
            strictly above this value.
        new_speaker_threshold: a flagged chunk whose best similarity is
            strictly below this value starts a new speaker.

    Returns:
        NumPy int array of labels with one entry per vector (empty when
        *vectors* is empty). Labels start at 0 and grow in order of first
        appearance.
    """
    n = len(vectors)
    labels = np.full(n, -1, dtype=int)
    if n == 0:
        # Nothing to label; avoids indexing the first element of empty input.
        return labels

    # The first chunk defines speaker 0.
    current_speaker = 0
    labels[0] = current_speaker
    # One running representative vector per speaker. Each update averages the
    # old value with the new vector, so recent chunks weigh more than old ones.
    centroids = [np.array(vectors[0])]

    for i in range(1, n):
        vec = np.array(vectors[i])
        if speaker_changes.get(chunk_ids[i], False):
            similarities = [_cosine(vec, c) for c in centroids]
            # First index of the maximum, i.e. the earliest speaker on ties.
            best_cluster = max(range(len(similarities)), key=similarities.__getitem__)
            best_sim = similarities[best_cluster]
            if best_sim > rejoin_threshold and best_cluster != current_speaker:
                # Returning to a previous speaker. The active speaker must
                # switch too, otherwise the following unflagged chunks would
                # be labelled with (and averaged into) the wrong speaker.
                current_speaker = best_cluster
            elif best_sim < new_speaker_threshold:
                # Unlike every known speaker: open a new one seeded with this
                # vector.
                current_speaker = len(centroids)
                centroids.append(vec)
                labels[i] = current_speaker
                continue
            # Otherwise: false change detection, stay with the active speaker.
        labels[i] = current_speaker
        centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
    return labels


print("Clustering...")
n = len(vectors)
if n == 0:
    print("No matching voice vectors; nothing to cluster")
labels = _assign_speakers(vectors, chunk_ids, speaker_changes)
n_speakers = len(set(labels.tolist()))
print(f"Identified {n_speakers} unique speakers")
# Step 5: Write the speaker assignment into each chunk's metadata
print("Updating DB chunks...")
# chunk_id -> "SPEAKER_<label>"
speaker_map = {cid: f"SPEAKER_{label}" for cid, label in zip(chunk_ids, labels)}
# Merges {"speaker_id": ...} into the existing jsonb metadata (or into an
# empty object when metadata is NULL).
update_sql = """
UPDATE dev.chunk SET metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
WHERE file_uuid = %s AND chunk_id = %s AND chunk_type = 'sentence'
"""
# Counts the UPDATE statements issued; stays 0 when there is nothing to write.
updated = 0
for updated, (cid, spk_id) in enumerate(speaker_map.items(), start=1):
    patch = json.dumps({"speaker_id": spk_id})
    cur.execute(update_sql, (patch, UUID, cid))
conn.commit()
print(f"Updated {updated} chunks with speaker IDs")
# Step 6: Persist the speaker map next to the other outputs, then clean up
speaker_map_path = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map.json"
summary = {"speakers": n_speakers, "assignments": speaker_map}
with open(speaker_map_path, "w") as out:
    json.dump(summary, out, indent=2)
print(f"Speaker map saved: {speaker_map_path}")

# Release the DB cursor and connection opened in step 2.
cur.close()
conn.close()
print("=== Done ===")