feat: service inventory, ERP reports, sqlite-vec integration, visualize tool
- Add SERVICE_INVENTORY_V1.0.0.md (25 source-verified tools, 3.7GB) - Add ERP_SELECTION_REPORT.md (Odoo CE vs ERPNext comparison) - Add SFTPGO_ODOO_REPLACEMENT.md (SFTPGo migration plan) - Add SERVICE_GO_GITEA_BUILD.md (Go compiler + Gitea build report) - Add release visualize command (face trace heatmap + identity filter) - Add sqlite-vec integration (160MB SQLite with vec0 vector tables) - Add export_identities.py, export_sqlite.py, render_face_heatmap.py - Add Go, Gitea, Rust/Cargo, Swift, yt-dlp, SQLite, sqlite-vec to service CLI - Fix package to include identities and identity_bindings in data.sql - Update release list to show all deployed video stats - Add V1.0.0 YAML frontmatter to all docs (DOCS_STANDARD compliant)
This commit is contained in:
164
scripts/speaker_assign.py
Normal file
164
scripts/speaker_assign.py
Normal file
@@ -0,0 +1,164 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Assignment: cluster voice vectors from Qdrant, assign speaker IDs to DB chunks.
|
||||
"""
|
||||
import json, sys, time
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
from urllib.request import Request, urlopen
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
|
||||
QDRANT = "http://localhost:6333"
|
||||
DB = "dbname=momentry user=accusys"
|
||||
COLLECTION = "momentry_dev_voice"
|
||||
|
||||
print(f"=== Speaker Assignment for {UUID} ===")
|
||||
|
||||
# Step 1: Read voice vectors from Qdrant
|
||||
print("Reading voice vectors from Qdrant...")
|
||||
vectors = []
|
||||
chunk_ids = []
|
||||
# We need to scroll through all points
|
||||
offset = None
|
||||
while True:
|
||||
data = {"limit": 100, "with_payload": True, "with_vector": True}
|
||||
if offset is not None:
|
||||
data["offset"] = offset
|
||||
req = Request(f"{QDRANT}/collections/{COLLECTION}/points/scroll",
|
||||
data=json.dumps(data).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="POST")
|
||||
resp = json.loads(urlopen(req).read())
|
||||
result = resp["result"]
|
||||
points = result.get("points", [])
|
||||
if not points:
|
||||
break
|
||||
for pt in points:
|
||||
payload = pt.get("payload", {})
|
||||
cid = payload.get("chunk_id", "")
|
||||
# Only get vectors for THIS UUID's chunks
|
||||
# Filter by checking DB later, or rely on Qdrant payload
|
||||
vectors.append(pt["vector"])
|
||||
chunk_ids.append(cid)
|
||||
offset = result.get("next_page_offset")
|
||||
if offset is None:
|
||||
break
|
||||
print(f" Read {len(vectors)} vectors...")
|
||||
|
||||
print(f"Total vectors: {len(vectors)}")
|
||||
|
||||
# Step 2: Filter to only our UUID's chunks (from DB)
|
||||
conn = psycopg2.connect(DB)
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT chunk_id FROM dev.chunk WHERE file_uuid = %s AND chunk_type = 'sentence' ORDER BY id", (UUID,))
|
||||
db_chunk_ids = set(row[0] for row in cur.fetchall())
|
||||
print(f"DB chunk_ids: {len(db_chunk_ids)}")
|
||||
|
||||
# Filter vectors to match DB chunks
|
||||
filtered_vectors = []
|
||||
filtered_chunk_ids = []
|
||||
for v, cid in zip(vectors, chunk_ids):
|
||||
if cid in db_chunk_ids:
|
||||
filtered_vectors.append(v)
|
||||
filtered_chunk_ids.append(cid)
|
||||
|
||||
vectors = filtered_vectors
|
||||
chunk_ids = filtered_chunk_ids
|
||||
print(f"Matched vectors: {len(vectors)}")
|
||||
|
||||
# Sort by chunk_id (which is numeric string)
|
||||
indices = sorted(range(len(chunk_ids)), key=lambda i: int(chunk_ids[i]) if chunk_ids[i].isdigit() else 0)
|
||||
vectors = [vectors[i] for i in indices]
|
||||
chunk_ids = [chunk_ids[i] for i in indices]
|
||||
|
||||
# Step 3: Read speaker_change from asr.json
|
||||
asr_path = f"/Users/accusys/momentry/output_dev/{UUID}.asr.json"
|
||||
with open(asr_path) as f:
|
||||
asr_data = json.load(f)
|
||||
segments = asr_data.get("segments", [])
|
||||
speaker_changes = {}
|
||||
for seg in segments:
|
||||
speaker_changes[seg["chunk_id"]] = seg.get("speaker_change", False)
|
||||
|
||||
# Step 4: Cluster embeddings
|
||||
print("Clustering...")
|
||||
X = np.array(vectors)
|
||||
|
||||
# Compute cosine distance matrix
|
||||
# Cosine distance = 1 - cosine_similarity
|
||||
cos_sim = cosine_similarity(X)
|
||||
cos_dist = 1 - cos_sim
|
||||
|
||||
# Use AgglomerativeClustering with cosine distance
|
||||
# Determine optimal n_clusters by looking at speaker_change boundaries
|
||||
# First pass: use speaker_change as hard boundaries to get initial clusters
|
||||
# Then refine
|
||||
|
||||
# Simpler: use a distance threshold
|
||||
n = len(vectors)
|
||||
labels = np.full(n, -1, dtype=int)
|
||||
current_speaker = 0
|
||||
|
||||
# Start with first chunk as speaker 0
|
||||
labels[0] = current_speaker
|
||||
centroids = [np.array(vectors[0])] # per-cluster centroid
|
||||
|
||||
for i in range(1, n):
|
||||
has_change = speaker_changes.get(chunk_ids[i], False)
|
||||
vec = np.array(vectors[i])
|
||||
|
||||
if has_change:
|
||||
# Speaker change: check if this is a NEW speaker or returning to a previous one
|
||||
# Compare with centroid of current speaker vs others
|
||||
similarities = [float(np.dot(vec, c) / (np.linalg.norm(vec) * np.linalg.norm(c) + 1e-10)) for c in centroids]
|
||||
best_sim = max(similarities) if similarities else 0
|
||||
best_cluster = similarities.index(best_sim) if similarities else 0
|
||||
|
||||
if best_sim > 0.65 and best_cluster != current_speaker:
|
||||
# Returning to a previous speaker
|
||||
labels[i] = best_cluster
|
||||
elif best_sim < 0.55:
|
||||
# New speaker
|
||||
current_speaker = len(centroids)
|
||||
labels[i] = current_speaker
|
||||
centroids.append(vec)
|
||||
else:
|
||||
# Stay with current speaker (false change detection)
|
||||
labels[i] = current_speaker
|
||||
centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
|
||||
else:
|
||||
# No speaker change: same speaker as previous
|
||||
labels[i] = current_speaker
|
||||
centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
|
||||
|
||||
n_speakers = len(set(labels))
|
||||
print(f"Identified {n_speakers} unique speakers")
|
||||
|
||||
# Step 5: Update DB chunks with speaker assignment
|
||||
print("Updating DB chunks...")
|
||||
# Map: chunk_id -> speaker_id
|
||||
speaker_map = {}
|
||||
for cid, label in zip(chunk_ids, labels):
|
||||
speaker_map[cid] = f"SPEAKER_{label}"
|
||||
|
||||
updated = 0
|
||||
for cid, spk_id in speaker_map.items():
|
||||
cur.execute("""
|
||||
UPDATE dev.chunk SET metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
|
||||
WHERE file_uuid = %s AND chunk_id = %s AND chunk_type = 'sentence'
|
||||
""", (json.dumps({"speaker_id": spk_id}), UUID, cid))
|
||||
updated += 1
|
||||
|
||||
conn.commit()
|
||||
print(f"Updated {updated} chunks with speaker IDs")
|
||||
|
||||
# Step 6: Save speaker map
|
||||
speaker_map_path = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map.json"
|
||||
with open(speaker_map_path, "w") as f:
|
||||
json.dump({"speakers": n_speakers, "assignments": speaker_map}, f, indent=2)
|
||||
print(f"Speaker map saved: {speaker_map_path}")
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
print("=== Done ===")
|
||||
Reference in New Issue
Block a user