feat: service inventory, ERP reports, sqlite-vec integration, visualize tool

- Add SERVICE_INVENTORY_V1.0.0.md (25 source-verified tools, 3.7GB) - Add ERP_SELECTION_REPORT.md (Odoo CE vs ERPNext comparison) - Add SFTPGO_ODOO_REPLACEMENT.md (SFTPGo migration plan) - Add SERVICE_GO_GITEA_BUILD.md (Go compiler + Gitea build report) - Add release visualize command (face trace heatmap + identity filter) - Add sqlite-vec integration (160MB SQLite with vec0 vector tables) - Add export_identities.py, export_sqlite.py, render_face_heatmap.py - Add Go, Gitea, Rust/Cargo, Swift, yt-dlp, SQLite, sqlite-vec to service CLI - Fix package to include identities and identity_bindings in data.sql - Update release list to show all deployed video stats - Add V1.0.0 YAML frontmatter to all docs (DOCS_STANDARD compliant)
2026-05-13 02:37:45 +08:00
parent cac60c6093
commit 2992a0e650
25 changed files with 6076 additions and 3 deletions
--- a/scripts/speaker_assign.py
+++ b/scripts/speaker_assign.py
@@ -0,0 +1,164 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Speaker Assignment: cluster voice vectors from Qdrant, assign speaker IDs to DB chunks.
+"""
+import json, sys, time
+import psycopg2
+import numpy as np
+from urllib.request import Request, urlopen
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics.pairwise import cosine_similarity
+
+UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
+QDRANT = "http://localhost:6333"
+DB = "dbname=momentry user=accusys"
+COLLECTION = "momentry_dev_voice"
+
+print(f"=== Speaker Assignment for {UUID} ===")
+
+# Step 1: Read voice vectors from Qdrant
+print("Reading voice vectors from Qdrant...")
+vectors = []
+chunk_ids = []
+# We need to scroll through all points
+offset = None
+while True:
+    data = {"limit": 100, "with_payload": True, "with_vector": True}
+    if offset is not None:
+        data["offset"] = offset
+    req = Request(f"{QDRANT}/collections/{COLLECTION}/points/scroll",
+        data=json.dumps(data).encode(),
+        headers={"Content-Type": "application/json"}, method="POST")
+    resp = json.loads(urlopen(req).read())
+    result = resp["result"]
+    points = result.get("points", [])
+    if not points:
+        break
+    for pt in points:
+        payload = pt.get("payload", {})
+        cid = payload.get("chunk_id", "")
+        # Only get vectors for THIS UUID's chunks
+        # Filter by checking DB later, or rely on Qdrant payload
+        vectors.append(pt["vector"])
+        chunk_ids.append(cid)
+    offset = result.get("next_page_offset")
+    if offset is None:
+        break
+    print(f"  Read {len(vectors)} vectors...")
+
+print(f"Total vectors: {len(vectors)}")
+
+# Step 2: Filter to only our UUID's chunks (from DB)
+conn = psycopg2.connect(DB)
+cur = conn.cursor()
+cur.execute("SELECT chunk_id FROM dev.chunk WHERE file_uuid = %s AND chunk_type = 'sentence' ORDER BY id", (UUID,))
+db_chunk_ids = set(row[0] for row in cur.fetchall())
+print(f"DB chunk_ids: {len(db_chunk_ids)}")
+
+# Filter vectors to match DB chunks
+filtered_vectors = []
+filtered_chunk_ids = []
+for v, cid in zip(vectors, chunk_ids):
+    if cid in db_chunk_ids:
+        filtered_vectors.append(v)
+        filtered_chunk_ids.append(cid)
+
+vectors = filtered_vectors
+chunk_ids = filtered_chunk_ids
+print(f"Matched vectors: {len(vectors)}")
+
+# Sort by chunk_id (which is numeric string)
+indices = sorted(range(len(chunk_ids)), key=lambda i: int(chunk_ids[i]) if chunk_ids[i].isdigit() else 0)
+vectors = [vectors[i] for i in indices]
+chunk_ids = [chunk_ids[i] for i in indices]
+
+# Step 3: Read speaker_change from asr.json
+asr_path = f"/Users/accusys/momentry/output_dev/{UUID}.asr.json"
+with open(asr_path) as f:
+    asr_data = json.load(f)
+segments = asr_data.get("segments", [])
+speaker_changes = {}
+for seg in segments:
+    speaker_changes[seg["chunk_id"]] = seg.get("speaker_change", False)
+
+# Step 4: Cluster embeddings
+print("Clustering...")
+X = np.array(vectors)
+
+# Compute cosine distance matrix
+# Cosine distance = 1 - cosine_similarity
+cos_sim = cosine_similarity(X)
+cos_dist = 1 - cos_sim
+
+# Use AgglomerativeClustering with cosine distance
+# Determine optimal n_clusters by looking at speaker_change boundaries
+# First pass: use speaker_change as hard boundaries to get initial clusters
+# Then refine
+
+# Simpler: use a distance threshold
+n = len(vectors)
+labels = np.full(n, -1, dtype=int)
+current_speaker = 0
+
+# Start with first chunk as speaker 0
+labels[0] = current_speaker
+centroids = [np.array(vectors[0])]  # per-cluster centroid
+
+for i in range(1, n):
+    has_change = speaker_changes.get(chunk_ids[i], False)
+    vec = np.array(vectors[i])
+
+    if has_change:
+        # Speaker change: check if this is a NEW speaker or returning to a previous one
+        # Compare with centroid of current speaker vs others
+        similarities = [float(np.dot(vec, c) / (np.linalg.norm(vec) * np.linalg.norm(c) + 1e-10)) for c in centroids]
+        best_sim = max(similarities) if similarities else 0
+        best_cluster = similarities.index(best_sim) if similarities else 0
+
+        if best_sim > 0.65 and best_cluster != current_speaker:
+            # Returning to a previous speaker
+            labels[i] = best_cluster
+        elif best_sim < 0.55:
+            # New speaker
+            current_speaker = len(centroids)
+            labels[i] = current_speaker
+            centroids.append(vec)
+        else:
+            # Stay with current speaker (false change detection)
+            labels[i] = current_speaker
+            centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
+    else:
+        # No speaker change: same speaker as previous
+        labels[i] = current_speaker
+        centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
+
+n_speakers = len(set(labels))
+print(f"Identified {n_speakers} unique speakers")
+
+# Step 5: Update DB chunks with speaker assignment
+print("Updating DB chunks...")
+# Map: chunk_id -> speaker_id
+speaker_map = {}
+for cid, label in zip(chunk_ids, labels):
+    speaker_map[cid] = f"SPEAKER_{label}"
+
+updated = 0
+for cid, spk_id in speaker_map.items():
+    cur.execute("""
+        UPDATE dev.chunk SET metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
+        WHERE file_uuid = %s AND chunk_id = %s AND chunk_type = 'sentence'
+    """, (json.dumps({"speaker_id": spk_id}), UUID, cid))
+    updated += 1
+
+conn.commit()
+print(f"Updated {updated} chunks with speaker IDs")
+
+# Step 6: Save speaker map
+speaker_map_path = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map.json"
+with open(speaker_map_path, "w") as f:
+    json.dump({"speakers": n_speakers, "assignments": speaker_map}, f, indent=2)
+print(f"Speaker map saved: {speaker_map_path}")
+
+cur.close()
+conn.close()
+print("=== Done ===")