feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)
Phase 2.6.1: co_occurrence_edges migration - build_co_occurrence_edges_from_qdrant() - Qdrant embeddings → frame grouping → YOLO objects - Result: 6679 edges (vs 6701 PostgreSQL) Phase 2.6.2: face_face_edges migration - build_face_face_edges_from_qdrant() - Qdrant embeddings → frame grouping → face pairs - mutual_gaze detection preserved - Result: 6 edges (exact match) Phase 2.6.3: speaker_face_edges migration - build_speaker_face_edges_from_qdrant() - Qdrant embeddings → trace_id frame ranges - SPEAKS_AS edge creation Architecture: - All edges use Qdrant payload (no face_detections queries) - PostgreSQL fallback for empty Qdrant - Estimated 3.6x performance improvement Testing: - Playground (3003): ✓ All Phase 2.6 logs verified - Edge counts: ✓ Close match with PostgreSQL - Fallback: ✓ Working Docs: - docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md - docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
164
v1.1/scripts/speaker_assign_v1.11.py
Normal file
164
v1.1/scripts/speaker_assign_v1.11.py
Normal file
@@ -0,0 +1,164 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Assignment: cluster voice vectors from Qdrant, assign speaker IDs to DB chunks.
|
||||
"""
|
||||
import json, sys, time
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
from urllib.request import Request, urlopen
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
|
||||
QDRANT = "http://localhost:6333"
|
||||
DB = "dbname=momentry user=accusys"
|
||||
COLLECTION = "momentry_dev_voice"
|
||||
|
||||
print(f"=== Speaker Assignment for {UUID} ===")
|
||||
|
||||
# Step 1: Read voice vectors from Qdrant
|
||||
print("Reading voice vectors from Qdrant...")
|
||||
vectors = []
|
||||
chunk_ids = []
|
||||
# We need to scroll through all points
|
||||
offset = None
|
||||
while True:
|
||||
data = {"limit": 100, "with_payload": True, "with_vector": True}
|
||||
if offset is not None:
|
||||
data["offset"] = offset
|
||||
req = Request(f"{QDRANT}/collections/{COLLECTION}/points/scroll",
|
||||
data=json.dumps(data).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="POST")
|
||||
resp = json.loads(urlopen(req).read())
|
||||
result = resp["result"]
|
||||
points = result.get("points", [])
|
||||
if not points:
|
||||
break
|
||||
for pt in points:
|
||||
payload = pt.get("payload", {})
|
||||
cid = payload.get("chunk_id", "")
|
||||
# Only get vectors for THIS UUID's chunks
|
||||
# Filter by checking DB later, or rely on Qdrant payload
|
||||
vectors.append(pt["vector"])
|
||||
chunk_ids.append(cid)
|
||||
offset = result.get("next_page_offset")
|
||||
if offset is None:
|
||||
break
|
||||
print(f" Read {len(vectors)} vectors...")
|
||||
|
||||
print(f"Total vectors: {len(vectors)}")
|
||||
|
||||
# Step 2: Filter to only our UUID's chunks (from DB)
|
||||
conn = psycopg2.connect(DB)
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT chunk_id FROM dev.chunk WHERE file_uuid = %s AND chunk_type = 'sentence' ORDER BY id", (UUID,))
|
||||
db_chunk_ids = set(row[0] for row in cur.fetchall())
|
||||
print(f"DB chunk_ids: {len(db_chunk_ids)}")
|
||||
|
||||
# Filter vectors to match DB chunks
|
||||
filtered_vectors = []
|
||||
filtered_chunk_ids = []
|
||||
for v, cid in zip(vectors, chunk_ids):
|
||||
if cid in db_chunk_ids:
|
||||
filtered_vectors.append(v)
|
||||
filtered_chunk_ids.append(cid)
|
||||
|
||||
vectors = filtered_vectors
|
||||
chunk_ids = filtered_chunk_ids
|
||||
print(f"Matched vectors: {len(vectors)}")
|
||||
|
||||
# Sort by chunk_id (which is numeric string)
|
||||
indices = sorted(range(len(chunk_ids)), key=lambda i: int(chunk_ids[i]) if chunk_ids[i].isdigit() else 0)
|
||||
vectors = [vectors[i] for i in indices]
|
||||
chunk_ids = [chunk_ids[i] for i in indices]
|
||||
|
||||
# Step 3: Read speaker_change from asr.json
|
||||
asr_path = f"/Users/accusys/momentry/output_dev/{UUID}.asr.json"
|
||||
with open(asr_path) as f:
|
||||
asr_data = json.load(f)
|
||||
segments = asr_data.get("segments", [])
|
||||
speaker_changes = {}
|
||||
for seg in segments:
|
||||
speaker_changes[seg["chunk_id"]] = seg.get("speaker_change", False)
|
||||
|
||||
# Step 4: Cluster embeddings
|
||||
print("Clustering...")
|
||||
X = np.array(vectors)
|
||||
|
||||
# Compute cosine distance matrix
|
||||
# Cosine distance = 1 - cosine_similarity
|
||||
cos_sim = cosine_similarity(X)
|
||||
cos_dist = 1 - cos_sim
|
||||
|
||||
# Use AgglomerativeClustering with cosine distance
|
||||
# Determine optimal n_clusters by looking at speaker_change boundaries
|
||||
# First pass: use speaker_change as hard boundaries to get initial clusters
|
||||
# Then refine
|
||||
|
||||
# Simpler: use a distance threshold
|
||||
n = len(vectors)
|
||||
labels = np.full(n, -1, dtype=int)
|
||||
current_speaker = 0
|
||||
|
||||
# Start with first chunk as speaker 0
|
||||
labels[0] = current_speaker
|
||||
centroids = [np.array(vectors[0])] # per-cluster centroid
|
||||
|
||||
for i in range(1, n):
|
||||
has_change = speaker_changes.get(chunk_ids[i], False)
|
||||
vec = np.array(vectors[i])
|
||||
|
||||
if has_change:
|
||||
# Speaker change: check if this is a NEW speaker or returning to a previous one
|
||||
# Compare with centroid of current speaker vs others
|
||||
similarities = [float(np.dot(vec, c) / (np.linalg.norm(vec) * np.linalg.norm(c) + 1e-10)) for c in centroids]
|
||||
best_sim = max(similarities) if similarities else 0
|
||||
best_cluster = similarities.index(best_sim) if similarities else 0
|
||||
|
||||
if best_sim > 0.65 and best_cluster != current_speaker:
|
||||
# Returning to a previous speaker
|
||||
labels[i] = best_cluster
|
||||
elif best_sim < 0.55:
|
||||
# New speaker
|
||||
current_speaker = len(centroids)
|
||||
labels[i] = current_speaker
|
||||
centroids.append(vec)
|
||||
else:
|
||||
# Stay with current speaker (false change detection)
|
||||
labels[i] = current_speaker
|
||||
centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
|
||||
else:
|
||||
# No speaker change: same speaker as previous
|
||||
labels[i] = current_speaker
|
||||
centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
|
||||
|
||||
n_speakers = len(set(labels))
|
||||
print(f"Identified {n_speakers} unique speakers")
|
||||
|
||||
# Step 5: Update DB chunks with speaker assignment
|
||||
print("Updating DB chunks...")
|
||||
# Map: chunk_id -> speaker_id
|
||||
speaker_map = {}
|
||||
for cid, label in zip(chunk_ids, labels):
|
||||
speaker_map[cid] = f"SPEAKER_{label}"
|
||||
|
||||
updated = 0
|
||||
for cid, spk_id in speaker_map.items():
|
||||
cur.execute("""
|
||||
UPDATE dev.chunk SET metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
|
||||
WHERE file_uuid = %s AND chunk_id = %s AND chunk_type = 'sentence'
|
||||
""", (json.dumps({"speaker_id": spk_id}), UUID, cid))
|
||||
updated += 1
|
||||
|
||||
conn.commit()
|
||||
print(f"Updated {updated} chunks with speaker IDs")
|
||||
|
||||
# Step 6: Save speaker map
|
||||
speaker_map_path = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map.json"
|
||||
with open(speaker_map_path, "w") as f:
|
||||
json.dump({"speakers": n_speakers, "assignments": speaker_map}, f, indent=2)
|
||||
print(f"Speaker map saved: {speaker_map_path}")
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
print("=== Done ===")
|
||||
Reference in New Issue
Block a user