#!/opt/homebrew/bin/python3.11
"""Build a new ASRX speaker_id → character name mapping.

Inputs:
1. Old DB sentence chunk metadata (speaker_name from face-to-TMDb match)
2. New ASRX segments (1:1 aligned with ASR, each with speaker_id + voice embedding)

Every new speaker_id is given the (normalized) old speaker name that occurs
most often among the DB sentence chunks whose chunk_index equals the index of
one of its segments.  The per-speaker identities and the per-segment
assignments are written to ``<UUID>.speaker_map_v2.json`` in the same
directory as the ASRX file.
"""
import json
import sys
from collections import Counter, defaultdict
from urllib.request import Request, urlopen

import numpy as np
import psycopg2

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
ASRX_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.asrx.json"
QDRANT_URL = "http://localhost:6333"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"

# Character name normalization: placeholder diarization labels carry no
# identity, so they all collapse to "Unknown".
NAME_MAP = {
    "Speaker_0": "Unknown",
    "SPEAKER_0": "Unknown",
    "SPEAKER_1": "Unknown",
    "SPEAKER_2": "Unknown",
    "SPEAKER_3": "Unknown",
    "SPEAKER_4": "Unknown",
    "SPEAKER_5": "Unknown",
    "SPEAKER_6": "Unknown",
    "SPEAKER_7": "Unknown",
    "SPEAKER_8": "Unknown",
    "SPEAKER_9": "Unknown",
}


def normalize_name(raw_name: str | None) -> str:
    """Return the canonical character name for an old ``speaker_name`` value.

    Missing values (``None`` or ``""``), names listed in ``NAME_MAP`` and any
    other ``Speaker_*`` / ``SPEAKER_*`` placeholder become ``"Unknown"``;
    every other name is returned unchanged.
    """
    if not raw_name:
        # A NULL/empty speaker_name must not become its own vote candidate.
        return "Unknown"
    if raw_name in NAME_MAP:
        return NAME_MAP[raw_name]
    if raw_name.startswith(("Speaker_", "SPEAKER_")):
        return "Unknown"
    return raw_name


print("=== Step 1: Load DB sentence chunks ===")
conn = psycopg2.connect(DB_URL)
try:
    # Release the cursor and the connection even when the query fails.
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT chunk_index,
                   metadata->>'speaker_id' as old_sid,
                   metadata->>'speaker_name' as old_name
            FROM dev.chunks
            WHERE file_uuid = %s AND chunk_type = 'sentence'
            ORDER BY chunk_index
            """,
            (UUID,),
        )
        rows = cur.fetchall()
finally:
    conn.close()
print(f"Loaded {len(rows)} sentence chunks from DB")

# Build a dict keyed by chunk_index for O(1) lookup per segment.
db_by_idx = {
    chunk_index: {"old_sid": old_sid, "old_name": old_name}
    for chunk_index, old_sid, old_name in rows
}

print("=== Step 2: Load new ASRX ===")
with open(ASRX_PATH, encoding="utf-8") as f:
    asrx = json.load(f)
segs = asrx["segments"]
embeddings = asrx.get("embeddings", [])
print(f"Loaded {len(segs)} ASRX segments, {len(embeddings)} embeddings")

# Build mapping: new_speaker_id --> old_name distribution.
# Assumes segment position i corresponds to DB chunk_index i (the 1:1
# alignment described in the module docstring).
new_to_old = defaultdict(list)
old_name_counter = defaultdict(Counter)
unmapped = 0
total = len(segs)
for i, seg in enumerate(segs):
    new_sid = seg["speaker_id"]
    chunk = db_by_idx.get(i)
    if chunk is None:
        # No DB chunk at this index: the speaker is still registered, but the
        # segment does not take part in the vote.
        unmapped += 1
        new_to_old[new_sid].append("Unknown")
        continue
    old_name = normalize_name(chunk["old_name"])
    new_to_old[new_sid].append(old_name)
    old_name_counter[new_sid][old_name] += 1

print(f"\nMapped {total - unmapped} segments, {unmapped} unmapped")
print(f"\nMapping {len(new_to_old)} new speaker IDs:")

# Determine best character name for each new speaker (majority vote).
speaker_identity = {}
for sid in sorted(new_to_old):
    counter = old_name_counter[sid]
    total_for_speaker = sum(counter.values())
    if counter:
        best_name, best_count = counter.most_common(1)[0]
        pct = best_count / total_for_speaker * 100
    else:
        # Every segment of this speaker was unmapped, so there are no votes;
        # avoid indexing an empty most_common() result / dividing by zero.
        best_name, pct = "Unknown", 0.0
    speaker_identity[sid] = {
        "name": best_name,
        "confidence": round(pct, 1),
        "count": total_for_speaker,
        "distribution": dict(counter.most_common(5)),
    }
    print(f" {sid}: {best_name} ({pct:.0f}%, {total_for_speaker} segs)")
    for nm, cnt in counter.most_common(5):
        if nm != best_name:
            print(f" {nm}: {cnt}")

print("\n=== Step 3: Assign names to all new ASRX segments ===")
assignments = [
    {
        "index": i,
        "speaker_id": seg["speaker_id"],
        "speaker_name": speaker_identity[seg["speaker_id"]]["name"],
        "start_time": seg["start_time"],
        "end_time": seg["end_time"],
    }
    for i, seg in enumerate(segs)
]

# Save mapping
output = {
    "uuid": UUID,
    "total_segments": len(segs),
    "speaker_identity": speaker_identity,
    "assignments": assignments,
}
with open(
    f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map_v2.json",
    "w",
    encoding="utf-8",
) as f:
    json.dump(output, f, indent=2)
print(f"\nSaved speaker mapping to output_dev/{UUID}.speaker_map_v2.json")

print("\n=== Summary ===")
for sid, info in sorted(speaker_identity.items()):
    print(f" {sid} ({info['count']} segs, {info['confidence']}% confidence): {info['name']}")