feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks -> dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
This commit is contained in:
137
scripts/map_speakers_v2.py
Normal file
137
scripts/map_speakers_v2.py
Normal file
@@ -0,0 +1,137 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Build new ASRX speaker_id → character name mapping using:
|
||||
1. Old DB sentence chunk metadata (speaker_name from face-to-TMDb match)
|
||||
2. New ASRX segments (1:1 aligned with ASR, each with speaker_id + voice embedding)
|
||||
"""
|
||||
|
||||
import json, sys, psycopg2
|
||||
from collections import Counter, defaultdict
|
||||
import numpy as np
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
# Identifier of the media file whose speakers are being remapped.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# ASRX output for that file.
ASRX_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.asrx.json"
# NOTE(review): not referenced elsewhere in the visible part of this script.
QDRANT_URL = "http://localhost:6333"

# Connection URI for the database queried below.
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"

# Character name normalization: placeholder diarization labels that carry no
# real identity all collapse to "Unknown".
NAME_MAP = dict.fromkeys(
    ("Speaker_0", *(f"SPEAKER_{n}" for n in range(10))),
    "Unknown",
)
|
||||
|
||||
print("=== Step 1: Load DB sentence chunks ===")
# NOTE(review): this reads the dev.chunks table and its chunk_index column;
# confirm both still exist in the database this script is pointed at.
conn = psycopg2.connect(DB_URL)
try:
    # The cursor context manager closes the cursor even if the query fails,
    # and the finally clause guarantees the connection is released too.
    with conn.cursor() as cur:
        cur.execute("""
            SELECT chunk_index, metadata->>'speaker_id' as old_sid,
                   metadata->>'speaker_name' as old_name
            FROM dev.chunks
            WHERE file_uuid = %s AND chunk_type = 'sentence'
            ORDER BY chunk_index
        """, (UUID,))
        rows = cur.fetchall()
finally:
    conn.close()
print(f"Loaded {len(rows)} sentence chunks from DB")

# Index the rows by chunk_index so ASRX segment i can be matched to chunk i.
# old_sid / old_name are NULL (None) when the metadata key is absent.
db_by_idx = {
    chunk_index: {"old_sid": old_sid, "old_name": old_name}
    for chunk_index, old_sid, old_name in rows
}
|
||||
|
||||
print("=== Step 2: Load new ASRX ===")
# Use a context manager so the file handle is closed deterministically.
with open(ASRX_PATH, encoding="utf-8") as asrx_file:
    asrx = json.load(asrx_file)
segs = asrx["segments"]
embeddings = asrx.get("embeddings", [])
print(f"Loaded {len(segs)} ASRX segments, {len(embeddings)} embeddings")

# Build mapping: new_speaker_id --> old_name distribution
new_to_old = defaultdict(list)          # every segment's resolved name, per speaker
old_name_counter = defaultdict(Counter)  # name tallies for DB-matched segments only
unmapped = 0
total = 0

for i, seg in enumerate(segs):
    new_sid = seg["speaker_id"]
    total += 1

    # Segment i is paired with the DB chunk whose chunk_index is i.
    entry = db_by_idx.get(i)
    if entry is None:
        unmapped += 1
        new_to_old[new_sid].append("Unknown")
        continue

    # Normalize the old name. A NULL/empty speaker_name carries no identity,
    # so it is treated as "Unknown" instead of being tallied as None/"".
    old_name = entry.get("old_name")
    if not old_name:
        old_name = "Unknown"
    elif old_name in NAME_MAP:
        old_name = NAME_MAP[old_name]
    elif old_name.startswith(("Speaker_", "SPEAKER_")):
        # Placeholder labels not listed in NAME_MAP (e.g. "SPEAKER_12").
        old_name = "Unknown"

    new_to_old[new_sid].append(old_name)
    old_name_counter[new_sid][old_name] += 1

print(f"\nMapped {total - unmapped} segments, {unmapped} unmapped")
print(f"\nMapping {len(new_to_old)} new speaker IDs:")
|
||||
|
||||
# Determine best character name for each new speaker.
# For every new speaker id, pick the most frequent old name among its
# DB-matched segments and record how dominant that name is.
speaker_identity = {}
for sid in sorted(new_to_old):
    counter = old_name_counter[sid]
    total_for_speaker = sum(counter.values())
    top_names = counter.most_common(5)

    if top_names:
        best_name, best_count = top_names[0]
        pct = best_count / total_for_speaker * 100
    else:
        # Every segment of this speaker was unmapped, so there is no tally to
        # rank; fall back instead of failing on an empty Counter.
        best_name, pct = "Unknown", 0.0

    speaker_identity[sid] = {
        "name": best_name,
        "confidence": round(pct, 1),
        "count": total_for_speaker,
        "distribution": dict(top_names),
    }
    print(f" {sid}: {best_name} ({pct:.0f}%, {total_for_speaker} segs)")
    # List the runner-up names so ambiguous speakers are easy to spot.
    for nm, cnt in top_names:
        if nm != best_name:
            print(f" {nm}: {cnt}")
|
||||
|
||||
print("\n=== Step 3: Assign names to all new ASRX segments ===")
# One record per ASRX segment, labelled with the name chosen for its speaker.
assignments = [
    {
        "index": position,
        "speaker_id": segment["speaker_id"],
        "speaker_name": speaker_identity[segment["speaker_id"]]["name"],
        "start_time": segment["start_time"],
        "end_time": segment["end_time"],
    }
    for position, segment in enumerate(segs)
]
|
||||
|
||||
# Persist the per-speaker identities and per-segment assignments as JSON.
output = {
    "uuid": UUID,
    "total_segments": len(segs),
    "speaker_identity": speaker_identity,
    "assignments": assignments,
}
speaker_map_path = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map_v2.json"
with open(speaker_map_path, "w") as f:
    json.dump(output, f, indent=2)
print(f"\nSaved speaker mapping to output_dev/{UUID}.speaker_map_v2.json")
|
||||
|
||||
print("\n=== Summary ===")
# One line per speaker id, in sorted id order.
for sid in sorted(speaker_identity):
    info = speaker_identity[sid]
    print(f" {sid} ({info['count']} segs, {info['confidence']}% confidence): {info['name']}")
|
||||
Reference in New Issue
Block a user