feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks -> dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions
--- a/scripts/map_speakers_v2.py
+++ b/scripts/map_speakers_v2.py
@@ -0,0 +1,137 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Build new ASRX speaker_id → character name mapping using:
+1. Old DB sentence chunk metadata (speaker_name from face-to-TMDb match)
+2. New ASRX segments (1:1 aligned with ASR, each with speaker_id + voice embedding)
+"""
+
+import json, sys, psycopg2
+from collections import Counter, defaultdict
+import numpy as np
+from urllib.request import Request, urlopen
+
+# File identifier: used as the DB lookup key (file_uuid) and as the stem of
+# the input/output JSON file names.
+UUID = "aeed71342a899fe4b4c57b7d41bcb692"
+ASRX_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.asrx.json"
+# Not referenced by the rest of this script.
+QDRANT_URL = "http://localhost:6333"
+
+DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
+
+# Character name normalization: each generic speaker label listed here is
+# collapsed to "Unknown" ("Speaker_0" plus "SPEAKER_0" .. "SPEAKER_9").
+NAME_MAP = {
+    "Speaker_0": "Unknown",
+    **{f"SPEAKER_{n}": "Unknown" for n in range(10)},
+}
+
+print("=== Step 1: Load DB sentence chunks ===")
+# NOTE(review): the commit message accompanying this script describes a
+# dev.chunks -> dev.chunk rename and the removal of chunk_index, while this
+# query still uses both; confirm it is meant to run against the old schema.
+conn = psycopg2.connect(DB_URL)
+try:
+    # The cursor context manager closes the cursor even when the query fails.
+    with conn.cursor() as cur:
+        cur.execute("""
+    SELECT chunk_index, metadata->>'speaker_id' as old_sid,
+           metadata->>'speaker_name' as old_name
+    FROM dev.chunks
+    WHERE file_uuid = %s AND chunk_type = 'sentence'
+    ORDER BY chunk_index
+""", (UUID,))
+        rows = cur.fetchall()
+finally:
+    # Release the connection on both the success and the error path.
+    conn.close()
+print(f"Loaded {len(rows)} sentence chunks from DB")
+
+# Build a dict keyed by chunk_index holding the speaker id/name read from
+# each chunk's metadata (either value is None when the JSON key is absent).
+db_by_idx = {
+    chunk_index: {"old_sid": old_sid, "old_name": old_name}
+    for chunk_index, old_sid, old_name in rows
+}
+
+print("=== Step 2: Load new ASRX ===")
+# Read through a context manager so the file handle is closed as soon as the
+# JSON is parsed; the encoding is pinned because JSON text is UTF-8.
+with open(ASRX_PATH, encoding="utf-8") as asrx_file:
+    asrx = json.load(asrx_file)
+segs = asrx["segments"]
+# "embeddings" is optional in the file; this script only reports its length.
+embeddings = asrx.get("embeddings", [])
+print(f"Loaded {len(segs)} ASRX segments, {len(embeddings)} embeddings")
+
+# Build mapping: new_speaker_id --> old_name distribution
+# Segment i is paired with the DB chunk whose chunk_index == i (assumes both
+# sequences are 0-based and in the same order -- TODO confirm).
+new_to_old = defaultdict(list)
+old_name_counter = defaultdict(Counter)
+unmapped = 0
+total = len(segs)
+
+for i, seg in enumerate(segs):
+    new_sid = seg["speaker_id"]
+    chunk = db_by_idx.get(i)
+
+    if chunk is None:
+        # No DB chunk at this index: record the segment as "Unknown" but keep
+        # it out of the per-speaker vote counter.
+        unmapped += 1
+        new_to_old[new_sid].append("Unknown")
+        continue
+
+    old_name = chunk.get("old_name")
+    if not old_name:
+        # A missing speaker_name arrives as None (SQL NULL) or ""; without
+        # this branch it would be counted under a None/"" key and could be
+        # selected as the speaker's name.
+        old_name = "Unknown"
+    elif old_name in NAME_MAP:
+        old_name = NAME_MAP[old_name]
+    elif old_name.startswith(("Speaker_", "SPEAKER_")):
+        # Generic speaker labels not listed in NAME_MAP.
+        old_name = "Unknown"
+
+    new_to_old[new_sid].append(old_name)
+    old_name_counter[new_sid][old_name] += 1
+
+print(f"\nMapped {total - unmapped} segments, {unmapped} unmapped")
+print(f"\nMapping {len(new_to_old)} new speaker IDs:")
+
+# Determine best character name for each new speaker
+speaker_identity = {}
+for sid in sorted(new_to_old.keys()):
+    # A speaker that only appeared in unmapped segments has no votes at all.
+    # .get() avoids inserting an empty Counter into the defaultdict for it.
+    counter = old_name_counter.get(sid, Counter())
+    total_for_speaker = sum(counter.values())
+    top = counter.most_common(1)
+    if top:
+        best_name, best_count = top[0]
+        pct = best_count / total_for_speaker * 100
+    else:
+        # No votes: fall back instead of failing on [0] of an empty list
+        # (and on the division by zero that would follow).
+        best_name, pct = "Unknown", 0.0
+
+    speaker_identity[sid] = {
+        "name": best_name,
+        "confidence": round(pct, 1),
+        "count": total_for_speaker,
+        "distribution": dict(counter.most_common(5))
+    }
+    print(f"  {sid}: {best_name} ({pct:.0f}%, {total_for_speaker} segs)")
+    # List the runner-up names among the five most frequent.
+    for nm, cnt in counter.most_common(5):
+        if nm != best_name:
+            print(f"         {nm}: {cnt}")
+
+print("\n=== Step 3: Assign names to all new ASRX segments ===")
+# One record per segment, labelled with the name chosen for its speaker.
+assignments = [
+    {
+        "index": idx,
+        "speaker_id": segment["speaker_id"],
+        "speaker_name": speaker_identity[segment["speaker_id"]]["name"],
+        "start_time": segment["start_time"],
+        "end_time": segment["end_time"],
+    }
+    for idx, segment in enumerate(segs)
+]
+
+# Save mapping
+map_path = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map_v2.json"
+output = dict(
+    uuid=UUID,
+    total_segments=len(segs),
+    speaker_identity=speaker_identity,
+    assignments=assignments,
+)
+with open(map_path, "w") as out_file:
+    json.dump(output, out_file, indent=2)
+print(f"\nSaved speaker mapping to output_dev/{UUID}.speaker_map_v2.json")
+
+# Recap: one line per speaker id, in sorted order.
+print("\n=== Summary ===")
+for sid in sorted(speaker_identity):
+    info = speaker_identity[sid]
+    print(f"  {sid} ({info['count']} segs, {info['confidence']}% confidence): {info['name']}")