momentry_core/scripts/update_fine_speakers.py

#!/opt/homebrew/bin/python3.11
"""
Update DB sentence chunks with fine-grained ASRX speaker assignments.
Each ASR segment gets the majority speaker_name from overlapping fine segments.
"""
import json, psycopg2
from collections import Counter

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"


def coerce_metadata(meta):
    """Return chunk metadata as a mutable dict.

    A dict is returned unchanged (same object). A string is parsed as JSON.
    Anything that does not end up as a dict (None, malformed JSON, a JSON
    value that is not an object) is replaced by an empty dict, so the caller
    can always assign keys on the result.
    """
    if isinstance(meta, dict):
        return meta
    if isinstance(meta, str):
        try:
            parsed = json.loads(meta)
        except json.JSONDecodeError:
            # Best effort: unreadable metadata is rebuilt from scratch.
            return {}
        if isinstance(parsed, dict):
            return parsed
    return {}


def overlapping_segments(fine_segs, start, end):
    """Return the fine segments whose time span intersects (start, end).

    Segments that merely touch the interval at an endpoint do not count.
    """
    return [
        seg for seg in fine_segs
        if seg["start_time"] < end and seg["end_time"] > start
    ]


def majority_speaker(segments):
    """Pick the majority speaker among a non-empty list of fine segments.

    Returns a tuple ``(speaker_name, speaker_id, name_counts)`` where
    ``name_counts`` maps every speaker_name seen to its segment count.

    The name is chosen by segment count (ties go to the name encountered
    first). The id is then chosen only among segments carrying that name,
    so the returned name and id always belong to the same speaker; voting
    on ids independently could pair the winning name with another
    speaker's id.
    """
    name_counts = Counter(seg["speaker_name"] for seg in segments)
    best_name = name_counts.most_common(1)[0][0]
    id_counts = Counter(
        seg["speaker_id"] for seg in segments
        if seg["speaker_name"] == best_name
    )
    best_id = id_counts.most_common(1)[0][0]
    return best_name, best_id, dict(name_counts)


def apply_fine_speaker(meta, overlapping):
    """Write the fine speaker assignment into ``meta`` and return it.

    With overlapping fine segments, the majority speaker overwrites
    ``speaker_name``/``speaker_id`` and is mirrored into the ``fine_*`` keys,
    with the per-name vote counts stored under ``fine_details``. Without any
    overlap, the ``fine_*`` keys fall back to the existing speaker values
    (or "Unknown" when those are missing).
    """
    if overlapping:
        best_name, best_id, details = majority_speaker(overlapping)
        meta["speaker_name"] = best_name
        meta["speaker_id"] = best_id
        meta["fine_speaker_name"] = best_name
        meta["fine_speaker_id"] = best_id
        meta["fine_details"] = details
    else:
        meta["fine_speaker_name"] = meta.get("speaker_name", "Unknown")
        meta["fine_speaker_id"] = meta.get("speaker_id", "Unknown")
    return meta


def main():
    """Load the fine ASRX file and update every sentence chunk of UUID."""
    print("=== Step 1: Load fine ASRX ===")
    # Context manager so the file handle is closed even if parsing fails.
    with open(f"{BASE}/{UUID}.asrx_fine.json", encoding="utf-8") as fh:
        fine = json.load(fh)
    fine_segs = fine["segments"]
    print(f"Fine segments: {len(fine_segs)}")

    print("\n=== Step 2: Load existing sentence chunks ===")
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT id, chunk_index, start_time, end_time, metadata
            FROM dev.chunks
            WHERE file_uuid=%s AND chunk_type='sentence'
            ORDER BY chunk_index
        """, (UUID,))
        chunks = cur.fetchall()
        print(f"DB sentence chunks: {len(chunks)}")

        print("\n=== Step 3: Update speaker assignments ===")
        updated = 0
        for db_id, _idx, st, et, raw_meta in chunks:
            meta = apply_fine_speaker(
                coerce_metadata(raw_meta),
                overlapping_segments(fine_segs, st, et),
            )
            cur.execute("""
                UPDATE dev.chunks SET metadata=%s::jsonb, updated_at=NOW()
                WHERE id=%s
            """, (json.dumps(meta), db_id))
            updated += 1

        # Single commit: either every chunk is updated or none is.
        conn.commit()
        print(f"Updated {updated} chunks")

        # Verify distribution
        cur.execute("""
            SELECT metadata->>'fine_speaker_name', COUNT(*)
            FROM dev.chunks
            WHERE file_uuid=%s AND chunk_type='sentence'
            GROUP BY 1 ORDER BY 2 DESC
        """, (UUID,))
        print("\nNew speaker distribution:")
        for name, cnt in cur.fetchall():
            print(f"  {name}: {cnt}")
    finally:
        # Closing without a commit discards any partial updates on failure.
        conn.close()
    print("\n=== Done ===")


if __name__ == "__main__":
    main()