momentry_core/scripts/match_speakers_to_chunks.py

#!/opt/homebrew/bin/python3.11
"""
Match Speaker IDs from ASRX to Child Chunks
"""

import json
import psycopg2

# Identifier used both to build the ASRX file path and to select the rows of
# child_chunks to update (WHERE uuid = ...).
UUID = "384b0ff44aaaa1f1"
# Location of the ASRX JSON for UUID, relative to the current working directory.
ASRX_PATH = f"output/{UUID}/{UUID}.asrx.json"
# Connection string handed to psycopg2.connect().
DB_URL = "postgresql://accusys@localhost:5432/momentry"


def match_speakers():
    """Append ASRX speaker IDs to the child chunks they overlap in time.

    Loads the JSON file at ``ASRX_PATH``, which may be either a bare list of
    segments or an object holding them under ``"segments"``. For every segment
    with a truthy ``"speaker_id"``, the ID is appended to
    ``child_chunks.speaker_ids`` on each row belonging to ``UUID`` whose time
    range strictly overlaps the segment (``start_time < seg.end`` and
    ``end_time > seg.start``) and that does not already list that speaker.

    All updates run in a single transaction: it is committed once every
    segment has been processed and rolled back if anything fails. The
    connection is closed in either case.

    Raises:
        OSError: If the ASRX file cannot be opened.
        json.JSONDecodeError: If the ASRX file is not valid JSON.
        KeyError: If a segment lacks ``"start"`` or ``"end"``.
        psycopg2.Error: If connecting or executing an update fails.
    """
    print(f"🚀 Matching Speakers for {UUID}...")
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(ASRX_PATH, encoding="utf-8") as f:
        asrx = json.load(f)

    segments = asrx if isinstance(asrx, list) else asrx.get("segments", [])
    print(f"📂 Loaded {len(segments)} ASRX segments.")

    count = 0
    conn = psycopg2.connect(DB_URL)
    try:
        # `with conn` commits on success and rolls back on an exception;
        # `with conn.cursor()` closes the cursor. Neither closes the
        # connection itself, hence the surrounding try/finally.
        with conn, conn.cursor() as cur:
            for seg in segments:
                start = seg["start"]
                end = seg["end"]
                speaker = seg.get("speaker_id")
                if not speaker:
                    continue

                # COALESCE covers rows whose speaker_ids is NULL: without it
                # NOT (NULL @> ...) evaluates to NULL and the row is never
                # matched, so it could never receive its first speaker.
                cur.execute(
                    """
                    UPDATE child_chunks
                    SET speaker_ids = array_append(speaker_ids, %s)
                    WHERE uuid = %s
                    AND start_time < %s
                    AND end_time > %s
                    AND NOT (COALESCE(speaker_ids, '{}'::text[]) @> ARRAY[%s]::text[])
                """,
                    (speaker, UUID, end, start, speaker),
                )

                # rowcount can be -1 when the driver has no count to report.
                if cur.rowcount > 0:
                    count += cur.rowcount

        print(f"✅ Updated {count} child chunks with Speaker IDs.")
    finally:
        conn.close()


# Run the matching only when this file is executed as a script, not on import.
if __name__ == "__main__":
    match_speakers()