#!/opt/homebrew/bin/python3.11
"""
Update DB sentence chunks with fine-grained ASRX speaker assignments.

Each ASR segment (a ``dev.chunks`` row with ``chunk_type='sentence'``) gets
the majority speaker_name from the fine ASRX segments that overlap its
``[start_time, end_time)`` window. The matching speaker_id is chosen among
the segments that carry the winning name, so the stored name/id pair always
refers to the same speaker.
"""
import json
from collections import Counter

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"


def _coerce_metadata(meta):
    """Return chunk metadata as a fresh dict.

    Accepts a dict (copied, so the caller's object is not mutated), a JSON
    string, or None. Unparseable strings and JSON values that are not
    objects (lists, numbers, ...) yield an empty dict, because the update
    below needs key/value storage.
    """
    if isinstance(meta, dict):
        return dict(meta)
    if isinstance(meta, str):
        try:
            parsed = json.loads(meta)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return {}
        return dict(parsed) if isinstance(parsed, dict) else {}
    return {}


def _pick_speaker(overlapping):
    """Majority-vote a speaker over a non-empty list of fine segments.

    Returns ``(best_name, best_id, name_counts)``. The name is the most
    frequent ``speaker_name`` (ties go to the first one encountered); the
    id is the most frequent ``speaker_id`` among the segments bearing that
    name, which keeps the pair consistent.
    """
    names = Counter(seg["speaker_name"] for seg in overlapping)
    best_name = names.most_common(1)[0][0]
    ids = Counter(
        seg["speaker_id"]
        for seg in overlapping
        if seg["speaker_name"] == best_name
    )
    best_id = ids.most_common(1)[0][0]
    return best_name, best_id, dict(names)


def assign_speaker(meta, fine_segs, start, end):
    """Compute the updated metadata dict for one sentence chunk.

    Args:
        meta: Existing chunk metadata (dict, JSON string, or None).
        fine_segs: Fine ASRX segments; each must provide ``start_time``,
            ``end_time``, ``speaker_name`` and ``speaker_id``.
        start: Chunk start time.
        end: Chunk end time.

    Returns:
        A new dict. When at least one fine segment overlaps the chunk,
        ``speaker_name``/``speaker_id`` and their ``fine_*`` twins are set
        to the voted speaker and ``fine_details`` holds the per-name
        segment counts. Otherwise only ``fine_speaker_name`` and
        ``fine_speaker_id`` are set, copied from the existing speaker
        fields (or "Unknown").
    """
    result = _coerce_metadata(meta)

    # Strict inequalities: segments that merely touch the chunk boundary
    # do not count as overlapping.
    overlapping = [
        seg
        for seg in fine_segs
        if seg["start_time"] < end and seg["end_time"] > start
    ]

    if overlapping:
        best_name, best_id, name_counts = _pick_speaker(overlapping)
        result["speaker_name"] = best_name
        result["speaker_id"] = best_id
        result["fine_speaker_name"] = best_name
        result["fine_speaker_id"] = best_id
        result["fine_details"] = name_counts
    else:
        result["fine_speaker_name"] = result.get("speaker_name", "Unknown")
        result["fine_speaker_id"] = result.get("speaker_id", "Unknown")
    return result


def main():
    """Load the fine ASRX file, rewrite chunk metadata, print a summary."""
    # Imported here so the pure helpers above stay importable (and
    # testable) on machines without a PostgreSQL driver installed.
    import psycopg2

    print("=== Step 1: Load fine ASRX ===")
    with open(f"{BASE}/{UUID}.asrx_fine.json", encoding="utf-8") as fh:
        fine = json.load(fh)
    fine_segs = fine["segments"]
    print(f"Fine segments: {len(fine_segs)}")

    print("\n=== Step 2: Load existing sentence chunks ===")
    conn = psycopg2.connect(DB_URL)
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT id, chunk_index, start_time, end_time, metadata
                FROM dev.chunks
                WHERE file_uuid=%s AND chunk_type='sentence'
                ORDER BY chunk_index
            """, (UUID,))
            chunks = cur.fetchall()
            print(f"DB sentence chunks: {len(chunks)}")

            # For each chunk, find overlapping fine segments
            print("\n=== Step 3: Update speaker assignments ===")
            updated = 0
            for db_id, _idx, st, et, meta in chunks:
                new_meta = assign_speaker(meta, fine_segs, st, et)
                cur.execute("""
                    UPDATE dev.chunks SET metadata=%s::jsonb, updated_at=NOW()
                    WHERE id=%s
                """, (json.dumps(new_meta), db_id))
                updated += 1

            # Single commit: either every chunk is updated or none is.
            conn.commit()
            print(f"Updated {updated} chunks")

            # Verify distribution
            cur.execute("""
                SELECT metadata->>'fine_speaker_name', COUNT(*)
                FROM dev.chunks
                WHERE file_uuid=%s AND chunk_type='sentence'
                GROUP BY 1 ORDER BY 2 DESC
            """, (UUID,))
            print("\nNew speaker distribution:")
            for name, cnt in cur.fetchall():
                print(f" {name}: {cnt}")
    finally:
        # Closing without a commit discards any uncommitted updates, so a
        # failure part-way through leaves the table untouched.
        conn.close()

    print("\n=== Done ===")


if __name__ == "__main__":
    main()