Files
momentry_core/scripts/update_fine_speakers.py
Accusys 39ba5ddf76 feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index
Correction: asr-1.json format, generate/apply scripts
API: 37/37 endpoints fixed and tested
Docs: HANDOVER_V2.0.md for M4
2026-05-11 07:03:22 +08:00

82 lines
2.5 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Update DB sentence chunks with fine-grained ASRX speaker assignments.
Each ASR segment gets the majority speaker_name from overlapping fine segments.
"""
import json, psycopg2
from collections import Counter
# Identifier of the media file whose sentence chunks are updated.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# Directory holding the "<UUID>.asrx_fine.json" file read by this script.
BASE = "/Users/accusys/momentry/output_dev"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"


def normalize_metadata(raw):
    """Return chunk metadata as a dict that can be updated in place.

    Args:
        raw: The ``metadata`` column value as delivered by the driver; may be
            a dict, a JSON string, or ``None``.

    Returns:
        ``raw`` itself when it is already a dict; the decoded object when
        ``raw`` is a JSON string encoding an object; otherwise an empty dict
        (``None``, malformed JSON, or JSON that is not an object).
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            # Best effort: unreadable metadata is replaced by a fresh dict.
            return {}
        if isinstance(parsed, dict):
            return parsed
    return {}


def assign_fine_speaker(meta, fine_segs, start, end):
    """Write the fine-grained speaker assignment for one chunk into ``meta``.

    A fine segment overlaps the chunk when it starts before ``end`` and ends
    after ``start``; segments that merely touch a boundary do not count.

    When at least one segment overlaps, the most frequent ``speaker_name``
    among the overlapping segments wins (ties go to the first one seen), and
    the id is the most frequent ``speaker_id`` among the segments carrying
    that winning name, so the stored name and id always describe the same
    speaker. ``fine_details`` records the per-name segment counts.

    When nothing overlaps, the existing ``speaker_name`` / ``speaker_id``
    values (or ``"Unknown"``) are copied into the ``fine_*`` keys and the
    coarse values are left untouched.

    Args:
        meta: Metadata dict of the chunk; modified in place.
        fine_segs: Iterable of dicts with ``start_time``, ``end_time``,
            ``speaker_name`` and ``speaker_id`` keys.
        start: Chunk start time.
        end: Chunk end time.

    Returns:
        The same ``meta`` dict, for convenience.
    """
    overlapping = [
        seg for seg in fine_segs
        if seg["start_time"] < end and seg["end_time"] > start
    ]
    if not overlapping:
        meta["fine_speaker_name"] = meta.get("speaker_name", "Unknown")
        meta["fine_speaker_id"] = meta.get("speaker_id", "Unknown")
        return meta

    # Majority vote on the name, counted per overlapping segment.
    names = Counter(seg["speaker_name"] for seg in overlapping)
    best_name = names.most_common(1)[0][0]
    # Vote on the id only among segments of the winning name; voting over all
    # segments could pair the name with another speaker's id.
    ids = Counter(
        seg["speaker_id"] for seg in overlapping
        if seg["speaker_name"] == best_name
    )
    best_id = ids.most_common(1)[0][0]

    meta["speaker_name"] = best_name
    meta["speaker_id"] = best_id
    meta["fine_speaker_name"] = best_name
    meta["fine_speaker_id"] = best_id
    meta["fine_details"] = dict(names)
    return meta


def main():
    """Load the fine ASRX file, update every sentence chunk, print a summary.

    All updates are committed in a single transaction; the connection is
    closed even when a query fails, in which case nothing is committed.
    """
    print("=== Step 1: Load fine ASRX ===")
    with open(f"{BASE}/{UUID}.asrx_fine.json", encoding="utf-8") as fh:
        fine = json.load(fh)
    fine_segs = fine["segments"]
    print(f"Fine segments: {len(fine_segs)}")

    print("\n=== Step 2: Load existing sentence chunks ===")
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        # NOTE(review): the commit notes shipped with this script mention a
        # rename dev.chunks -> dev.chunk and the removal of chunk_index;
        # confirm these queries still match the live schema.
        cur.execute("""
            SELECT id, chunk_index, start_time, end_time, metadata
            FROM dev.chunks
            WHERE file_uuid=%s AND chunk_type='sentence'
            ORDER BY chunk_index
        """, (UUID,))
        chunks = cur.fetchall()
        print(f"DB sentence chunks: {len(chunks)}")

        # For each chunk, find overlapping fine segments
        print("\n=== Step 3: Update speaker assignments ===")
        updated = 0
        for db_id, _chunk_index, st, et, raw_meta in chunks:
            meta = assign_fine_speaker(
                normalize_metadata(raw_meta), fine_segs, st, et
            )
            cur.execute("""
                UPDATE dev.chunks SET metadata=%s::jsonb, updated_at=NOW()
                WHERE id=%s
            """, (json.dumps(meta), db_id))
            updated += 1
        conn.commit()
        print(f"Updated {updated} chunks")

        # Verify distribution
        cur.execute("""
            SELECT metadata->>'fine_speaker_name', COUNT(*)
            FROM dev.chunks
            WHERE file_uuid=%s AND chunk_type='sentence'
            GROUP BY 1 ORDER BY 2 DESC
        """, (UUID,))
        print("\nNew speaker distribution:")
        for name, cnt in cur.fetchall():
            print(f" {name}: {cnt}")
    finally:
        conn.close()
    print("\n=== Done ===")


if __name__ == "__main__":
    main()