Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index Correction: asr-1.json format, generate/apply scripts API: 37/37 endpoints fixed and tested Docs: HANDOVER_V2.0.md for M4
82 lines
2.5 KiB
Python
82 lines
2.5 KiB
Python
#!/opt/homebrew/bin/python3.11
"""
Update DB sentence chunks with fine-grained ASRX speaker assignments.

Each ASR segment gets the majority speaker_name from overlapping fine segments.
"""
import json, psycopg2
|
|
from collections import Counter
|
|
|
|
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"

# NOTE(review): the change note accompanying this file mentions a
# dev.chunks -> dev.chunk rename and the removal of chunk_index. The queries
# below still use dev.chunks / chunk_index -- confirm against the live schema.
SELECT_CHUNKS_SQL = """
    SELECT id, chunk_index, start_time, end_time, metadata
    FROM dev.chunks
    WHERE file_uuid=%s AND chunk_type='sentence'
    ORDER BY chunk_index
"""

UPDATE_CHUNK_SQL = """
    UPDATE dev.chunks SET metadata=%s::jsonb, updated_at=NOW()
    WHERE id=%s
"""

DISTRIBUTION_SQL = """
    SELECT metadata->>'fine_speaker_name', COUNT(*)
    FROM dev.chunks
    WHERE file_uuid=%s AND chunk_type='sentence'
    GROUP BY 1 ORDER BY 2 DESC
"""


def parse_metadata(raw: object) -> dict:
    """Return a chunk's metadata as a mutable dict.

    The value may arrive as a dict, a JSON string or None (presumably
    depending on the column type and driver adaptation -- verify).

    Args:
        raw: The ``metadata`` column value as returned by the driver.

    Returns:
        ``raw`` itself when it is already a dict; the decoded object when it
        is a JSON string; an empty dict for None, for a string that is not
        valid JSON, and for the JSON literal ``null``.

    Raises:
        TypeError: If the metadata is neither None nor a JSON object, since
            speaker fields cannot be written into it.
    """
    if raw is None:
        return {}
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:
            # Best effort, as before: unreadable metadata is replaced by a
            # fresh dict. json.JSONDecodeError is a ValueError subclass.
            return {}
        if raw is None:
            return {}
    if not isinstance(raw, dict):
        raise TypeError(
            f"chunk metadata must be a JSON object, got {type(raw).__name__}"
        )
    return raw


def find_overlapping(segments: list, start: float, end: float) -> list:
    """Return the segments whose time span intersects ``(start, end)``.

    Both comparisons are strict, so a segment that merely touches the chunk
    boundary is not counted as overlapping.
    """
    return [
        seg
        for seg in segments
        if seg["start_time"] < end and seg["end_time"] > start
    ]


def pick_speaker(overlapping: list) -> tuple:
    """Majority-vote the speaker over a non-empty list of fine segments.

    The name is chosen first (one vote per segment). The id is then voted
    only among segments carrying that name, so the returned id always belongs
    to the returned name. Voting on names and ids independently could pair
    the winning name with another speaker's id when one name maps to several
    ids.

    Returns:
        ``(best_name, best_id, name_counts)`` where ``name_counts`` maps each
        speaker name to its number of overlapping segments.
    """
    name_counts = Counter(seg["speaker_name"] for seg in overlapping)
    best_name = name_counts.most_common(1)[0][0]
    id_counts = Counter(
        seg["speaker_id"]
        for seg in overlapping
        if seg["speaker_name"] == best_name
    )
    best_id = id_counts.most_common(1)[0][0]
    return best_name, best_id, dict(name_counts)


def apply_fine_speaker(meta: dict, segments: list, start: float, end: float) -> dict:
    """Write the fine-grained speaker fields into ``meta`` and return it.

    With at least one overlapping fine segment, ``speaker_name`` /
    ``speaker_id`` are overwritten with the majority speaker and mirrored
    into ``fine_speaker_name`` / ``fine_speaker_id``; ``fine_details`` records
    the per-name vote counts. With no overlap, the existing speaker fields
    (or "Unknown") are copied into the ``fine_*`` fields and nothing else is
    touched.

    ``meta`` is modified in place.
    """
    overlapping = find_overlapping(segments, start, end)
    if overlapping:
        best_name, best_id, details = pick_speaker(overlapping)
        meta["speaker_name"] = best_name
        meta["speaker_id"] = best_id
        meta["fine_speaker_name"] = best_name
        meta["fine_speaker_id"] = best_id
        meta["fine_details"] = details
    else:
        meta["fine_speaker_name"] = meta.get("speaker_name", "Unknown")
        meta["fine_speaker_id"] = meta.get("speaker_id", "Unknown")
    return meta


def main() -> None:
    """Load the fine ASRX file, update every sentence chunk, print a summary.

    All updates run in one transaction that is committed only after every
    chunk has been processed; if anything raises first, the connection is
    closed without a commit.
    """
    print("=== Step 1: Load fine ASRX ===")
    with open(f"{BASE}/{UUID}.asrx_fine.json", encoding="utf-8") as fh:
        fine_segs = json.load(fh)["segments"]
    print(f"Fine segments: {len(fine_segs)}")

    print("\n=== Step 2: Load existing sentence chunks ===")
    conn = psycopg2.connect(DB_URL)
    try:
        with conn.cursor() as cur:
            cur.execute(SELECT_CHUNKS_SQL, (UUID,))
            chunks = cur.fetchall()
            print(f"DB sentence chunks: {len(chunks)}")

            # For each chunk, find overlapping fine segments
            print("\n=== Step 3: Update speaker assignments ===")
            updated = 0
            for db_id, _chunk_index, start, end, raw_meta in chunks:
                meta = apply_fine_speaker(
                    parse_metadata(raw_meta), fine_segs, start, end
                )
                cur.execute(UPDATE_CHUNK_SQL, (json.dumps(meta), db_id))
                updated += 1

            conn.commit()
            print(f"Updated {updated} chunks")

            # Verify distribution
            cur.execute(DISTRIBUTION_SQL, (UUID,))
            print("\nNew speaker distribution:")
            for name, cnt in cur.fetchall():
                print(f" {name}: {cnt}")
    finally:
        # Always release the connection, including on failure.
        conn.close()

    print("\n=== Done ===")


if __name__ == "__main__":
    main()