# momentry_core/scripts/migrate_asr_to_children.py

#!/opt/homebrew/bin/python3.11
"""
Migrate ASR Segments to Child Chunks
Write the fine-grained ASR speech segments into the child_chunks table and link them to parent_chunks.
"""

import json
import psycopg2

# Configuration
# All three values are hard-coded: this script migrates exactly one recording
# against one local database per run.

# Identifier of the recording to migrate; used to filter parent_chunks rows
# and stamped on every inserted child_chunks row.
UUID = "384b0ff44aaaa1f1"
# Path (relative to the current working directory) of the ASR output JSON;
# migrate() reads its top-level "segments" list.
ASR_PATH = f"output/{UUID}/{UUID}.asr.json"
# Connection string passed verbatim to psycopg2.connect().
DB_URL = "postgresql://accusys@localhost:5432/momentry"


def _find_parent_id(parents, start, end, margin=1.0):
    """Return the id of the first parent chunk whose time range covers a segment.

    Args:
        parents: Iterable of ``(id, start_time, end_time)`` rows.
        start: Segment start time.
        end: Segment end time.
        margin: Tolerance applied to both ends of the parent range, so a
            segment that slightly overhangs a parent boundary still matches.

    Returns:
        The matching parent id, or ``None`` when no parent covers the segment.
    """
    for pid, p_start, p_end in parents:
        if start >= p_start - margin and end <= p_end + margin:
            return pid
    return None


def migrate():
    """Copy the ASR segments of ``UUID`` into ``child_chunks``.

    Reads the ``segments`` list from the JSON file at ``ASR_PATH``, loads the
    ``parent_chunks`` rows for ``UUID``, and inserts one ``child_chunks`` row
    per segment with non-empty text. Each row is linked to the first parent
    chunk whose time range (with a 1 s tolerance) contains the segment, or to
    no parent (``parent_id`` NULL) when none matches.

    All inserts run in a single transaction: it is committed only if every
    insert succeeds and rolled back otherwise. The connection is always
    closed, even on failure.

    Raises:
        OSError: If the ASR file cannot be opened.
        json.JSONDecodeError: If the ASR file is not valid JSON.
        psycopg2.Error: If connecting or any query fails.
    """
    print(f"🚀 Starting migration for {UUID}...")

    # 1. Load Data
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, since the transcript may contain non-ASCII text.
    with open(ASR_PATH, "r", encoding="utf-8") as f:
        asr_data = json.load(f)
    segments = asr_data.get("segments", [])
    print(f"📂 Loaded {len(segments)} ASR segments.")

    conn = psycopg2.connect(DB_URL)
    try:
        # `with conn` commits on success and rolls back on exception (it does
        # NOT close the connection); `with conn.cursor()` closes the cursor.
        with conn, conn.cursor() as cur:
            # 2. Load Parent Chunks to map time ranges
            cur.execute(
                "SELECT id, start_time, end_time FROM parent_chunks WHERE uuid = %s",
                (UUID,),
            )
            parents = cur.fetchall()
            print(f"📂 Found {len(parents)} Parent Chunks.")

            # 3. Insert Child Chunks
            count = 0
            for seg in segments:
                # `or ""` also covers an explicit null "text" value.
                text = (seg.get("text") or "").strip()
                if not text:
                    continue

                start = seg.get("start", 0)
                end = seg.get("end", 0)
                parent_id = _find_parent_id(parents, start, end)

                # Only a missing/empty speaker id means "no speaker"; a plain
                # truthiness test would also discard a valid id of 0.
                speaker_id = seg.get("speaker_id")
                speaker_ids = [] if speaker_id in (None, "") else [speaker_id]

                # Note: raw_text_vector is null for now, we only do semantic
                # search on Parent
                cur.execute(
                    """
            INSERT INTO child_chunks (parent_id, uuid, start_time, end_time, raw_text, speaker_ids)
            VALUES (%s, %s, %s, %s, %s, %s)
        """,
                    (parent_id, UUID, start, end, text, speaker_ids),
                )
                count += 1

        print(f"✅ Successfully migrated {count} Child Chunks.")
    finally:
        conn.close()


# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    migrate()