#!/opt/homebrew/bin/python3.11
"""Migrate ASR segments to child chunks.

Write the fine-grained ASR speech segments into the ``child_chunks`` table
and associate each one with a row in ``parent_chunks``.
"""

import json

import psycopg2

# Configuration
UUID = "384b0ff44aaaa1f1"
ASR_PATH = f"output/{UUID}/{UUID}.asr.json"
DB_URL = "postgresql://accusys@localhost:5432/momentry"

# Slack (in seconds) allowed on each side of a parent's time range when
# deciding whether a segment belongs to that parent.
PARENT_MARGIN_SECONDS = 1.0


def _find_parent_id(parents, start, end, margin=PARENT_MARGIN_SECONDS):
    """Return the id of the first parent whose range contains [start, end].

    ``parents`` is an iterable of ``(id, start_time, end_time)`` rows. A
    segment matches when it lies inside the parent's range widened by
    ``margin`` on both sides. Returns ``None`` when nothing matches.
    """
    return next(
        (
            pid
            for pid, p_start, p_end in parents
            if start >= p_start - margin and end <= p_end + margin
        ),
        None,
    )


def _speaker_ids(seg):
    """Return the segment's speaker id wrapped in a list, or ``[]`` if absent."""
    speaker_id = seg.get("speaker_id")
    # Compare explicitly instead of relying on truthiness so that a speaker
    # id of 0 is still recorded; only a missing/empty id yields an empty list.
    if speaker_id is None or speaker_id == "":
        return []
    return [speaker_id]


def migrate():
    """Load the ASR JSON for ``UUID`` and insert its segments as child chunks.

    Steps:
      1. Read ``ASR_PATH`` and take its ``"segments"`` list.
      2. Fetch ``(id, start_time, end_time)`` of every parent chunk for
         ``UUID``.
      3. Insert one ``child_chunks`` row per segment with non-empty text,
         linked to the first parent whose time range (plus margin) contains
         the segment. Segments with no matching parent are inserted with a
         NULL ``parent_id``.

    All inserts run in a single transaction: it is committed when every row
    succeeded and rolled back if any statement raises. The connection is
    closed in either case.

    Raises:
        OSError: if the ASR file cannot be opened.
        json.JSONDecodeError: if the ASR file is not valid JSON.
        psycopg2.Error: on connection or query failure.
    """
    print(f"🚀 Starting migration for {UUID}...")

    # 1. Load Data
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(ASR_PATH, "r", encoding="utf-8") as f:
        asr_data = json.load(f)

    segments = asr_data.get("segments", [])
    print(f"📂 Loaded {len(segments)} ASR segments.")

    conn = psycopg2.connect(DB_URL)
    try:
        # `with conn` commits on normal exit and rolls back on exception;
        # it does NOT close the connection, hence the outer try/finally.
        with conn, conn.cursor() as cur:
            # 2. Load Parent Chunks to map time ranges
            cur.execute(
                "SELECT id, start_time, end_time FROM parent_chunks WHERE uuid = %s",
                (UUID,),
            )
            parents = cur.fetchall()
            print(f"📂 Found {len(parents)} Parent Chunks.")

            # 3. Insert Child Chunks
            count = 0
            for seg in segments:
                # `or ""` guards against an explicit null "text" value.
                text = (seg.get("text") or "").strip()
                if not text:
                    continue

                start = seg.get("start", 0)
                end = seg.get("end", 0)
                parent_id = _find_parent_id(parents, start, end)

                # Note: raw_text_vector is null for now, we only do semantic
                # search on Parent
                cur.execute(
                    """
                    INSERT INTO child_chunks (parent_id, uuid, start_time, end_time, raw_text, speaker_ids)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    """,
                    (
                        parent_id,
                        UUID,
                        start,
                        end,
                        text,
                        _speaker_ids(seg),
                    ),
                )
                count += 1
    finally:
        conn.close()

    print(f"✅ Successfully migrated {count} Child Chunks.")


if __name__ == "__main__":
    migrate()