Files
momentry_core/scripts/migrate_asr_to_children.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

79 lines
2.0 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Migrate ASR Segments to Child Chunks
Write the fine-grained ASR speech segments into the child_chunks table and link them to parent_chunks.
"""
import json
import psycopg2
# Configuration for this one-off migration run.
# Identifier of the recording whose ASR output is migrated.
UUID = "384b0ff44aaaa1f1"
# Relative path of the ASR JSON file: output/<UUID>/<UUID>.asr.json
ASR_PATH = "/".join(("output", UUID, UUID + ".asr.json"))
# PostgreSQL connection string passed to psycopg2.connect().
DB_URL = "postgresql://accusys@localhost:5432/momentry"
def _find_parent_id(start, end, windows):
    """Return the id of the first window that contains [start, end], else None.

    ``windows`` is an iterable of ``(parent_id, lower, upper)`` tuples whose
    bounds already include the tolerance margin.
    """
    for pid, lower, upper in windows:
        if start >= lower and end <= upper:
            return pid
    return None


def _speaker_ids(seg):
    """Return the segment's speaker id wrapped in a list, or [] when absent."""
    speaker_id = seg.get("speaker_id")
    # Compare against None / "" explicitly: a plain truthiness test would also
    # discard a legitimate speaker id of 0.
    if speaker_id is None or speaker_id == "":
        return []
    return [speaker_id]


def migrate():
    """Insert the ASR segments of ``UUID`` into the ``child_chunks`` table.

    Steps:
      1. Read the ``segments`` list from the JSON file at ``ASR_PATH``.
      2. Fetch ``(id, start_time, end_time)`` of every ``parent_chunks`` row
         with the same uuid.
      3. For each segment with non-empty text, insert one ``child_chunks``
         row. ``parent_id`` is the first parent whose time range, widened by
         one second on each side, contains the segment; it is NULL when no
         parent matches.

    All inserts are committed together at the end. If anything fails before
    the commit, the connection is closed without committing, so no partial
    migration is persisted. Exceptions from file reading, JSON parsing, or
    the database propagate to the caller.

    Returns:
        None. Progress is reported on stdout.
    """
    margin = 1.0  # seconds of slack allowed around each parent's time range

    print(f"🚀 Starting migration for {UUID}...")

    # 1. Load Data
    # Read explicitly as UTF-8 so transcripts with non-ASCII text do not
    # depend on the platform's default encoding.
    with open(ASR_PATH, "r", encoding="utf-8") as f:
        asr_data = json.load(f)
    # "or []" also covers an explicit `"segments": null` in the file.
    segments = asr_data.get("segments") or []
    print(f"📂 Loaded {len(segments)} ASR segments.")

    conn = psycopg2.connect(DB_URL)
    try:
        with conn.cursor() as cur:
            # 2. Load Parent Chunks to map time ranges
            cur.execute(
                "SELECT id, start_time, end_time FROM parent_chunks WHERE uuid = %s",
                (UUID,),
            )
            parents = cur.fetchall()
            print(f"📂 Found {len(parents)} Parent Chunks.")

            # Widen every parent range once instead of once per segment.
            # float() keeps the arithmetic valid if the driver returns Decimal.
            windows = [
                (pid, float(p_start) - margin, float(p_end) + margin)
                for pid, p_start, p_end in parents
            ]

            # 3. Insert Child Chunks
            count = 0
            for seg in segments:
                # "or ''" tolerates a segment whose text is null.
                text = (seg.get("text") or "").strip()
                if not text:
                    continue
                start = seg.get("start", 0)
                end = seg.get("end", 0)

                # Note: raw_text_vector is null for now, we only do semantic
                # search on Parent
                cur.execute(
                    """
                    INSERT INTO child_chunks (parent_id, uuid, start_time, end_time, raw_text, speaker_ids)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    """,
                    (
                        _find_parent_id(start, end, windows),
                        UUID,
                        start,
                        end,
                        text,
                        _speaker_ids(seg),
                    ),
                )
                count += 1
        conn.commit()
    finally:
        # Closing without a commit discards any pending inserts, so a failure
        # above leaves the table untouched and never leaks the connection.
        conn.close()

    print(f"✅ Successfully migrated {count} Child Chunks.")
# Script entry point: run the migration only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    migrate()