momentry_core/scripts/generate_asr1.py

#!/opt/homebrew/bin/python3.11
"""
Generate {uuid}.asr-1.json by comparing asr.json (3417) with DB chunks (4188).
Identifies which ASR segments were split and records corrections.
"""
import json, os, subprocess, sys, time

# Directory containing the PostgreSQL client binaries; psql() runs "{PG_BIN}/psql".
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
# Role passed to psql via -U.
DB_USER = "accusys"
# Database passed to psql via -d.
DB_NAME = "momentry"
# Directory holding the input "{UUID}.asr.json" and receiving "{UUID}.asr-1.json".
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
# Identifier of the file being processed: used as the file_uuid filter in the
# DB query and as the prefix of the input/output file names.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"


def psql(sql):
    """Run *sql* through the psql command-line client and return its output.

    psql is invoked with ``-t -A -F <US>``: tuples only (no header/footer),
    unaligned output, and the ASCII unit separator (``chr(31)``) between
    fields, so each result row is one line of ``chr(31)``-separated values.

    Args:
        sql: The SQL statement passed to ``psql -c``.

    Returns:
        The captured standard output with leading/trailing line breaks
        removed.

    Raises:
        RuntimeError: If psql exits with a non-zero status.
        subprocess.TimeoutExpired: If psql runs for more than 30 seconds.
    """
    r = subprocess.run(
        [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME, "-t", "-A", "-F", chr(31), "-c", sql],
        capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        # Without this check a failed query is indistinguishable from an
        # empty result set, and the caller would happily write empty output.
        raise RuntimeError(
            f"psql exited with status {r.returncode}: {r.stderr.strip()}"
        )
    # Strip line breaks only. A bare strip() would also remove chr(31), which
    # Python treats as whitespace, and thereby drop trailing empty fields of
    # the last row.
    return r.stdout.strip("\r\n")


# Multiplier applied to ASR "start"/"end" values to obtain the frame numbers
# stored in the "original" entry of a correction.
# NOTE(review): presumably the video frame rate in frames per second — confirm.
_ASR_FRAME_RATE = 24

# Slack allowed on each side of an ASR segment when testing whether a DB chunk
# lies inside it (same unit as the ASR "start"/"end" values).
_CONTAINMENT_TOLERANCE = 0.1


def _load_db_chunks():
    """Fetch the sentence chunks of UUID from dev.chunks as a list of dicts."""
    raw = psql(
        f"SELECT chunk_index, start_frame, end_frame, start_time, end_time, chunk_id, text_content "
        f"FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='sentence' "
        f"ORDER BY chunk_index"
    )
    # NOTE(review): rows are split on "\n", so this assumes text_content never
    # contains a line break — confirm against the data.
    db_chunks = []
    for line in raw.split("\n"):
        if not line.strip():
            continue
        fields = line.split(chr(31))
        db_chunks.append({
            "chunk_index": int(fields[0]),
            "start_frame": int(fields[1]),
            "end_frame": int(fields[2]),
            "start_time": float(fields[3]),
            "end_time": float(fields[4]),
            "chunk_id": fields[5],
            # The text column may be missing when it is empty on the last row.
            "text_content": fields[6] if len(fields) > 6 else "",
        })
    return db_chunks


def _group_chunks_by_asr(db_chunks, asr_segs):
    """Map each DB chunk to the ASR segment containing it, grouped by ASR index.

    A chunk belongs to an ASR segment when its time range lies within the
    segment's range, widened by _CONTAINMENT_TOLERANCE on both sides. When
    several segments qualify, the one whose midpoint is closest to the chunk's
    midpoint wins (the earliest segment on ties). Chunks that fit no segment
    are left out.

    Returns:
        A dict mapping ASR segment index -> list of chunk dicts, each list in
        the order the chunks appear in *db_chunks*.
    """
    from collections import defaultdict

    tol = _CONTAINMENT_TOLERANCE
    # (start, end, midpoint) per ASR segment, computed once instead of per chunk.
    spans = [
        (seg["start"], seg["end"], (seg["start"] + seg["end"]) / 2)
        for seg in asr_segs
    ]

    chunks_by_asr = defaultdict(list)
    for chunk in db_chunks:
        c_start = chunk["start_time"]
        c_end = chunk["end_time"]
        c_mid = (c_start + c_end) / 2
        candidates = [
            idx
            for idx, (s_start, s_end, _) in enumerate(spans)
            if s_start - tol <= c_start and c_end <= s_end + tol
        ]
        if not candidates:
            continue
        # min() returns the first minimal element, i.e. the earliest segment
        # among equally close ones.
        best = min(candidates, key=lambda idx: abs(c_mid - spans[idx][2]))
        chunks_by_asr[best].append(chunk)
    return chunks_by_asr


def _build_kept_and_corrections(chunks_by_asr, asr_segs):
    """Split the grouped chunks into unchanged ("kept") and split ("corrections").

    An ASR segment with exactly one chunk is kept; a segment with two or more
    chunks becomes a correction whose children are ordered by start_frame and
    numbered "<asr_idx>-01", "<asr_idx>-02", ...

    Returns:
        A ``(kept, corrections)`` tuple of lists, both ordered by ASR index.
    """
    kept = []
    corrections = []
    for asr_idx in sorted(chunks_by_asr):
        members = chunks_by_asr[asr_idx]
        if len(members) < 2:
            only = members[0]
            kept.append({
                "chunk_index": asr_idx,
                "start_frame": only["start_frame"],
                "end_frame": only["end_frame"],
                "text_content": only["text_content"],
            })
            continue

        children = sorted(
            (
                {
                    "chunk_id": m["chunk_id"],
                    "start_frame": m["start_frame"],
                    "end_frame": m["end_frame"],
                    "text_content": m["text_content"],
                }
                for m in members
            ),
            key=lambda child: child["start_frame"],
        )
        # The first child of parent ASR idx N gets "N-01", the second "N-02", etc.
        for ordinal, child in enumerate(children, start=1):
            child["new_chunk_id"] = f"{asr_idx}-{ordinal:02d}"

        seg = asr_segs[asr_idx]
        corrections.append({
            "parent_chunk_index": asr_idx,
            "reason": "split",
            "original": {
                "start_frame": int(seg["start"] * _ASR_FRAME_RATE),
                "end_frame": int(seg["end"] * _ASR_FRAME_RATE),
                "text_content": seg["text"],
            },
            "corrected": children,
        })
    return kept, corrections


def main():
    """Compare ASR segments with DB sentence chunks and write {UUID}.asr-1.json.

    Steps:
      1. Load the "segments" list from OUTPUT_DIR/{UUID}.asr.json.
      2. Load the sentence chunks of UUID from the database.
      3. Assign every DB chunk to the ASR segment that contains it in time.
      4. Record ASR segments with one chunk as "kept" and segments with
         several chunks as "split" corrections.
      5. Write the result as JSON and print summary statistics.

    Chunks are carried through as dicts rather than being looked up again via
    ``db_chunks[chunk_index]``, so the result does not depend on chunk_index
    being a 0-based, gap-free sequence.
    """
    t0 = time.time()
    print(f"Loading ASR segments from {UUID}.asr.json...")
    asr_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr.json")
    # Explicit UTF-8: do not depend on the platform's default encoding.
    with open(asr_path, encoding="utf-8") as f:
        asr_data = json.load(f)
    asr_segs = asr_data["segments"]
    print(f"  {len(asr_segs)} ASR segments")

    print("Loading DB sentence chunks...")
    db_chunks = _load_db_chunks()
    print(f"  {len(db_chunks)} DB chunks")

    chunks_by_asr = _group_chunks_by_asr(db_chunks, asr_segs)
    mapped = sum(len(members) for members in chunks_by_asr.values())
    print(f"  Mapped: {mapped} / {len(db_chunks)} chunks to ASR segments")

    kept, corrections = _build_kept_and_corrections(chunks_by_asr, asr_segs)

    total_corrected = sum(len(c["corrected"]) for c in corrections)
    print(f"  Kept chunks: {len(kept)}")
    print(f"  Corrected chunks: {total_corrected}")
    print(f"  Total: {len(kept) + total_corrected} (should be {len(db_chunks)})\n")

    # Write output
    output = {
        "file_uuid": UUID,
        "asr_version": 1,
        "kept": kept,
        "corrections": corrections,
    }
    output_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr-1.json")
    # ensure_ascii=False emits non-ASCII text verbatim, so the file must be
    # opened as UTF-8 regardless of the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nSaved: {output_path} ({os.path.getsize(output_path) / 1024:.0f} KB)")

    # Stats: how many ASR segments were split into n children.
    split_sizes = {}
    for c in corrections:
        n = len(c["corrected"])
        split_sizes[n] = split_sizes.get(n, 0) + 1
    print("\nSplit distribution:")
    for n in sorted(split_sizes):
        print(f"  {n} children: {split_sizes[n]} ASR segments → {n * split_sizes[n]} chunks")

    elapsed = time.time() - t0
    print(f"\nElapsed: {elapsed:.1f}s")


# Run the generation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()