#!/opt/homebrew/bin/python3.11
"""
Generate {uuid}.asr-1.json by comparing asr.json (3417) with DB chunks (4188).
Identifies which ASR segments were split and records corrections.
"""
import json
import os
import subprocess
import sys
import time
from collections import Counter, defaultdict

PG_BIN = "/Users/accusys/pgsql/18.3/bin"
DB_USER = "accusys"
DB_NAME = "momentry"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
UUID = "aeed71342a899fe4b4c57b7d41bcb692"

# Multiplier used to convert ASR timestamps (seconds) into frame numbers.
# NOTE(review): taken from the previously hard-coded 24 -- confirm it matches
# the frame rate of the source video.
FPS = 24
# Slack (seconds) allowed on each side when testing whether a DB chunk lies
# inside an ASR segment.
TIME_TOLERANCE = 0.1
# Separators for psql's unaligned output.  Control characters are used so that
# ordinary text (including newlines inside text_content) cannot collide.
FIELD_SEP = chr(31)
RECORD_SEP = chr(30)


def psql(sql, record_sep=None):
    """Run *sql* through the psql CLI and return its unaligned, tuples-only output.

    Fields are separated by ``chr(31)``.

    Args:
        sql: SQL text passed to ``psql -c``.
        record_sep: Optional record separator passed as ``-R``.  When omitted,
            psql's default (newline) is used and the output is stripped of
            surrounding whitespace, exactly as before.

    Returns:
        The captured stdout as a string.

    Raises:
        RuntimeError: if psql exits with a non-zero status.
        subprocess.TimeoutExpired: if psql runs longer than 30 seconds.
    """
    cmd = [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME,
           "-t", "-A", "-F", FIELD_SEP]
    if record_sep is not None:
        cmd += ["-R", record_sep]
    cmd += ["-c", sql]
    r = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        # Without this check a failed query looks like "zero rows" and the
        # script would happily write an empty result file.
        raise RuntimeError(
            f"psql failed (exit {r.returncode}): {r.stderr.strip()}"
        )
    if record_sep is None:
        return r.stdout.strip()
    # Do not .strip() here: str.strip() treats chr(28)..chr(31) as whitespace
    # and would eat a trailing field separator.  Only drop the final newline
    # that terminates the output.
    return r.stdout.removesuffix("\n")


def parse_chunk_rows(raw, record_sep="\n", field_sep=FIELD_SEP):
    """Parse psql output for the chunk query into a list of chunk dicts.

    Each record must hold, in order: chunk_index, start_frame, end_frame,
    start_time, end_time, chunk_id and (optionally) text_content.

    Args:
        raw: Text returned by :func:`psql`.
        record_sep: Separator between records.
        field_sep: Separator between fields of one record.

    Returns:
        A list of dicts with keys ``chunk_index``, ``start_frame``,
        ``end_frame``, ``start_time``, ``end_time``, ``chunk_id`` and
        ``text_content`` (empty string when missing).

    Raises:
        ValueError: if a non-blank record has fewer than six fields or a
            numeric field cannot be converted.
    """
    chunks = []
    for record in raw.split(record_sep):
        if not record.strip():
            continue
        # maxsplit keeps any stray separator inside text_content intact.
        parts = record.split(field_sep, 6)
        if len(parts) < 6:
            raise ValueError(f"malformed chunk row: {record!r}")
        chunks.append({
            "chunk_index": int(parts[0]),
            "start_frame": int(parts[1]),
            "end_frame": int(parts[2]),
            "start_time": float(parts[3]),
            "end_time": float(parts[4]),
            "chunk_id": parts[5],
            "text_content": parts[6] if len(parts) > 6 and parts[6] else "",
        })
    return chunks


def map_chunks_to_asr(db_chunks, asr_segs, tolerance=TIME_TOLERANCE):
    """Map every DB chunk to the ASR segment that contains it.

    A chunk belongs to ASR segment *i* when its time range lies within the
    segment's ``[start - tolerance, end + tolerance]`` range.  When several
    segments qualify, the one whose midpoint is closest to the chunk's
    midpoint wins; on an exact tie the earliest segment is kept.

    Args:
        db_chunks: Chunk dicts as produced by :func:`parse_chunk_rows`.
        asr_segs: ASR segment dicts with ``start`` and ``end`` in seconds.
        tolerance: Allowed slack in seconds on each side.

    Returns:
        Dict mapping ``chunk_index`` -> index into *asr_segs*.  Chunks that
        fit no segment are absent from the result.
    """
    asr_of_chunk = {}
    for chunk in db_chunks:
        c_start = chunk["start_time"]
        c_end = chunk["end_time"]
        c_mid = (c_start + c_end) / 2
        best_idx = None
        best_dist = None
        for idx, seg in enumerate(asr_segs):
            if seg["start"] - tolerance <= c_start and c_end <= seg["end"] + tolerance:
                dist = abs(c_mid - (seg["start"] + seg["end"]) / 2)
                # Strictly-closer only, so the first candidate wins ties.
                if best_dist is None or dist < best_dist:
                    best_idx = idx
                    best_dist = dist
        if best_idx is not None:
            asr_of_chunk[chunk["chunk_index"]] = best_idx
    return asr_of_chunk


def build_corrections(db_chunks, asr_segs, asr_of_chunk, fps=FPS):
    """Split the mapped chunks into untouched ("kept") and split ASR segments.

    Args:
        db_chunks: Chunk dicts as produced by :func:`parse_chunk_rows`.
        asr_segs: ASR segment dicts with ``start``, ``end`` and ``text``.
        asr_of_chunk: Mapping from :func:`map_chunks_to_asr`.
        fps: Multiplier converting ASR seconds to frame numbers.

    Returns:
        ``(kept, corrections)``.  ``kept`` has one entry per ASR segment that
        maps to exactly one chunk (its ``chunk_index`` is the ASR index).
        ``corrections`` has one entry per ASR segment that maps to two or more
        chunks, listing the children ordered by ``start_frame`` with
        ``new_chunk_id`` values ``"<asr_idx>-01"``, ``"<asr_idx>-02"``, ...
    """
    # Look chunks up by their chunk_index value, not by list position:
    # chunk_index is not guaranteed to be 0-based or contiguous.
    chunk_by_index = {c["chunk_index"]: c for c in db_chunks}

    chunks_by_asr = defaultdict(list)
    for chunk_index, asr_idx in asr_of_chunk.items():
        chunks_by_asr[asr_idx].append(chunk_index)

    kept = []
    corrections = []
    for asr_idx, child_indices in sorted(chunks_by_asr.items()):
        if len(child_indices) < 2:
            only = chunk_by_index[child_indices[0]]
            kept.append({
                "chunk_index": asr_idx,
                "start_frame": only["start_frame"],
                "end_frame": only["end_frame"],
                "text_content": only["text_content"],
            })
            continue

        children = [
            {
                "chunk_id": chunk_by_index[ci]["chunk_id"],
                "start_frame": chunk_by_index[ci]["start_frame"],
                "end_frame": chunk_by_index[ci]["end_frame"],
                "text_content": chunk_by_index[ci]["text_content"],
            }
            for ci in child_indices
        ]
        children.sort(key=lambda c: c["start_frame"])
        # The first child of parent ASR idx N gets "N-01", second "N-02", etc.
        for pos, child in enumerate(children, start=1):
            child["new_chunk_id"] = f"{asr_idx}-{pos:02d}"

        seg = asr_segs[asr_idx]
        corrections.append({
            "parent_chunk_index": asr_idx,
            "reason": "split",
            "original": {
                "start_frame": int(seg["start"] * fps),
                "end_frame": int(seg["end"] * fps),
                "text_content": seg["text"],
            },
            "corrected": children,
        })
    return kept, corrections


def main():
    """Load ASR segments and DB chunks, compute splits, write ``{UUID}.asr-1.json``."""
    t0 = time.time()

    print(f"Loading ASR segments from {UUID}.asr.json...")
    asr_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr.json")
    with open(asr_path, encoding="utf-8") as f:
        asr_data = json.load(f)
    asr_segs = asr_data["segments"]
    print(f" {len(asr_segs)} ASR segments")

    print("Loading DB sentence chunks...")
    # UUID is a module constant, not external input, so plain interpolation
    # is acceptable here.
    raw = psql(
        f"SELECT chunk_index, start_frame, end_frame, start_time, end_time, chunk_id, text_content "
        f"FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='sentence' "
        f"ORDER BY chunk_index",
        record_sep=RECORD_SEP,
    )
    db_chunks = parse_chunk_rows(raw, record_sep=RECORD_SEP)
    print(f" {len(db_chunks)} DB chunks")

    asr_of_chunk = map_chunks_to_asr(db_chunks, asr_segs)
    print(f" Mapped: {len(asr_of_chunk)} / {len(db_chunks)} chunks to ASR segments")

    unmapped = [c["chunk_index"] for c in db_chunks
                if c["chunk_index"] not in asr_of_chunk]
    if unmapped:
        # These chunks appear in neither "kept" nor "corrections".
        print(
            f" WARNING: {len(unmapped)} chunks matched no ASR segment and are "
            f"omitted from the output (first: {unmapped[:10]})",
            file=sys.stderr,
        )

    kept, corrections = build_corrections(db_chunks, asr_segs, asr_of_chunk)

    total_corrected = sum(len(c["corrected"]) for c in corrections)
    print(f" Kept chunks: {len(kept)}")
    print(f" Corrected chunks: {total_corrected}")
    print(f" Total: {len(kept) + total_corrected} (should be {len(db_chunks)})\n")

    # Write output
    output = {
        "file_uuid": UUID,
        "asr_version": 1,
        "kept": kept,
        "corrections": corrections,
    }
    output_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr-1.json")
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nSaved: {output_path} ({os.path.getsize(output_path) / 1024:.0f} KB)")

    # Stats
    split_sizes = Counter(len(c["corrected"]) for c in corrections)
    print("\nSplit distribution:")
    for n in sorted(split_sizes):
        print(f" {n} children: {split_sizes[n]} ASR segments → {n * split_sizes[n]} chunks")

    elapsed = time.time() - t0
    print(f"\nElapsed: {elapsed:.1f}s")


if __name__ == "__main__":
    main()