feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)

Phase 2.6.1: co_occurrence_edges migration
- build_co_occurrence_edges_from_qdrant()
- Qdrant embeddings → frame grouping → YOLO objects
- Result: 6679 edges (vs 6701 PostgreSQL)

Phase 2.6.2: face_face_edges migration
- build_face_face_edges_from_qdrant()
- Qdrant embeddings → frame grouping → face pairs
- mutual_gaze detection preserved
- Result: 6 edges (exact match)

Phase 2.6.3: speaker_face_edges migration
- build_speaker_face_edges_from_qdrant()
- Qdrant embeddings → trace_id frame ranges
- SPEAKS_AS edge creation

Architecture:
- All edges use Qdrant payload (no face_detections queries)
- PostgreSQL fallback for empty Qdrant
- Estimated 3.6x performance improvement

Testing:
- Playground (3003): ✓ All Phase 2.6 logs verified
- Edge counts: ✓ Close match with PostgreSQL
- Fallback: ✓ Working

Docs:
- docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md
- docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
2026-06-21 04:47:49 +08:00
parent 0afc70fc5b
commit 2cfcfdd1af
2926 changed files with 8311058 additions and 1394 deletions
--- a/v1.1/scripts/generate_asr1_v1.11.py
+++ b/v1.1/scripts/generate_asr1_v1.11.py
@@ -0,0 +1,155 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Generate {uuid}.asr-1.json by comparing asr.json (3417) with DB chunks (4188).
+Identifies which ASR segments were split and records corrections.
+"""
+import json, os, subprocess, sys, time
+
+# Directory holding the PostgreSQL client binaries; psql() runs f"{PG_BIN}/psql".
+PG_BIN = "/Users/accusys/pgsql/18.3/bin"
+# Role passed to psql via -U.
+DB_USER = "accusys"
+# Database passed to psql via -d.
+DB_NAME = "momentry"
+# Directory the {UUID}.asr.json input is read from and {UUID}.asr-1.json is written to.
+OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
+# file_uuid used both to filter dev.chunks and to build the input/output file names.
+UUID = "aeed71342a899fe4b4c57b7d41bcb692"
+
+
+def psql(sql):
+    """Run one SQL statement through the psql CLI and return its raw output.
+
+    psql is invoked with ``-t -A`` (tuples only, unaligned) and ``chr(31)``
+    as the field separator, so callers split each output line on
+    ``chr(31)`` to recover the columns.
+
+    Args:
+        sql: SQL text handed verbatim to ``psql -c``.
+
+    Returns:
+        psql's stdout with surrounding whitespace stripped. Note that
+        ``str.strip()`` treats ``chr(31)`` as whitespace, so trailing empty
+        fields of the last row are removed as well.
+
+    Raises:
+        RuntimeError: psql exited with a non-zero status (bad SQL,
+            connection failure, ...). Previously such failures were
+            indistinguishable from an empty result set.
+        subprocess.TimeoutExpired: the command ran longer than 30 seconds.
+    """
+    cmd = [
+        f"{PG_BIN}/psql",
+        "-U", DB_USER,
+        "-d", DB_NAME,
+        "-t", "-A",
+        "-F", chr(31),
+        "-c", sql,
+    ]
+    r = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+    if r.returncode != 0:
+        # Surface psql's own diagnostics instead of silently returning "".
+        raise RuntimeError(f"psql failed (exit {r.returncode}): {r.stderr.strip()}")
+    return r.stdout.strip()
+
+
+def main():
+    """Compare ASR segments with DB sentence chunks and write ``{UUID}.asr-1.json``.
+
+    Steps:
+      1. Load ``segments`` from ``{OUTPUT_DIR}/{UUID}.asr.json``.
+      2. Load the ``sentence`` chunks of ``UUID`` from ``dev.chunks`` via psql().
+      3. Map every DB chunk to the ASR segment whose time range contains it
+         (with a 0.1 tolerance on both ends); when several segments contain
+         the chunk, the one with the nearest midpoint wins, the earliest
+         segment winning ties.
+      4. ASR segments with exactly one mapped chunk go to ``kept``; segments
+         with two or more go to ``corrections`` with reason ``"split"`` and
+         children labelled ``"<asr_idx>-NN"`` in start_frame order.
+      5. Dump the result as JSON and print summary statistics.
+
+    DB chunks that fit no ASR segment are left out of the output; the
+    printed "Total ... (should be ...)" line exposes any such gap.
+    """
+    from collections import Counter, defaultdict
+
+    t0 = time.time()
+    print(f"Loading ASR segments from {UUID}.asr.json...")
+    asr_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr.json")
+    # Explicit UTF-8: the result is written with ensure_ascii=False, so
+    # neither file may depend on the locale's default encoding.
+    with open(asr_path, encoding="utf-8") as f:
+        asr_data = json.load(f)
+    asr_segs = asr_data["segments"]
+    print(f"  {len(asr_segs)} ASR segments")
+
+    print("Loading DB sentence chunks...")
+    raw = psql(
+        f"SELECT chunk_index, start_frame, end_frame, start_time, end_time, chunk_id, text_content "
+        f"FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='sentence' "
+        f"ORDER BY chunk_index"
+    )
+    # NOTE(review): rows are split on "\n", so a text_content value that
+    # itself contains a newline would break this parsing - confirm the
+    # column never does.
+    db_chunks = []
+    for line in raw.split("\n"):
+        if not line.strip():
+            continue
+        r = line.split(chr(31))
+        db_chunks.append({
+            "chunk_index": int(r[0]),
+            "start_frame": int(r[1]),
+            "end_frame": int(r[2]),
+            "start_time": float(r[3]),
+            "end_time": float(r[4]),
+            "chunk_id": r[5],
+            # The 7th field can be missing: psql() strips its output and
+            # chr(31) counts as whitespace, so a trailing empty field is lost.
+            "text_content": r[6] if len(r) > 6 and r[6] else "",
+        })
+    print(f"  {len(db_chunks)} DB chunks")
+
+    # Look chunks up by their chunk_index value rather than by list position:
+    # the two only coincide when the DB indices are 0-based and gap-free.
+    chunk_by_index = {dc["chunk_index"]: dc for dc in db_chunks}
+
+    # For each DB chunk, find the best-matching ASR segment.
+    # A DB chunk belongs to ASR segment i if the chunk's time range
+    # falls WITHIN ASR segment i's time range.
+    tolerance = 0.1  # slack on both segment ends (presumably seconds - confirm)
+    asr_of_chunk = {}  # chunk_index -> asr_idx
+    for dc in db_chunks:
+        ct_mid = (dc["start_time"] + dc["end_time"]) / 2
+        best_asr = None
+        best_dist = None
+        for ai, a in enumerate(asr_segs):
+            if a["start"] - tolerance <= dc["start_time"] and dc["end_time"] <= a["end"] + tolerance:
+                dist = abs(ct_mid - (a["start"] + a["end"]) / 2)
+                # Strict "<" keeps the earliest segment when distances tie.
+                if best_asr is None or dist < best_dist:
+                    best_asr = ai
+                    best_dist = dist
+        if best_asr is not None:
+            asr_of_chunk[dc["chunk_index"]] = best_asr
+
+    print(f"  Mapped: {len(asr_of_chunk)} / {len(db_chunks)} chunks to ASR segments")
+
+    # Group DB chunks by ASR index
+    chunks_by_asr = defaultdict(list)
+    for ci, ai in asr_of_chunk.items():
+        chunks_by_asr[ai].append(ci)
+
+    # NOTE(review): ASR times are converted to frames at a hard-coded
+    # 24 fps - confirm this matches the source video.
+    asr_fps = 24
+
+    # Build kept + corrections
+    corrections = []
+    kept = []
+    for ai, child_indices in sorted(chunks_by_asr.items()):
+        if len(child_indices) < 2:
+            # Exactly one DB chunk for this ASR segment: it was not split.
+            dc = chunk_by_index[child_indices[0]]
+            kept.append({
+                "chunk_index": ai,
+                "start_frame": dc["start_frame"],
+                "end_frame": dc["end_frame"],
+                "text_content": dc["text_content"],
+            })
+            continue
+        a = asr_segs[ai]
+        children = []
+        for ci in child_indices:
+            dc = chunk_by_index[ci]
+            children.append({
+                "chunk_id": dc["chunk_id"],
+                "start_frame": dc["start_frame"],
+                "end_frame": dc["end_frame"],
+                "text_content": dc["text_content"],
+            })
+        children_sorted = sorted(children, key=lambda x: x["start_frame"])
+
+        # Assign new chunk_id format based on chunk_index
+        # The first child of parent ASR idx N gets "N-01", second "N-02", etc.
+        for si, child in enumerate(children_sorted):
+            child["new_chunk_id"] = f"{ai}-{si+1:02d}"
+
+        corrections.append({
+            "parent_chunk_index": ai,
+            "reason": "split",
+            "original": {
+                "start_frame": int(a["start"] * asr_fps),
+                "end_frame": int(a["end"] * asr_fps),
+                "text_content": a["text"],
+            },
+            "corrected": children_sorted
+        })
+
+    total_corrected = sum(len(c["corrected"]) for c in corrections)
+    print(f"  Kept chunks: {len(kept)}")
+    print(f"  Corrected chunks: {total_corrected}")
+    print(f"  Total: {len(kept) + total_corrected} (should be {len(db_chunks)})\n")
+
+    # Write output
+    output = {
+        "file_uuid": UUID,
+        "asr_version": 1,
+        "kept": kept,
+        "corrections": corrections
+    }
+    output_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr-1.json")
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(output, f, indent=2, ensure_ascii=False)
+    print(f"\nSaved: {output_path} ({os.path.getsize(output_path) / 1024:.0f} KB)")
+
+    # Stats: how many ASR segments were split into n children, per n.
+    split_sizes = Counter(len(c["corrected"]) for c in corrections)
+    print(f"\nSplit distribution:")
+    for n in sorted(split_sizes):
+        print(f"  {n} children: {split_sizes[n]} ASR segments → {n * split_sizes[n]} chunks")
+
+    elapsed = time.time() - t0
+    print(f"\nElapsed: {elapsed:.1f}s")
+
+
+# Run the generator only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()