feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)
Phase 2.6.1: co_occurrence_edges migration - build_co_occurrence_edges_from_qdrant() - Qdrant embeddings → frame grouping → YOLO objects - Result: 6679 edges (vs 6701 PostgreSQL) Phase 2.6.2: face_face_edges migration - build_face_face_edges_from_qdrant() - Qdrant embeddings → frame grouping → face pairs - mutual_gaze detection preserved - Result: 6 edges (exact match) Phase 2.6.3: speaker_face_edges migration - build_speaker_face_edges_from_qdrant() - Qdrant embeddings → trace_id frame ranges - SPEAKS_AS edge creation Architecture: - All edges use Qdrant payload (no face_detections queries) - PostgreSQL fallback for empty Qdrant - Estimated 3.6x performance improvement Testing: - Playground (3003): ✓ All Phase 2.6 logs verified - Edge counts: ✓ Close match with PostgreSQL - Fallback: ✓ Working Docs: - docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md - docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
155
v1.1/scripts/generate_asr1_v1.11.py
Normal file
155
v1.1/scripts/generate_asr1_v1.11.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Generate {uuid}.asr-1.json by comparing the segments in asr.json (3417 segments) with the DB sentence chunks (4188 chunks).
|
||||
Identifies which ASR segments were split and records corrections.
|
||||
"""
|
||||
import json, os, subprocess, sys, time
|
||||
|
||||
# Directory that contains the `psql` executable invoked by psql().
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
# Database role passed to psql via -U.
DB_USER = "accusys"
# Database name passed to psql via -d.
DB_NAME = "momentry"
# Directory the {UUID}.asr.json input is read from and the
# {UUID}.asr-1.json output is written to.
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
# file_uuid of the media file being processed; used both in the file names
# above and in the dev.chunks query.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
|
||||
|
||||
def psql(sql):
    """Run one SQL statement through the ``psql`` CLI and return its output.

    The statement is executed with ``-t -A`` (tuples only, unaligned) and
    ``chr(31)`` (ASCII unit separator) as the field separator, so each
    result row is one line whose columns are separated by ``chr(31)``.

    Args:
        sql: SQL text passed verbatim to ``psql -c``.

    Returns:
        psql's standard output with leading/trailing whitespace stripped.
        Note that ``str.strip()`` treats ``chr(31)`` as whitespace, so
        trailing empty columns of the last row lose their separators.

    Raises:
        RuntimeError: psql exited with a non-zero status (bad SQL,
            connection failure, ...). The message carries psql's stderr.
        subprocess.TimeoutExpired: psql did not finish within 30 seconds.
        FileNotFoundError: the psql binary is not present under PG_BIN.
    """
    cmd = [
        f"{PG_BIN}/psql",
        "-U", DB_USER,
        "-d", DB_NAME,
        "-t",
        "-A",
        "-F", chr(31),
        "-c", sql,
    ]
    r = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    # Without this check a failed query is indistinguishable from an empty
    # result set, and the caller would silently continue with zero rows.
    if r.returncode != 0:
        raise RuntimeError(
            f"psql exited with status {r.returncode}: {r.stderr.strip()}"
        )
    return r.stdout.strip()
|
||||
|
||||
|
||||
# Slack, in the same unit as the ASR "start"/"end" values, allowed when
# testing whether a DB chunk lies inside an ASR segment.
_TIME_TOLERANCE = 0.1

# Multiplier used to turn ASR "start"/"end" times into frame numbers.
# NOTE(review): presumably the video frame rate (24 fps) -- confirm.
_FRAMES_PER_SECOND = 24


def _parse_chunk_rows(raw):
    """Turn psql output (one row per line, chr(31)-separated) into dicts."""
    chunks = []
    # split("\n") rather than splitlines(): keep the original row splitting.
    for line in raw.split("\n"):
        if not line.strip():
            continue
        r = line.split(chr(31))
        chunks.append({
            "chunk_index": int(r[0]),
            "start_frame": int(r[1]),
            "end_frame": int(r[2]),
            "start_time": float(r[3]),
            "end_time": float(r[4]),
            "chunk_id": r[5],
            # The text column may be missing when it is empty: psql()
            # strips trailing separators from its output.
            "text_content": r[6] if len(r) > 6 and r[6] else "",
        })
    return chunks


def _match_asr_segment(chunk, asr_segs):
    """Return the index of the ASR segment that contains ``chunk``, or None.

    A segment is a candidate when the chunk's time range lies within the
    segment's range (with _TIME_TOLERANCE slack on both ends). Among several
    candidates the one whose midpoint is closest to the chunk's midpoint
    wins; on a tie the earliest segment is kept.
    """
    chunk_mid = (chunk["start_time"] + chunk["end_time"]) / 2
    best_idx = None
    best_dist = None
    for idx, seg in enumerate(asr_segs):
        contains = (
            seg["start"] - _TIME_TOLERANCE <= chunk["start_time"]
            and chunk["end_time"] <= seg["end"] + _TIME_TOLERANCE
        )
        if not contains:
            continue
        dist = abs(chunk_mid - (seg["start"] + seg["end"]) / 2)
        if best_idx is None or dist < best_dist:
            best_idx, best_dist = idx, dist
    return best_idx


def _build_kept_and_corrections(db_chunks, asr_of_chunk, asr_segs):
    """Group mapped chunks by ASR segment and build the output records.

    Args:
        db_chunks: chunk dicts as produced by _parse_chunk_rows().
        asr_of_chunk: mapping chunk_index -> ASR segment index.
        asr_segs: list of ASR segment dicts ("start", "end", "text").

    Returns:
        ``(kept, corrections)``: ``kept`` has one entry per ASR segment that
        maps to exactly one chunk; ``corrections`` has one entry per ASR
        segment that maps to two or more chunks (i.e. was split).
    """
    # Look chunks up by their chunk_index value. Using chunk_index as a list
    # position is only correct when the indexes are 0-based and contiguous,
    # which the query (filtered by chunk_type) does not guarantee.
    chunk_by_index = {dc["chunk_index"]: dc for dc in db_chunks}

    chunks_by_asr = {}
    for ci, ai in asr_of_chunk.items():
        chunks_by_asr.setdefault(ai, []).append(ci)

    kept = []
    corrections = []
    for ai, child_indices in sorted(chunks_by_asr.items()):
        if len(child_indices) < 2:
            dc = chunk_by_index[child_indices[0]]
            kept.append({
                # The ASR segment index is what gets stored as chunk_index.
                "chunk_index": ai,
                "start_frame": dc["start_frame"],
                "end_frame": dc["end_frame"],
                "text_content": dc["text_content"],
            })
            continue

        children = [
            {
                "chunk_id": chunk_by_index[ci]["chunk_id"],
                "start_frame": chunk_by_index[ci]["start_frame"],
                "end_frame": chunk_by_index[ci]["end_frame"],
                "text_content": chunk_by_index[ci]["text_content"],
            }
            for ci in child_indices
        ]
        children_sorted = sorted(children, key=lambda x: x["start_frame"])

        # New chunk ids: the first child of parent ASR index N gets "N-01",
        # the second "N-02", and so on.
        for si, child in enumerate(children_sorted):
            child["new_chunk_id"] = f"{ai}-{si+1:02d}"

        a = asr_segs[ai]
        corrections.append({
            "parent_chunk_index": ai,
            "reason": "split",
            "original": {
                "start_frame": int(a["start"] * _FRAMES_PER_SECOND),
                "end_frame": int(a["end"] * _FRAMES_PER_SECOND),
                "text_content": a["text"],
            },
            "corrected": children_sorted,
        })
    return kept, corrections


def main():
    """Generate ``{UUID}.asr-1.json`` from the ASR file and the DB chunks.

    Steps:
      1. Load the ASR segments from ``{UUID}.asr.json`` in OUTPUT_DIR.
      2. Load the 'sentence' chunks of the file from ``dev.chunks``.
      3. Map every chunk to the ASR segment whose time range contains it.
      4. Record ASR segments with a single chunk as "kept" and segments
         with several chunks as "split" corrections.
      5. Write the result as JSON and print a split-size distribution.

    Chunks that match no ASR segment are not written to the output; a
    warning with their count is printed to stderr.
    """
    t0 = time.time()
    print(f"Loading ASR segments from {UUID}.asr.json...")
    asr_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr.json")
    with open(asr_path, encoding="utf-8") as f:
        asr_data = json.load(f)
    asr_segs = asr_data["segments"]
    print(f"  {len(asr_segs)} ASR segments")

    print("Loading DB sentence chunks...")
    raw = psql(
        f"SELECT chunk_index, start_frame, end_frame, start_time, end_time, chunk_id, text_content "
        f"FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='sentence' "
        f"ORDER BY chunk_index"
    )
    db_chunks = _parse_chunk_rows(raw)
    print(f"  {len(db_chunks)} DB chunks")

    # For each DB chunk, find the best-matching ASR segment.
    asr_of_chunk = {}  # chunk_index -> asr_idx
    for dc in db_chunks:
        best_asr = _match_asr_segment(dc, asr_segs)
        if best_asr is not None:
            asr_of_chunk[dc["chunk_index"]] = best_asr

    print(f"  Mapped: {len(asr_of_chunk)} / {len(db_chunks)} chunks to ASR segments")
    unmapped = sum(1 for dc in db_chunks if dc["chunk_index"] not in asr_of_chunk)
    if unmapped:
        print(
            f"  WARNING: {unmapped} chunks matched no ASR segment and are "
            f"omitted from the output",
            file=sys.stderr,
        )

    kept, corrections = _build_kept_and_corrections(db_chunks, asr_of_chunk, asr_segs)

    total_corrected = sum(len(c["corrected"]) for c in corrections)
    print(f"  Kept chunks: {len(kept)}")
    print(f"  Corrected chunks: {total_corrected}")
    print(f"  Total: {len(kept) + total_corrected} (should be {len(db_chunks)})\n")

    # Write output
    output = {
        "file_uuid": UUID,
        "asr_version": 1,
        "kept": kept,
        "corrections": corrections,
    }
    output_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr-1.json")
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nSaved: {output_path} ({os.path.getsize(output_path) / 1024:.0f} KB)")

    # Stats: how many ASR segments were split into n children.
    split_sizes = {}
    for c in corrections:
        n = len(c["corrected"])
        split_sizes[n] = split_sizes.get(n, 0) + 1
    print("\nSplit distribution:")
    for n in sorted(split_sizes):
        print(f"  {n} children: {split_sizes[n]} ASR segments → {n * split_sizes[n]} chunks")

    elapsed = time.time() - t0
    print(f"\nElapsed: {elapsed:.1f}s")
|
||||
|
||||
|
||||
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user