Files
momentry_core/scripts/tkg_builder.py
Accusys fc16e7b1c3 fix: Phase 1 pipeline fully operational
- store_traced_faces.py: add --uuid arg for PythonExecutor compat
- tkg_builder.py: add --uuid arg + timestamp_secs column fix
- release_pack.py: fix pg_dump/psql paths, proper JSON escaping
- pipeline_checklist.py: new independent verification tool

Phase 1 checklist 8/8 PASS:
ASR  ASRX  sentence chunks  vector embeddings 
face trace  TKG graph  trace chunks  Phase 1 release 
2026-05-09 17:21:17 +08:00

470 lines
16 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
TKG Builder - Populate Temporal Knowledge Graph from pipeline results
Builds graph nodes and edges from:
- Face traces (face_detections with trace_id + bbox)
- YOLO objects (yolo.json)
- Speaker segments (asrx.json)
Graph Structure:
NODES:
(face_trace:N) - one per unique trace_id per file
(object:C) - one per unique yolo class
(speaker:S) - one per speaker_id
EDGES:
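(face_trace) -[:APPEARS_IN]-> (frame:N) -- planned; not created by this script yet
(object) -[:APPEARS_IN]-> (frame:N) -- planned; not created by this script yet
(face_trace) -[:CO_OCCURS_WITH]-> (object) -- same frame, same file
(face_trace) -[:CO_OCCURS_WITH]-> (face_trace) -- same frame, same file
(face_trace) -[:SPEAKS_AS]-> (speaker) -- face trace overlaps a speaker segment in time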
Usage:
python tkg_builder.py --file-uuid <uuid> [--schema <schema>] [--uuid <uuid>]
"""
import sys
import os
import json
import argparse
import psycopg2
import psycopg2.extras
# PostgreSQL connection string; override with the DATABASE_URL environment variable.
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
# Default database schema for the --schema option; override with MOMENTRY_DB_SCHEMA.
SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
# Directory searched for per-file pipeline outputs (<file_uuid>.yolo.json,
# <file_uuid>.asrx.json); override with MOMENTRY_OUTPUT_DIR.
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
def get_conn():
    """Open and return a new psycopg2 connection to the database at ``DB_URL``."""
    connection = psycopg2.connect(DB_URL)
    return connection
def ensure_node(cur, schema, file_uuid, node_type, external_id, label="", properties=None):
    """Insert a graph node, or update the existing one, and return its id.

    A node is identified by ``(file_uuid, node_type, external_id)``. When a
    node with that key already exists, its label and properties are replaced
    only if new, non-empty values were supplied; an empty ``label`` or
    missing/empty ``properties`` leaves the stored value untouched.

    Args:
        cur: Open database cursor (psycopg2-style ``execute``/``fetchone``).
        schema: Schema holding ``tkg_nodes``. It is interpolated directly
            into the SQL text, so it must be a trusted identifier.
        file_uuid: UUID of the media file the node belongs to.
        node_type: Node category, e.g. ``"face_trace"``.
        external_id: Identifier of the node within its type; stored as text.
        label: Human-readable label; ``""`` keeps an existing label.
        properties: JSON-serialisable dict; ``None``/``{}`` keeps existing
            properties on update and stores ``{}`` on first insert.

    Returns:
        The ``id`` of the inserted or updated row.
    """
    # The previous COALESCE(EXCLUDED.properties, ...) could never fall back,
    # because '{}' is always sent instead of NULL; an explicit flag tells the
    # UPDATE branch whether the caller really provided properties.
    has_properties = bool(properties)
    cur.execute(
        f"""
        INSERT INTO {schema}.tkg_nodes (node_type, external_id, file_uuid, label, properties)
        VALUES (%s, %s, %s, %s, %s::jsonb)
        ON CONFLICT (file_uuid, node_type, external_id)
        DO UPDATE SET properties = CASE WHEN %s THEN EXCLUDED.properties
                                        ELSE tkg_nodes.properties END,
                      label = COALESCE(NULLIF(EXCLUDED.label, ''), tkg_nodes.label)
        RETURNING id
        """,
        (
            node_type,
            str(external_id),
            file_uuid,
            label,
            json.dumps(properties or {}),
            has_properties,
        ),
    )
    return cur.fetchone()[0]
def ensure_edge(cur, schema, file_uuid, edge_type, source_id, target_id, properties=None):
    """Insert a graph edge, or update the properties of the existing one.

    An edge is identified by ``(file_uuid, edge_type, source_node_id,
    target_node_id)``. When that edge already exists, its properties are
    replaced only if non-empty ``properties`` were supplied.

    Args:
        cur: Open database cursor (psycopg2-style ``execute``).
        schema: Schema holding ``tkg_edges``. It is interpolated directly
            into the SQL text, so it must be a trusted identifier.
        file_uuid: UUID of the media file the edge belongs to.
        edge_type: Relationship name, e.g. ``"CO_OCCURS_WITH"``.
        source_id: ``tkg_nodes.id`` of the source node.
        target_id: ``tkg_nodes.id`` of the target node.
        properties: JSON-serialisable dict; ``None``/``{}`` keeps existing
            properties on update and stores ``{}`` on first insert.
    """
    # '{}' is always sent instead of NULL, so a COALESCE on
    # EXCLUDED.properties can never preserve the stored value; pass an
    # explicit flag instead.
    has_properties = bool(properties)
    cur.execute(
        f"""
        INSERT INTO {schema}.tkg_edges (edge_type, source_node_id, target_node_id, file_uuid, properties)
        VALUES (%s, %s, %s, %s, %s::jsonb)
        ON CONFLICT (file_uuid, edge_type, source_node_id, target_node_id)
        DO UPDATE SET properties = CASE WHEN %s THEN EXCLUDED.properties
                                        ELSE tkg_edges.properties END
        """,
        (
            edge_type,
            source_id,
            target_id,
            file_uuid,
            json.dumps(properties or {}),
            has_properties,
        ),
    )
def build_face_trace_nodes(cur, schema, file_uuid):
    """Upsert one ``face_trace`` node per trace_id found for the file.

    Aggregates ``face_detections`` rows per trace (detection count, first and
    last frame, mean bounding box) and stores the result as node properties.

    Returns:
        Number of face trace nodes written.
    """
    print("[TKG] Building face trace nodes...")
    cur.execute(
        f"""
        SELECT trace_id, COUNT(*) as frame_count,
               MIN(frame_number) as start_f, MAX(frame_number) as end_f,
               AVG(x::float) as avg_x,
               AVG(y::float) as avg_y,
               AVG(width::float) as avg_w,
               AVG(height::float) as avg_h
        FROM {schema}.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL
        GROUP BY trace_id
        ORDER BY trace_id
        """,
        (file_uuid,),
    )
    trace_rows = cur.fetchall()
    for trace_id, n_frames, first_f, last_f, mean_x, mean_y, mean_w, mean_h in trace_rows:
        # A NULL average is stored as 0.
        mean_bbox = {
            "x": round(mean_x or 0, 1),
            "y": round(mean_y or 0, 1),
            "width": round(mean_w or 0, 1),
            "height": round(mean_h or 0, 1),
        }
        ensure_node(
            cur, schema, file_uuid, "face_trace",
            f"trace_{trace_id}",
            f"Face Trace {trace_id}",
            {
                "frame_count": n_frames,
                "start_frame": first_f,
                "end_frame": last_f,
                "avg_bbox": mean_bbox,
            },
        )
    created = len(trace_rows)
    print(f"[TKG] {created} face trace nodes created")
    return created
def load_json_safe(path):
    """Load a JSON file, tolerating a truncated (still being written) tail.

    The file is read exactly once so the strict parse and the recovery
    attempt both work on the same snapshot, even if a writer is still
    appending to it.

    Recovery is a heuristic: the text is cut after the last ``"}`` sequence
    and two closing braces are appended, which only repairs documents whose
    truncation point sits two object levels deep.

    Args:
        path: Path of the JSON file.

    Returns:
        The parsed document, or ``None`` when the file does not exist or
        cannot be parsed even after the recovery attempt.
    """
    try:
        with open(path, encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        return None

    try:
        return json.loads(content)
    except json.JSONDecodeError:
        print(f"[TKG] Warning: {path} is in-progress, loading partial data")

    # Truncate to the last point that looks like the end of a complete entry.
    last_valid = content.rfind('"}')
    if last_valid > 0:
        try:
            return json.loads(content[: last_valid + 2] + "\n}}")
        except json.JSONDecodeError:
            pass
    return None
def build_yolo_object_nodes(cur, schema, file_uuid):
    """Upsert one ``object`` node per YOLO class found in ``<file_uuid>.yolo.json``.

    Each node stores the total number of detections of its class across all
    frames. A frame's detections are read from its ``"detections"`` key,
    falling back to ``"objects"``.

    Returns:
        Number of object class nodes written (0 when yolo.json is unavailable).
    """
    yolo = load_json_safe(os.path.join(OUTPUT_DIR, f"{file_uuid}.yolo.json"))
    if yolo is None:
        print("[TKG] yolo.json not available, skipping object nodes")
        return 0

    # Tally detections per class name over every frame.
    tally = {}
    for frame in yolo.get("frames", {}).values():
        for detection in frame.get("detections", frame.get("objects", [])):
            name = detection.get("class_name", "unknown")
            tally[name] = tally.get(name, 0) + 1

    for name in sorted(tally):
        ensure_node(
            cur, schema, file_uuid, "object",
            name, name,
            {"total_detections": tally[name]},
        )

    created = len(tally)
    print(f"[TKG] {created} object class nodes created")
    return created
def build_speaker_nodes(cur, schema, file_uuid):
    """Upsert one ``speaker`` node per entry of ``speaker_stats`` in asrx.json.

    Reads ``<file_uuid>.asrx.json`` from ``OUTPUT_DIR``; each speaker id
    becomes a node whose ``segment_count`` is that speaker's ``count`` value.

    Returns:
        Number of speaker nodes written (0 when asrx.json does not exist).
    """
    asrx_path = os.path.join(OUTPUT_DIR, f"{file_uuid}.asrx.json")
    if not os.path.exists(asrx_path):
        print("[TKG] asrx.json not found, skipping speaker nodes")
        return 0

    with open(asrx_path) as fh:
        speaker_stats = json.load(fh).get("speaker_stats", {})

    for speaker_id, info in speaker_stats.items():
        ensure_node(
            cur, schema, file_uuid, "speaker",
            speaker_id, speaker_id,
            {"segment_count": info.get("count", 0)},
        )

    created = len(speaker_stats)
    print(f"[TKG] {created} speaker nodes created")
    return created
def build_co_occurrence_edges(cur, schema, file_uuid):
    """Build CO_OCCURS_WITH edges between face traces and YOLO objects.

    For every face detection that has a trace_id, each YOLO detection in the
    same frame yields an edge from the face trace node to the object class
    node. Edge properties hold the frame number, the centre-to-centre pixel
    distance, the object confidence and both bounding boxes.

    Edges are upserted through ``ensure_edge``, so a (face trace, object
    class) pair seen in several frames keeps the properties of the last
    frame processed, and the returned count is the number of successful
    upserts rather than the number of distinct edges.

    A failing edge is isolated with a SAVEPOINT so that only that edge is
    undone; nodes and edges written earlier in the same transaction are
    kept. This requires the cursor's connection to be inside a transaction
    (psycopg2's default, non-autocommit mode).

    Args:
        cur: Open database cursor.
        schema: Schema holding the TKG tables (trusted identifier).
        file_uuid: UUID of the media file to process.

    Returns:
        Number of edge upserts performed (0 when yolo.json is unavailable).
    """
    print("[TKG] Building co-occurrence edges (face-object within same frame)...")
    yolo_path = os.path.join(OUTPUT_DIR, f"{file_uuid}.yolo.json")
    yolo = load_json_safe(yolo_path)
    if yolo is None:
        print("[TKG] yolo.json not available, skipping co-occurrence")
        return 0
    yolo_frames = yolo.get("frames", {})

    # Face detections that belong to a trace.
    cur.execute(
        f"""
        SELECT trace_id, frame_number, x, y, width, height
        FROM {schema}.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL
        ORDER BY frame_number
        """,
        (file_uuid,),
    )
    face_rows = cur.fetchall()
    print(f"[TKG] Checking {len(face_rows)} face detections against YOLO frames...")

    # Resolve node ids once instead of issuing a SELECT per detection.
    cur.execute(
        f"SELECT node_type, external_id, id FROM {schema}.tkg_nodes "
        "WHERE file_uuid=%s AND node_type IN ('face_trace', 'object')",
        (file_uuid,),
    )
    node_ids = {
        (node_type, external_id): node_id
        for node_type, external_id, node_id in cur.fetchall()
    }

    edge_count = 0
    failed_count = 0
    for tid, frame_num, fx, fy, fw, fh in face_rows:
        yolo_frame = yolo_frames.get(str(frame_num))
        if not yolo_frame:
            continue
        detections = yolo_frame.get("detections", yolo_frame.get("objects", []))
        if not detections:
            continue
        face_node_id = node_ids.get(("face_trace", f"trace_{tid}"))
        if face_node_id is None:
            continue

        for det in detections:
            cls = det.get("class_name", "unknown")
            confidence = det.get("confidence", 0)
            obj_node_id = node_ids.get(("object", cls))
            if obj_node_id is None:
                continue

            # Centre-to-centre distance between the face box and object box.
            fc_x = fx + fw / 2
            fc_y = fy + fh / 2
            od_x = det.get("x1", 0) + (det.get("x2", 0) - det.get("x1", 0)) / 2
            od_y = det.get("y1", 0) + (det.get("y2", 0) - det.get("y1", 0)) / 2
            distance = ((fc_x - od_x) ** 2 + (fc_y - od_y) ** 2) ** 0.5

            edge_props = {
                "frame": frame_num,
                "distance_px": round(distance, 1),
                "object_confidence": confidence,
                "face_bbox": {"x": fx, "y": fy, "width": fw, "height": fh},
                "object_bbox": {
                    "x1": det.get("x1"), "y1": det.get("y1"),
                    "x2": det.get("x2"), "y2": det.get("y2"),
                },
            }

            cur.execute("SAVEPOINT tkg_co_occurrence_edge")
            try:
                ensure_edge(
                    cur, schema, file_uuid,
                    "CO_OCCURS_WITH",
                    face_node_id, obj_node_id,
                    edge_props,
                )
            except Exception as e:
                # Best effort: undo only this edge. A full connection
                # rollback here would discard every node and edge written
                # so far in the still-uncommitted transaction.
                cur.execute("ROLLBACK TO SAVEPOINT tkg_co_occurrence_edge")
                failed_count += 1
                print(
                    f"[TKG] Warning: skipped co-occurrence edge "
                    f"trace_{tid} -> {cls} (frame {frame_num}): {e}"
                )
                continue
            cur.execute("RELEASE SAVEPOINT tkg_co_occurrence_edge")
            edge_count += 1

    if failed_count:
        print(f"[TKG] {failed_count} co-occurrence edges skipped due to errors")
    print(f"[TKG] {edge_count} co-occurrence edges created")
    return edge_count
def build_speaker_face_edges(cur, schema, file_uuid):
    """Build SPEAKS_AS edges from face traces to speakers by temporal overlap.

    A face trace's frame span is converted to seconds and compared with every
    speaker segment in ``<file_uuid>.asrx.json``. An edge is written when the
    overlap covers at least 30% of the face trace's duration.

    The frame rate is estimated from the last segment that has a positive
    ``end_frame`` and ``end_time``; when no segment provides both, 30 fps is
    assumed.

    Edges are upserted through ``ensure_edge``, so several qualifying
    segments of the same speaker update the same edge, and the returned
    count is the number of upserts.

    Args:
        cur: Open database cursor.
        schema: Schema holding the TKG tables (trusted identifier).
        file_uuid: UUID of the media file to process.

    Returns:
        Number of edge upserts performed (0 when asrx.json is missing or has
        no segments).
    """
    asrx_path = os.path.join(OUTPUT_DIR, f"{file_uuid}.asrx.json")
    if not os.path.exists(asrx_path):
        print("[TKG] asrx.json not found, skipping speaker edges")
        return 0
    with open(asrx_path) as f:
        asrx = json.load(f)
    segments = asrx.get("segments", [])
    if not segments:
        print("[TKG] No speaker segments found")
        return 0

    # Frame span of every face trace.
    cur.execute(
        f"""
        SELECT trace_id, MIN(frame_number) as start_f, MAX(frame_number) as end_f
        FROM {schema}.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL
        GROUP BY trace_id
        """,
        (file_uuid,),
    )
    traces = cur.fetchall()

    # Derive fps from the latest usable segment. Indexing the last segment
    # directly would raise on a missing "end_frame" or a zero "end_time".
    fps = 30.0
    for seg in reversed(segments):
        end_frame = seg.get("end_frame")
        end_time = seg.get("end_time")
        if end_frame and end_time and end_frame > 0 and end_time > 0:
            fps = end_frame / end_time
            break

    # Resolve node ids once instead of issuing a SELECT per trace/segment.
    cur.execute(
        f"SELECT node_type, external_id, id FROM {schema}.tkg_nodes "
        "WHERE file_uuid=%s AND node_type IN ('face_trace', 'speaker')",
        (file_uuid,),
    )
    node_ids = {
        (node_type, external_id): node_id
        for node_type, external_id, node_id in cur.fetchall()
    }

    edge_count = 0
    for tid, sf, ef in traces:
        face_node_id = node_ids.get(("face_trace", f"trace_{tid}"))
        if face_node_id is None:
            continue
        face_start_sec = sf / fps
        face_end_sec = ef / fps
        face_dur = face_end_sec - face_start_sec

        for seg in segments:
            speaker_id = seg.get("speaker_id", "")
            seg_start = seg.get("start_time", 0)
            seg_end = seg.get("end_time", 0)

            # Intersection of the face interval and the segment interval.
            overlap_start = max(face_start_sec, seg_start)
            overlap_end = min(face_end_sec, seg_end)
            if overlap_start >= overlap_end:
                continue
            overlap_dur = overlap_end - overlap_start
            overlap_ratio = overlap_dur / face_dur if face_dur > 0 else 0
            if overlap_ratio < 0.3:  # minimum 30% overlap
                continue

            speaker_node_id = node_ids.get(("speaker", speaker_id))
            if speaker_node_id is None:
                continue

            ensure_edge(
                cur, schema, file_uuid,
                "SPEAKS_AS",
                face_node_id, speaker_node_id,
                {
                    "overlap_ratio": round(overlap_ratio, 3),
                    "overlap_duration_s": round(overlap_dur, 1),
                    "face_time_range": f"{face_start_sec:.1f}-{face_end_sec:.1f}s",
                    "speaker_time_range": f"{seg_start:.1f}-{seg_end:.1f}s",
                },
            )
            edge_count += 1

    print(f"[TKG] {edge_count} speaker-face edges created")
    return edge_count
def build_face_face_edges(cur, schema, file_uuid):
    """Build CO_OCCURS_WITH edges between face traces sharing a frame.

    Two traces co-occur when both have a detection with the same
    frame_number. One edge is written per unordered pair, directed from the
    smaller trace_id to the larger, with the first shared frame and the
    number of shared detections as properties.

    The per-pair aggregation is done in SQL, so only one row per pair is
    transferred instead of one row per shared frame.

    Args:
        cur: Open database cursor.
        schema: Schema holding the TKG tables (trusted identifier).
        file_uuid: UUID of the media file to process.

    Returns:
        Number of face-face edges written.
    """
    print("[TKG] Building face-face co-occurrence edges...")
    cur.execute(
        f"""
        SELECT a.trace_id AS tid_a, b.trace_id AS tid_b,
               MIN(a.frame_number) AS first_frame,
               COUNT(*) AS frame_count
        FROM {schema}.face_detections a
        JOIN {schema}.face_detections b
          ON a.file_uuid = b.file_uuid
         AND a.frame_number = b.frame_number
         AND a.trace_id < b.trace_id
        WHERE a.file_uuid = %s
          AND a.trace_id IS NOT NULL
          AND b.trace_id IS NOT NULL
        GROUP BY a.trace_id, b.trace_id
        ORDER BY first_frame, a.trace_id, b.trace_id
        """,
        (file_uuid,),
    )
    pairs = cur.fetchall()
    if not pairs:
        print("[TKG] No face-face co-occurrences found")
        return 0

    # Resolve all face trace node ids once instead of two SELECTs per pair.
    cur.execute(
        f"SELECT external_id, id FROM {schema}.tkg_nodes "
        "WHERE file_uuid=%s AND node_type='face_trace'",
        (file_uuid,),
    )
    trace_node_ids = dict(cur.fetchall())

    edge_count = 0
    for tid_a, tid_b, first_frame, frame_count in pairs:
        node_a = trace_node_ids.get(f"trace_{tid_a}")
        node_b = trace_node_ids.get(f"trace_{tid_b}")
        if node_a is None or node_b is None:
            continue
        ensure_edge(
            cur, schema, file_uuid,
            "CO_OCCURS_WITH",
            node_a, node_b,
            {
                "first_frame": int(first_frame),
                "frame_count": int(frame_count),
            },
        )
        edge_count += 1

    print(f"[TKG] {edge_count} face-face co-occurrence edges created")
    return edge_count
def main():
    """Command-line entry point: build the graph for one file and commit it.

    Parses ``--file-uuid`` (required), ``--schema`` and ``--uuid`` (accepted
    but not used by this function), builds all node types followed by all
    edge types inside a single transaction, commits, and prints a summary.

    The schema name is interpolated into SQL text by the builder functions,
    so it is validated as a plain identifier before any query is issued.
    If any build step raises, nothing is committed and the connection is
    still closed.
    """
    import re  # local import: only needed for the CLI validation below

    parser = argparse.ArgumentParser(description="Build Temporal Knowledge Graph")
    parser.add_argument("--file-uuid", required=True)
    parser.add_argument("--schema", default=SCHEMA)
    parser.add_argument("--uuid", help="UUID for Redis tracking (accepted by executor)")
    args = parser.parse_args()

    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_$]*", args.schema):
        parser.error(f"invalid schema name: {args.schema!r}")

    conn = get_conn()
    try:
        cur = conn.cursor()
        try:
            print(f"[TKG] Building graph for {args.file_uuid}...")
            n1 = build_face_trace_nodes(cur, args.schema, args.file_uuid)
            n2 = build_yolo_object_nodes(cur, args.schema, args.file_uuid)
            n3 = build_speaker_nodes(cur, args.schema, args.file_uuid)
            e1 = build_co_occurrence_edges(cur, args.schema, args.file_uuid)
            e2 = build_speaker_face_edges(cur, args.schema, args.file_uuid)
            e3 = build_face_face_edges(cur, args.schema, args.file_uuid)
            conn.commit()
        finally:
            cur.close()
    finally:
        # Closing without a commit discards the open transaction, so a
        # failed run leaves no partial graph behind.
        conn.close()

    print(f"\n[TKG] Complete: {n1+n2+n3} nodes, {e1+e2+e3} edges")
    print(f" Face traces: {n1}")
    print(f" Objects: {n2}")
    print(f" Speakers: {n3}")
    print(f" Co-occur: {e1}")
    print(f" Speaker-face:{e2}")
    print(f" Face-face: {e3}")


if __name__ == "__main__":
    main()