feat: trace quality agent selection report, identity clustering runner_v2 DB write, age/gender CoreML selection, updated experiment config UUID
This commit is contained in:
399
scripts/tkg_builder.py
Normal file
399
scripts/tkg_builder.py
Normal file
@@ -0,0 +1,399 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
TKG Builder - Populate Temporal Knowledge Graph from pipeline results
|
||||
|
||||
Builds graph nodes and edges from:
|
||||
- Face traces (face_detections with trace_id + bbox)
|
||||
- YOLO objects (yolo.json)
|
||||
- Speaker segments (asrx.json)
|
||||
|
||||
Graph Structure:
|
||||
NODES:
|
||||
(face_trace:N) - one per unique trace_id per file
|
||||
(object:C) - one per unique yolo class
|
||||
(speaker:S) - one per speaker_id
|
||||
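EDGES:
  (face_trace) -[:CO_OCCURS_WITH]-> (object)  -- same frame, same file
  (face_trace) -[:SPEAKS_AS]-> (speaker)      -- face trace overlaps a speaker segment for >= 30% of its duration
  (face_trace) -[:APPEARS_IN]-> (frame:N)     -- not currently created by this script
  (object) -[:APPEARS_IN]-> (frame:N)         -- not currently created by this script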
|
||||
|
||||
Usage:
|
||||
python tkg_builder.py --file-uuid <uuid> [--schema <schema>]
|
||||
"""
|
||||
|
||||
import argparse
import json
import os
import re
import sys

import psycopg2
import psycopg2.extras
|
||||
|
||||
# Runtime configuration. Each value can be overridden through the environment;
# the literals below are the fallbacks used when the variable is unset.

# PostgreSQL connection string handed to psycopg2.connect().
DB_URL = os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")

# Default database schema (also the default for the --schema CLI option).
SCHEMA = os.getenv("MOMENTRY_DB_SCHEMA", "dev")

# Directory searched for the per-file "<file_uuid>.yolo.json" / "<file_uuid>.asrx.json" results.
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
|
||||
|
||||
|
||||
def get_conn():
    """Open and return a new psycopg2 connection to the database at DB_URL."""
    connection = psycopg2.connect(DB_URL)
    return connection
|
||||
|
||||
|
||||
def ensure_node(cur, schema, file_uuid, node_type, external_id, label="", properties=None):
    """Insert a graph node, or update the existing one, and return its id.

    Nodes are keyed by (file_uuid, node_type, external_id). When the node
    already exists, an empty ``label`` or empty/omitted ``properties`` leaves
    the stored value untouched instead of overwriting it.

    Args:
        cur: Open database cursor used to execute the statement.
        schema: Schema that holds ``tkg_nodes``. It is interpolated into the
            SQL text (identifiers cannot be bound as parameters), so it must
            come from a trusted source.
        file_uuid: File the node belongs to.
        node_type: Node category, e.g. "face_trace", "object" or "speaker".
        external_id: Identifier of the node within its type; stored as text.
        label: Human-readable label; "" keeps an existing label.
        properties: JSON-serialisable dict; None or {} keeps existing properties.

    Returns:
        The ``id`` of the inserted or updated row.
    """
    payload = json.dumps(properties or {})
    # The payload is never NULL, so a bare COALESCE(EXCLUDED.properties, ...)
    # could never fall back to the stored value. NULLIF maps the empty object
    # to NULL first, mirroring how the empty label is handled.
    cur.execute(
        f"""
        INSERT INTO {schema}.tkg_nodes (node_type, external_id, file_uuid, label, properties)
        VALUES (%s, %s, %s, %s, %s::jsonb)
        ON CONFLICT (file_uuid, node_type, external_id)
        DO UPDATE SET properties = COALESCE(NULLIF(EXCLUDED.properties, '{{}}'::jsonb), tkg_nodes.properties),
                      label = COALESCE(NULLIF(EXCLUDED.label, ''), tkg_nodes.label)
        RETURNING id
        """,
        (node_type, str(external_id), file_uuid, label, payload),
    )
    return cur.fetchone()[0]
|
||||
|
||||
|
||||
def ensure_edge(cur, schema, file_uuid, edge_type, source_id, target_id, properties=None):
    """Insert a graph edge, or update the properties of the existing one.

    Edges are keyed by (file_uuid, edge_type, source_node_id, target_node_id),
    so writing the same pair again updates the existing row rather than adding
    a second edge. Empty or omitted ``properties`` leave stored properties
    untouched.

    Args:
        cur: Open database cursor used to execute the statement.
        schema: Schema that holds ``tkg_edges``. It is interpolated into the
            SQL text (identifiers cannot be bound as parameters), so it must
            come from a trusted source.
        file_uuid: File the edge belongs to.
        edge_type: Relationship name, e.g. "CO_OCCURS_WITH" or "SPEAKS_AS".
        source_id: ``tkg_nodes.id`` of the source node.
        target_id: ``tkg_nodes.id`` of the target node.
        properties: JSON-serialisable dict; None or {} keeps existing properties.
    """
    payload = json.dumps(properties or {})
    # The payload is never NULL, so without NULLIF the COALESCE fallback to the
    # stored properties would be unreachable.
    cur.execute(
        f"""
        INSERT INTO {schema}.tkg_edges (edge_type, source_node_id, target_node_id, file_uuid, properties)
        VALUES (%s, %s, %s, %s, %s::jsonb)
        ON CONFLICT (file_uuid, edge_type, source_node_id, target_node_id)
        DO UPDATE SET properties = COALESCE(NULLIF(EXCLUDED.properties, '{{}}'::jsonb), tkg_edges.properties)
        """,
        (edge_type, source_id, target_id, file_uuid, payload),
    )
|
||||
|
||||
|
||||
def build_face_trace_nodes(cur, schema, file_uuid):
    """Upsert one "face_trace" node per trace_id found for the file.

    Aggregates the file's face_detections rows by trace_id (frame count,
    first/last frame, mean bounding box) and stores the summary as node
    properties via ensure_node.

    Returns:
        Number of trace rows processed.
    """
    print("[TKG] Building face trace nodes...")

    summary_sql = f"""
        SELECT trace_id, COUNT(*) as frame_count,
               MIN(frame_number) as start_f, MAX(frame_number) as end_f,
               AVG(x::float) as avg_x,
               AVG(y::float) as avg_y,
               AVG(width::float) as avg_w,
               AVG(height::float) as avg_h
        FROM {schema}.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL
        GROUP BY trace_id
        ORDER BY trace_id
        """
    cur.execute(summary_sql, (file_uuid,))

    created = 0
    for trace_id, n_frames, first_frame, last_frame, mean_x, mean_y, mean_w, mean_h in cur.fetchall():
        # A NULL average is stored as 0 rather than failing in round().
        mean_bbox = {
            "x": round(mean_x or 0, 1),
            "y": round(mean_y or 0, 1),
            "width": round(mean_w or 0, 1),
            "height": round(mean_h or 0, 1),
        }
        node_props = {
            "frame_count": n_frames,
            "start_frame": first_frame,
            "end_frame": last_frame,
            "avg_bbox": mean_bbox,
        }
        ensure_node(
            cur,
            schema,
            file_uuid,
            "face_trace",
            f"trace_{trace_id}",
            f"Face Trace {trace_id}",
            node_props,
        )
        created += 1

    print(f"[TKG] {created} face trace nodes created")
    return created
|
||||
|
||||
|
||||
def _repair_truncated_json(content, max_attempts=16):
    """Parse the longest recoverable prefix of a truncated JSON document.

    Scans the text while tracking open objects/arrays, remembers the offsets
    at which a container was just closed, then cuts the text at the most
    recent such offset and appends the closers for whatever is still open.

    Returns the parsed value, or None when nothing could be recovered.
    """
    # A complete string literal, a structural bracket, or a lone quote
    # (the start of a string that was cut off by the truncation).
    token_re = re.compile(r'"(?:[^"\\]|\\.)*"|[{}\[\]"]')
    closer_for = {"{": "}", "[": "]"}

    pending = []      # closers still owed, innermost last
    checkpoints = []  # (offset just after a closed container, closers needed there)
    for match in token_re.finditer(content):
        token = match.group()
        if token in closer_for:
            pending.append(closer_for[token])
        elif token in ("}", "]"):
            if not pending or pending[-1] != token:
                break  # structure is corrupt from here on
            pending.pop()
            checkpoints.append((match.end(), "".join(reversed(pending))))
            if len(checkpoints) > max_attempts:
                del checkpoints[0]  # only the most recent candidates are tried
        elif token == '"':
            break  # unterminated string: this is the truncation point
        # Complete string literals carry no structure; nothing to track.

    for end, closers in reversed(checkpoints):
        try:
            return json.loads(content[:end] + closers)
        except json.JSONDecodeError:
            continue
    return None


def load_json_safe(path):
    """Load a JSON file, tolerating a file that is still being written.

    Args:
        path: Path of the JSON file.

    Returns:
        The parsed document. If the file is not valid JSON (e.g. its tail is
        truncated), the largest well-formed prefix is returned with any open
        objects/arrays closed. None if the file does not exist or nothing
        could be recovered.
    """
    if not os.path.exists(path):
        return None
    try:
        with open(path, encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        # The file disappeared between the existence check and the open.
        return None

    try:
        return json.loads(content)
    except json.JSONDecodeError:
        print(f"[TKG] Warning: {path} is in-progress, loading partial data")
    return _repair_truncated_json(content)
|
||||
|
||||
|
||||
def build_yolo_object_nodes(cur, schema, file_uuid):
    """Upsert one "object" node per class name found in the file's yolo.json.

    Detections are read from each frame's "detections" list (falling back to
    "objects"); the number of detections per class is stored on the node as
    "total_detections".

    Returns:
        Number of distinct classes processed, or 0 when yolo.json cannot be loaded.
    """
    yolo = load_json_safe(os.path.join(OUTPUT_DIR, f"{file_uuid}.yolo.json"))
    if yolo is None:
        print("[TKG] yolo.json not available, skipping object nodes")
        return 0

    # Tally detections per class across every frame.
    tallies = {}
    for frame in yolo.get("frames", {}).values():
        for detection in frame.get("detections", frame.get("objects", [])):
            name = detection.get("class_name", "unknown")
            tallies[name] = tallies.get(name, 0) + 1

    created = 0
    for name, total in sorted(tallies.items()):
        ensure_node(cur, schema, file_uuid, "object", name, name, {"total_detections": total})
        created += 1

    print(f"[TKG] {created} object class nodes created")
    return created
|
||||
|
||||
|
||||
def build_speaker_nodes(cur, schema, file_uuid):
    """Upsert one "speaker" node per entry in asrx.json's "speaker_stats".

    Each entry's "count" value is stored on the node as "segment_count".

    Returns:
        Number of speakers processed, or 0 when asrx.json does not exist.
    """
    asrx_file = os.path.join(OUTPUT_DIR, f"{file_uuid}.asrx.json")
    if not os.path.exists(asrx_file):
        print("[TKG] asrx.json not found, skipping speaker nodes")
        return 0

    with open(asrx_file) as handle:
        asrx = json.load(handle)

    created = 0
    for speaker_id, info in asrx.get("speaker_stats", {}).items():
        node_props = {"segment_count": info.get("count", 0)}
        ensure_node(cur, schema, file_uuid, "speaker", speaker_id, speaker_id, node_props)
        created += 1

    print(f"[TKG] {created} speaker nodes created")
    return created
|
||||
|
||||
|
||||
def build_co_occurrence_edges(cur, schema, file_uuid):
    """Build CO_OCCURS_WITH edges between face traces and YOLO objects.

    For every face detection that has a trace_id, each object detected by
    YOLO in the same frame yields an edge face_trace -> object. The edge
    properties record the frame, the centre-to-centre pixel distance, the
    object confidence and both bounding boxes.

    Edges are written through ensure_edge, which upserts on
    (file, edge type, source, target); a pair seen in several frames therefore
    ends up as one edge holding the properties of the last frame written.

    A failed edge write is reported and skipped. Inside a transaction it is
    isolated with a savepoint so that earlier work in the same transaction is
    kept.

    Args:
        cur: Open database cursor; face_trace and object nodes must already exist.
        schema: Database schema name (interpolated into the SQL text).
        file_uuid: File whose detections are processed.

    Returns:
        Number of successful edge writes (upserts), or 0 when yolo.json
        cannot be loaded.
    """
    print("[TKG] Building co-occurrence edges (face-object within same frame)...")

    yolo = load_json_safe(os.path.join(OUTPUT_DIR, f"{file_uuid}.yolo.json"))
    if yolo is None:
        print("[TKG] yolo.json not available, skipping co-occurrence")
        return 0
    yolo_frames = yolo.get("frames", {})

    # Face detections that belong to a trace
    cur.execute(
        f"""
        SELECT trace_id, frame_number, x, y, width, height
        FROM {schema}.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL
        ORDER BY frame_number
        """,
        (file_uuid,),
    )
    face_rows = cur.fetchall()
    print(f"[TKG] Checking {len(face_rows)} face detections against YOLO frames...")

    # Resolve node ids once up front instead of issuing one SELECT per face
    # row and per detection.
    cur.execute(
        f"SELECT node_type, external_id, id FROM {schema}.tkg_nodes"
        " WHERE file_uuid=%s AND node_type IN ('face_trace', 'object')",
        (file_uuid,),
    )
    node_ids = {(ntype, ext_id): nid for ntype, ext_id, nid in cur.fetchall()}

    # Savepoints only exist inside a transaction; in autocommit mode each
    # statement already stands alone.
    use_savepoint = not cur.connection.autocommit
    savepoint = "tkg_co_occurrence_edge"

    edge_count = 0
    for tid, frame_num, fx, fy, fw, fh in face_rows:
        # yolo.json keys its frames by the frame number as a string.
        yolo_frame = yolo_frames.get(str(frame_num))
        if not yolo_frame:
            continue

        detections = yolo_frame.get("detections", yolo_frame.get("objects", []))
        if not detections:
            continue

        face_node_id = node_ids.get(("face_trace", f"trace_{tid}"))
        if face_node_id is None:
            continue

        # Face centre; identical for every object in this frame.
        fc_x = fx + fw / 2
        fc_y = fy + fh / 2

        for det in detections:
            cls = det.get("class_name", "unknown")
            confidence = det.get("confidence", 0)

            # ensure_node stores external ids as text, hence str().
            obj_node_id = node_ids.get(("object", str(cls)))
            if obj_node_id is None:
                continue

            # Spatial distance (centre-to-centre)
            od_x = det.get("x1", 0) + (det.get("x2", 0) - det.get("x1", 0)) / 2
            od_y = det.get("y1", 0) + (det.get("y2", 0) - det.get("y1", 0)) / 2
            distance = ((fc_x - od_x) ** 2 + (fc_y - od_y) ** 2) ** 0.5

            edge_props = {
                "frame": frame_num,
                "distance_px": round(distance, 1),
                "object_confidence": confidence,
                "face_bbox": {"x": fx, "y": fy, "width": fw, "height": fh},
                "object_bbox": {
                    "x1": det.get("x1"), "y1": det.get("y1"),
                    "x2": det.get("x2"), "y2": det.get("y2"),
                },
            }

            if use_savepoint:
                cur.execute(f"SAVEPOINT {savepoint}")
            try:
                ensure_edge(
                    cur, schema, file_uuid,
                    "CO_OCCURS_WITH",
                    face_node_id, obj_node_id,
                    edge_props,
                )
            except Exception as exc:
                # Best effort: undo only this edge. A full connection rollback
                # here would silently throw away every node and edge written
                # earlier in the same transaction.
                if use_savepoint:
                    cur.execute(f"ROLLBACK TO SAVEPOINT {savepoint}")
                    cur.execute(f"RELEASE SAVEPOINT {savepoint}")
                print(
                    f"[TKG] Warning: skipped CO_OCCURS_WITH edge "
                    f"(trace {tid}, object {cls}, frame {frame_num}): {exc}"
                )
                continue
            if use_savepoint:
                cur.execute(f"RELEASE SAVEPOINT {savepoint}")
            edge_count += 1

    print(f"[TKG] {edge_count} co-occurrence edges created")
    return edge_count
|
||||
|
||||
|
||||
def build_speaker_face_edges(cur, schema, file_uuid):
    """Build SPEAKS_AS edges between face traces and speakers by temporal overlap.

    A face trace spans its first to last detected frame, converted to seconds
    with a frame rate derived from the asrx segments. An edge
    face_trace -> speaker is written for every speaker segment that overlaps
    the trace for at least 30% of the trace's duration.

    Edges are written through ensure_edge, which upserts on
    (file, edge type, source, target); several qualifying segments of the same
    speaker therefore end up as one edge holding the last segment's properties.

    Args:
        cur: Open database cursor; face_trace and speaker nodes must already exist.
        schema: Database schema name (interpolated into the SQL text).
        file_uuid: File whose traces and segments are processed.

    Returns:
        Number of edge writes (upserts), or 0 when asrx.json is missing or
        has no segments.
    """
    asrx_path = os.path.join(OUTPUT_DIR, f"{file_uuid}.asrx.json")
    if not os.path.exists(asrx_path):
        print("[TKG] asrx.json not found, skipping speaker edges")
        return 0

    with open(asrx_path) as f:
        asrx = json.load(f)

    segments = asrx.get("segments", [])
    if not segments:
        print("[TKG] No speaker segments found")
        return 0

    # Frame span of every face trace
    cur.execute(
        f"""
        SELECT trace_id, MIN(frame_number) as start_f, MAX(frame_number) as end_f
        FROM {schema}.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL
        GROUP BY trace_id
        """,
        (file_uuid,),
    )
    traces = cur.fetchall()

    # Resolve node ids once up front instead of issuing one SELECT per trace
    # and per overlapping segment.
    cur.execute(
        f"SELECT node_type, external_id, id FROM {schema}.tkg_nodes"
        " WHERE file_uuid=%s AND node_type IN ('face_trace', 'speaker')",
        (file_uuid,),
    )
    node_ids = {(ntype, ext_id): nid for ntype, ext_id, nid in cur.fetchall()}

    # Derive frames-per-second from the latest segment that carries a usable
    # end_frame/end_time pair. Falls back to 30 fps rather than failing on a
    # missing key or a zero end_time.
    fps = 30.0
    for seg in reversed(segments):
        end_frame = seg.get("end_frame")
        end_time = seg.get("end_time")
        if end_frame and end_time and end_frame > 0 and end_time > 0:
            fps = end_frame / end_time
            break

    edge_count = 0
    for tid, sf, ef in traces:
        face_node_id = node_ids.get(("face_trace", f"trace_{tid}"))
        if face_node_id is None:
            continue

        face_start_sec = sf / fps
        face_end_sec = ef / fps
        face_dur = face_end_sec - face_start_sec

        for seg in segments:
            speaker_id = seg.get("speaker_id", "")
            seg_start = seg.get("start_time", 0)
            seg_end = seg.get("end_time", 0)

            # Temporal overlap between the trace and this segment
            overlap_start = max(face_start_sec, seg_start)
            overlap_end = min(face_end_sec, seg_end)
            if overlap_start >= overlap_end:
                continue

            overlap_dur = overlap_end - overlap_start
            overlap_ratio = overlap_dur / face_dur if face_dur > 0 else 0
            if overlap_ratio < 0.3:  # minimum 30% overlap
                continue

            # ensure_node stores external ids as text, hence str().
            speaker_node_id = node_ids.get(("speaker", str(speaker_id)))
            if speaker_node_id is None:
                continue

            ensure_edge(
                cur, schema, file_uuid,
                "SPEAKS_AS",
                face_node_id, speaker_node_id,
                {
                    "overlap_ratio": round(overlap_ratio, 3),
                    "overlap_duration_s": round(overlap_dur, 1),
                    "face_time_range": f"{face_start_sec:.1f}-{face_end_sec:.1f}s",
                    "speaker_time_range": f"{seg_start:.1f}-{seg_end:.1f}s",
                },
            )
            edge_count += 1

    print(f"[TKG] {edge_count} speaker-face edges created")
    return edge_count
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: build the graph for one file and commit it.

    Parses ``--file-uuid`` (required) and ``--schema`` (defaults to SCHEMA),
    runs the node builders followed by the edge builders in one transaction,
    commits, and prints a summary. If any builder raises, nothing is committed
    and the connection is still closed.
    """
    parser = argparse.ArgumentParser(description="Build Temporal Knowledge Graph")
    parser.add_argument("--file-uuid", required=True)
    parser.add_argument("--schema", default=SCHEMA)
    args = parser.parse_args()

    # The schema name is interpolated into SQL text by the builders
    # (identifiers cannot be bound as parameters), so only accept a plain
    # unquoted identifier.
    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_$]*", args.schema):
        parser.error(f"invalid schema name: {args.schema!r}")

    conn = get_conn()
    try:
        with conn.cursor() as cur:
            print(f"[TKG] Building graph for {args.file_uuid}...")

            # Nodes first: the edge builders look up node ids.
            n1 = build_face_trace_nodes(cur, args.schema, args.file_uuid)
            n2 = build_yolo_object_nodes(cur, args.schema, args.file_uuid)
            n3 = build_speaker_nodes(cur, args.schema, args.file_uuid)

            e1 = build_co_occurrence_edges(cur, args.schema, args.file_uuid)
            e2 = build_speaker_face_edges(cur, args.schema, args.file_uuid)

        conn.commit()
    finally:
        # Closing without a commit discards any uncommitted work.
        conn.close()

    print(f"\n[TKG] Complete: {n1+n2+n3} nodes, {e1+e2} edges")
    print(f" Face traces: {n1}")
    print(f" Objects: {n2}")
    print(f" Speakers: {n3}")
    print(f" Co-occur: {e1}")
    print(f" Speaker-face:{e2}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user