feat: trace quality agent selection report, identity clustering runner_v2 DB write, age/gender CoreML selection, updated experiment config UUID

2026-05-06 14:41:48 +08:00
parent 74b6182eba
commit 65a1f77e65
1048 changed files with 103499 additions and 0 deletions
--- a/experiments/trace_quality_agent.py
+++ b/experiments/trace_quality_agent.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+"""
+Trace 品質檢查 Agent — 選型實驗報告
+評估每個 trace 是否符合 identity 標準，檢測需補掃/覆查的異常 trace。
+
+檢查項目:
+  1. 取樣密度      — trace < 3 frames → 需要 dense scan
+  2. 人臉驗證      — DeepFace vs Apple Vision 確認是否為人臉
+  3. Embedding 品質 — trace 內方差過大 → 可能混入多人
+  4. 時序衝突      — 同 identity 兩 trace 同時出現 → 需 split
+"""
+
+import json, sys, os, time, argparse, io
+from collections import defaultdict
+from pathlib import Path
+
+DB_URL = "postgresql://accusys@localhost:5432/momentry"
+SCHEMA = "dev"
+FILE_UUID = "417a7e93860d70c87aee6c4c1b715d70"
+VIDEO_PATH = "/Users/accusys/test_video/Old_Time_Movie_Show_-_Charade_1963.HD.mov"
+OUT_DIR = Path("/Users/accusys/momentry/output_dev/experiments/trace_quality")
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+# ============================================================
+# Report Header
+# ============================================================
+print("=" * 70)
+print("Trace 品質檢查 — 技術選型實驗報告")
+print("=" * 70)
+print(f"File: Charade (1963), {FILE_UUID}")
+print(f"Traces: 2347, Faces: 6182")
+print()
+
+import psycopg2
+import psycopg2.extras
+import numpy as np
+
+conn = psycopg2.connect(DB_URL)
+cur = conn.cursor()
+
+# ============================================================
+# Check 1: Sample Density (取樣密度)
+# ============================================================
+print("=" * 70)
+print("Check 1: 取樣密度 (Sample Density)")
+print("=" * 70)
+
+cur.execute(f"""
+    SELECT 
+        CASE WHEN fc = 1 THEN '1 frame'
+             WHEN fc <= 3 THEN '2-3 frames'
+             WHEN fc <= 10 THEN '4-10 frames'
+             ELSE '11+ frames'
+        END AS density,
+        COUNT(*) AS trace_count,
+        ROUND(COUNT(*)::numeric / (SELECT COUNT(*) FROM (SELECT trace_id, COUNT(*) FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL GROUP BY trace_id) t) * 100, 1) AS pct
+    FROM (SELECT trace_id, COUNT(*) AS fc FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL GROUP BY trace_id) t
+    GROUP BY 1 ORDER BY MIN(fc)
+""", (FILE_UUID, FILE_UUID))
+
+for density, count, pct in cur.fetchall():
+    marker = " ← needs dense scan" if "frame" in density and int(density[0]) < 4 else ""
+    print(f"  {density:<15} {count:>6} traces ({pct:>5.1f}%){marker}")
+
+need_dense = sum(1 for _ in cur.fetchall()) if False else 0
+cur.execute(f"SELECT COUNT(*) FROM (SELECT trace_id FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL GROUP BY trace_id HAVING COUNT(*) < 4) t", (FILE_UUID,))
+need_dense = cur.fetchone()[0]
+print(f"\n  需 dense scan: {need_dense} traces ({need_dense/2347*100:.1f}%)")
+
+print()
+print("  技術方案:")
+print("    方案A: swift_face --sample-interval 1 (Apple Vision, ~250fps)")
+print("    方案B: ffmpeg + DeepFace (Python, ~0.2s/face)")
+print("  建議: 方案A，無需額外模型，速度快，已整合於 pipeline")
+
+# ============================================================
+# Check 2: Human Face Verification (人臉驗證)
+# ============================================================
+print()
+print("=" * 70)
+print("Check 2: 人臉驗證 (Human Face Verification)")
+print("=" * 70)
+
+# Sample 20 traces: 10 with high confidence (likely human), 10 with low (possibly non-human)
+cur.execute(f"""
+    (SELECT trace_id, AVG(confidence)::numeric(4,3) AS c, AVG(width)::int AS w, AVG(height)::int AS h,
+            MIN(frame_number) AS f
+     FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL
+     GROUP BY trace_id ORDER BY AVG(confidence) ASC LIMIT 5)
+    UNION ALL
+    (SELECT trace_id, AVG(confidence)::numeric(4,3) AS c, AVG(width)::int AS w, AVG(height)::int AS h,
+            MIN(frame_number) AS f
+     FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL
+     GROUP BY trace_id ORDER BY AVG(confidence) DESC LIMIT 5)
+""", (FILE_UUID, FILE_UUID))
+
+samples = cur.fetchall()
+
+# Test DeepFace
+print("  DeepFace 人臉驗證 (10 samples):")
+try:
+    from deepface import DeepFace
+    import warnings
+    warnings.filterwarnings("ignore")
+
+    t0 = time.time()
+    for tid, conf, w, h, frame in samples:
+        sec = frame / 59.94
+        img_path = OUT_DIR / f"trace_{tid}_verify.jpg"
+        if not img_path.exists():
+            os.system(f'ffmpeg -y -ss {sec:.1f} -i "{VIDEO_PATH}" -frames:v 1 -q:v 3 {img_path} 2>/dev/null')
+        try:
+            r = DeepFace.analyze(str(img_path), actions=['age','gender'], enforce_detection=False, detector_backend='opencv')
+            if isinstance(r, list): r = r[0]
+            age = r.get('age', 0)
+            gender = r.get('dominant_gender', 'N/A')
+            is_human = age > 0 and gender in ('Man', 'Woman')
+            print(f"    trace {tid:>5}: conf={conf:.3f} {w}x{h} → age={age:.0f} gender={gender:<5} {'✅ human' if is_human else '⚠️ non-human?'}")
+        except Exception as e:
+            print(f"    trace {tid:>5}: conf={conf:.3f} {w}x{h} → ERROR {str(e)[:60]}")
+    dt = time.time() - t0
+    print(f"    Time: {dt:.1f}s ({dt/10:.1f}s/face)")
+except ImportError:
+    print("    DeepFace not available")
+
+# Test Apple Vision approach (statistical, no ML)
+print()
+print("  Statistical filter (no ML):")
+print("    Rule: confidence < 0.5 OR aspect_ratio deviation > 0.3 → flag")
+cur.execute(f"""
+    SELECT COUNT(*) FROM {SCHEMA}.face_detections 
+    WHERE file_uuid=%s AND trace_id IS NOT NULL AND confidence < 0.5
+""", (FILE_UUID,))
+low_conf = cur.fetchone()[0]
+print(f"    Low confidence (<0.5): {low_conf} faces")
+print(f"    Aspect ratio: all detections are square (Vision bbox), no filtering possible")
+
+print()
+print("  建議: DeepFace verify for low-confidence traces only")
+print("        可選 gateway: conf < 0.6 才跑 DeepFace，節省 90% 成本")
+
+# ============================================================
+# Check 3: Embedding Quality
+# ============================================================
+print()
+print("=" * 70)
+print("Check 3: Embedding Quality (嵌入品質)")
+print("=" * 70)
+
+# Check intra-trace embedding variance for top 5 largest traces
+cur.execute(f"""
+    SELECT trace_id, COUNT(*) AS fc, AVG(confidence)::numeric(4,3) AS conf
+    FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL
+    GROUP BY trace_id ORDER BY fc DESC LIMIT 10
+""", (FILE_UUID,))
+top_traces = cur.fetchall()
+
+print("  Intra-trace embedding variance (top 10 traces by size):")
+for tid, fc, conf in top_traces:
+    cur.execute(f"""
+        SELECT embedding FROM {SCHEMA}.face_detections
+        WHERE file_uuid=%s AND trace_id=%s AND embedding IS NOT NULL
+    """, (FILE_UUID, tid))
+    embs = [np.array(row[0]) for row in cur.fetchall() if row[0]]
+    if len(embs) < 2:
+        print(f"    trace {tid:>5}: {fc:>3} faces, conf={conf:.3f} — not enough embeddings")
+        continue
+    
+    # Normalize and compute pairwise cosine similarity
+    embs_norm = np.array([e / (np.linalg.norm(e) + 1e-10) for e in embs])
+    sim_matrix = embs_norm @ embs_norm.T
+    np.fill_diagonal(sim_matrix, 0)
+    # Exclude diagonal zeros when finding min
+    non_diag = sim_matrix[sim_matrix > 0.0001]
+    var = float(1.0 - np.mean(sim_matrix[sim_matrix > 0.0001])) if len(non_diag) > 0 else 0.0
+    min_sim = float(np.min(non_diag)) if len(non_diag) > 0 else 0.0
+
+    quality = "✅ good" if var < 0.3 and min_sim > 0.5 else \
+              "⚠️ check" if var < 0.5 and min_sim > 0.3 else \
+              "❌ split likely"
+    print(f"    trace {tid:>5}: {fc:>3} faces, conf={conf:.3f}, variance={var:.3f}, min_sim={min_sim:.3f} → {quality}")
+
+print()
+print("  建議: variance > 0.2 OR min_sim < 0.4 → 標記 split")
+print("        純統計方法，無需模型")
+
+# ============================================================
+# Check 4: Temporal Collision
+# ============================================================
+print()
+print("=" * 70)
+print("Check 4: 時序衝突 (Temporal Collision)")
+print("=" * 70)
+
+cur.execute(f"""
+    SELECT i.name, a.trace_id, a.frame_number AS a_frame, b.trace_id AS b_trace, b.frame_number AS b_frame
+    FROM {SCHEMA}.face_detections a
+    JOIN {SCHEMA}.face_detections b ON a.file_uuid=b.file_uuid AND a.frame_number=b.frame_number AND a.trace_id<b.trace_id
+    JOIN {SCHEMA}.identities i ON a.identity_id=i.id AND b.identity_id=i.id
+    WHERE a.file_uuid=%s AND a.identity_id IS NOT NULL
+    ORDER BY a.frame_number LIMIT 10
+""", (FILE_UUID,))
+collisions = cur.fetchall()
+
+if collisions:
+    print("  ⚠️ 同一 identity 的 trace 出現在同一幀:")
+    for name, a_tid, af, b_tid, bf in collisions:
+        print(f"    {name}: trace {a_tid} & {b_tid} at frame {af}")
+else:
+    print("  ✅ No temporal collisions detected")
+
+print()
+print("  建議: 純 SQL 檢測，發現碰撞 → 自動 split into separate identities")
+
+cur.close(); conn.close()
+
+# ============================================================
+# Summary
+# ============================================================
+print()
+print("=" * 70)
+print("選型建議總結")
+print("=" * 70)
+print()
+print(f"  {'檢查':<25} {'技術':<20} {'模型':<12} {'速度':<10} {'可行性'}")
+print(f"  {'-'*70}")
+print(f"  {'1.取樣密度':<25} {'SQL + swift_face':<20} {'Apple Vision':<12} {'250fps':<10} {'✅ 已整合'}")
+print(f"  {'2.人臉驗證':<25} {'DeepFace analyze':<20} {'AgeNet':<12} {'0.2s/face':<10} {'✅ MIT license'}")
+print(f"  {'3.Embedding 品質':<25} {'numpy statistics':<20} {'None':<12} {'instant':<10} {'✅ 純計算'}")
+print(f"  {'4.時序衝突':<25} {'SQL JOIN':<20} {'None':<12} {'instant':<10} {'✅ 純查詢'}")
+print(f"  {'5.Speaker 一致性':<25} {'SQL + overlap':<20} {'None':<12} {'instant':<10} {'✅ 後續追加'}")
+print()
+print(f"  唯一需要外部模型的: Check 2 (DeepFace, MIT, 0.2s/face)")
+print(f"  其他全為純 SQL/統計，可立即實作")