# momentry_core/experiments/trace_quality_agent.py

#!/usr/bin/env python3
"""
Trace 品質檢查 Agent — 選型實驗報告
評估每個 trace 是否符合 identity 標準，檢測需補掃/覆查的異常 trace。

檢查項目:
  1. 取樣密度      — trace < 3 frames → 需要 dense scan
  2. 人臉驗證      — DeepFace vs Apple Vision 確認是否為人臉
  3. Embedding 品質 — trace 內方差過大 → 可能混入多人
  4. 時序衝突      — 同 identity 兩 trace 同時出現 → 需 split
"""

import json, sys, os, time, argparse, io
from collections import defaultdict
from pathlib import Path

# Experiment configuration: one hard-coded database, schema and test video.
DB_URL = "postgresql://accusys@localhost:5432/momentry"
SCHEMA = "dev"
FILE_UUID = "417a7e93860d70c87aee6c4c1b715d70"
VIDEO_PATH = "/Users/accusys/test_video/Old_Time_Movie_Show_-_Charade_1963.HD.mov"
OUT_DIR = Path("/Users/accusys/momentry/output_dev/experiments/trace_quality")
# Create the output directory (and any missing parents) up front.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Report header
# ------------------------------------------------------------
# NOTE: the trace/face totals in the banner are literal text, not values read
# from the database.
print(
    "\n".join(
        (
            "=" * 70,
            "Trace 品質檢查 — 技術選型實驗報告",
            "=" * 70,
            f"File: Charade (1963), {FILE_UUID}",
            "Traces: 2347, Faces: 6182",
            "",
        )
    )
)

import psycopg2
import psycopg2.extras
import numpy as np

# A single connection/cursor pair is opened here and reused by the queries
# that follow in this script.
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()

# ============================================================
# Check 1: Sample density
# ============================================================
print("=" * 70)
print("Check 1: 取樣密度 (Sample Density)")
print("=" * 70)

# Bucket traces by their number of face detections and report, per bucket,
# the trace count and its share of all traces of this file.
cur.execute(f"""
    SELECT
        CASE WHEN fc = 1 THEN '1 frame'
             WHEN fc <= 3 THEN '2-3 frames'
             WHEN fc <= 10 THEN '4-10 frames'
             ELSE '11+ frames'
        END AS density,
        COUNT(*) AS trace_count,
        ROUND(COUNT(*)::numeric / (SELECT COUNT(*) FROM (SELECT trace_id, COUNT(*) FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL GROUP BY trace_id) t) * 100, 1) AS pct
    FROM (SELECT trace_id, COUNT(*) AS fc FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL GROUP BY trace_id) t
    GROUP BY 1 ORDER BY MIN(fc)
""", (FILE_UUID, FILE_UUID))
density_rows = cur.fetchall()

# Buckets covering traces with fewer than 4 detections. Matching on the full
# label (rather than on the label's first character) keeps '11+ frames' from
# being mistaken for a sparse bucket.
sparse_buckets = {"1 frame", "2-3 frames"}

for density, count, pct in density_rows:
    marker = " ← needs dense scan" if density in sparse_buckets else ""
    print(f"  {density:<15} {count:>6} traces ({pct:>5.1f}%){marker}")

# Derive the totals from the rows just fetched so the percentage always
# reflects the data actually in the database; guard against a file without
# any traces.
total_traces = sum(count for _, count, _ in density_rows)
need_dense = sum(
    count for density, count, _ in density_rows if density in sparse_buckets
)
need_dense_pct = need_dense / total_traces * 100 if total_traces else 0.0
print(f"\n  需 dense scan: {need_dense} traces ({need_dense_pct:.1f}%)")

print()
print("  技術方案:")
print("    方案A: swift_face --sample-interval 1 (Apple Vision, ~250fps)")
print("    方案B: ffmpeg + DeepFace (Python, ~0.2s/face)")
print("  建議: 方案A，無需額外模型，速度快，已整合於 pipeline")

# ============================================================
# Check 2: Human face verification
# ============================================================
import subprocess

print()
print("=" * 70)
print("Check 2: 人臉驗證 (Human Face Verification)")
print("=" * 70)

# Sample 10 traces: the 5 with the lowest average confidence (possibly
# non-human) and the 5 with the highest (likely human).
cur.execute(f"""
    (SELECT trace_id, AVG(confidence)::numeric(4,3) AS c, AVG(width)::int AS w, AVG(height)::int AS h,
            MIN(frame_number) AS f
     FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL
     GROUP BY trace_id ORDER BY AVG(confidence) ASC LIMIT 5)
    UNION ALL
    (SELECT trace_id, AVG(confidence)::numeric(4,3) AS c, AVG(width)::int AS w, AVG(height)::int AS h,
            MIN(frame_number) AS f
     FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL
     GROUP BY trace_id ORDER BY AVG(confidence) DESC LIMIT 5)
""", (FILE_UUID, FILE_UUID))

samples = cur.fetchall()

# Frame rate used to convert a frame number into a seek position.
# NOTE(review): hard-coded value — confirm it matches VIDEO_PATH before
# pointing this script at another file.
video_fps = 59.94

# DeepFace is an optional dependency; the report degrades gracefully without it.
print(f"  DeepFace 人臉驗證 ({len(samples)} samples):")
try:
    from deepface import DeepFace
except ImportError:
    DeepFace = None

if DeepFace is None:
    print("    DeepFace not available")
else:
    import warnings
    warnings.filterwarnings("ignore")

    t0 = time.time()
    for tid, conf, w, h, frame in samples:
        img_path = OUT_DIR / f"trace_{tid}_verify.jpg"
        if not img_path.exists():
            # Extract the trace's first frame. An argument list (no shell)
            # keeps paths with spaces or shell metacharacters intact, and
            # millisecond precision keeps the seek within one frame.
            sec = frame / video_fps
            try:
                subprocess.run(
                    [
                        "ffmpeg", "-y", "-ss", f"{sec:.3f}", "-i", VIDEO_PATH,
                        "-frames:v", "1", "-q:v", "3", str(img_path),
                    ],
                    stderr=subprocess.DEVNULL,
                    check=False,
                )
            except OSError:
                # ffmpeg missing or not executable; reported just below.
                pass
        if not img_path.exists():
            print(f"    trace {tid:>5}: conf={conf:.3f} {w}x{h} → ERROR frame extraction failed")
            continue
        # NOTE(review): the whole frame is analysed, not a crop of the trace's
        # bounding box, and enforce_detection=False presumably makes DeepFace
        # return age/gender even when it finds no face — confirm that the
        # is_human rule below is actually discriminative.
        try:
            r = DeepFace.analyze(str(img_path), actions=['age','gender'], enforce_detection=False, detector_backend='opencv')
            if isinstance(r, list):
                r = r[0]
            age = r.get('age', 0)
            gender = r.get('dominant_gender', 'N/A')
            is_human = age > 0 and gender in ('Man', 'Woman')
            print(f"    trace {tid:>5}: conf={conf:.3f} {w}x{h} → age={age:.0f} gender={gender:<5} {'✅ human' if is_human else '⚠️ non-human?'}")
        except Exception as e:
            # Best-effort per sample: report the failure and keep going.
            print(f"    trace {tid:>5}: conf={conf:.3f} {w}x{h} → ERROR {str(e)[:60]}")
    dt = time.time() - t0
    # Average over the number of samples actually fetched, not a fixed 10.
    per_face = dt / len(samples) if samples else 0.0
    print(f"    Time: {dt:.1f}s ({per_face:.1f}s/face)")

# Statistical alternative (no ML): count detections below a confidence cut-off.
print()
print("  Statistical filter (no ML):")
print("    Rule: confidence < 0.5 OR aspect_ratio deviation > 0.3 → flag")
cur.execute(f"""
    SELECT COUNT(*) FROM {SCHEMA}.face_detections
    WHERE file_uuid=%s AND trace_id IS NOT NULL AND confidence < 0.5
""", (FILE_UUID,))
low_conf = cur.fetchone()[0]
print(f"    Low confidence (<0.5): {low_conf} faces")
print("    Aspect ratio: all detections are square (Vision bbox), no filtering possible")

print()
print("  建議: DeepFace verify for low-confidence traces only")
print("        可選 gateway: conf < 0.6 才跑 DeepFace，節省 90% 成本")

# ============================================================
# Check 3: Embedding quality
# ============================================================
print()
print("=" * 70)
print("Check 3: Embedding Quality (嵌入品質)")
print("=" * 70)

# Pick the 10 traces with the most detections.
cur.execute(f"""
    SELECT trace_id, COUNT(*) AS fc, AVG(confidence)::numeric(4,3) AS conf
    FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL
    GROUP BY trace_id ORDER BY fc DESC LIMIT 10
""", (FILE_UUID,))
top_traces = cur.fetchall()


def _to_vector(raw):
    """Convert one stored embedding value into a 1-D float array.

    NOTE(review): the column type is not visible from this script. Sequences
    of numbers are converted directly; text values such as "[0.1, 0.2]" are
    parsed as JSON first.
    """
    if isinstance(raw, str):
        raw = json.loads(raw)
    return np.asarray(raw, dtype=float)


print("  Intra-trace embedding variance (top 10 traces by size):")
for tid, fc, conf in top_traces:
    cur.execute(f"""
        SELECT embedding FROM {SCHEMA}.face_detections
        WHERE file_uuid=%s AND trace_id=%s AND embedding IS NOT NULL
    """, (FILE_UUID, tid))
    # Explicit None check: truth-testing an array-like value is ambiguous.
    embs = [_to_vector(row[0]) for row in cur.fetchall() if row[0] is not None]
    embs = [e for e in embs if e.size]  # drop empty vectors
    if len(embs) < 2:
        print(f"    trace {tid:>5}: {fc:>3} faces, conf={conf:.3f} — not enough embeddings")
        continue

    # L2-normalise so the dot product is the cosine similarity; the epsilon
    # avoids dividing by zero for an all-zero vector.
    unit = np.stack([e / (np.linalg.norm(e) + 1e-10) for e in embs])
    sim_matrix = unit @ unit.T
    # Take each unordered pair exactly once (strict upper triangle). Selecting
    # by index rather than by value keeps zero and negative similarities —
    # the most dissimilar pairs, which are the ones that reveal a mixed trace.
    pair_sims = sim_matrix[np.triu_indices(len(embs), k=1)]
    # Reported as "variance": 1 minus the mean pairwise cosine similarity.
    var = float(1.0 - pair_sims.mean())
    min_sim = float(pair_sims.min())

    if var < 0.3 and min_sim > 0.5:
        quality = "✅ good"
    elif var < 0.5 and min_sim > 0.3:
        quality = "⚠️ check"
    else:
        quality = "❌ split likely"
    print(f"    trace {tid:>5}: {fc:>3} faces, conf={conf:.3f}, variance={var:.3f}, min_sim={min_sim:.3f} → {quality}")

print()
print("  建議: variance > 0.2 OR min_sim < 0.4 → 標記 split")
print("        純統計方法，無需模型")

# ============================================================
# Check 4: Temporal collision
# ============================================================
print()
print("=" * 70)
print("Check 4: 時序衝突 (Temporal Collision)")
print("=" * 70)

# Find pairs of distinct traces that carry the same identity and have a
# detection in the same frame. Results are aggregated per trace pair so that
# one long overlap cannot use up the whole LIMIT and hide other collisions;
# each pair is reported with its first shared frame and the number of shared
# frames.
cur.execute(f"""
    SELECT i.name, a.trace_id AS a_trace, b.trace_id AS b_trace,
           MIN(a.frame_number) AS first_frame,
           COUNT(DISTINCT a.frame_number) AS shared_frames
    FROM {SCHEMA}.face_detections a
    JOIN {SCHEMA}.face_detections b ON a.file_uuid=b.file_uuid AND a.frame_number=b.frame_number AND a.trace_id<b.trace_id
    JOIN {SCHEMA}.identities i ON a.identity_id=i.id AND b.identity_id=i.id
    WHERE a.file_uuid=%s AND a.identity_id IS NOT NULL
    GROUP BY i.id, i.name, a.trace_id, b.trace_id
    ORDER BY first_frame LIMIT 10
""", (FILE_UUID,))
collisions = cur.fetchall()

if collisions:
    print("  ⚠️ 同一 identity 的 trace 出現在同一幀:")
    for name, a_tid, b_tid, first_frame, shared_frames in collisions:
        print(f"    {name}: trace {a_tid} & {b_tid} from frame {first_frame} ({shared_frames} shared frames)")
else:
    print("  ✅ No temporal collisions detected")

print()
print("  建議: 純 SQL 檢測，發現碰撞 → 自動 split into separate identities")

# Last database access of the script: release the cursor and the connection.
cur.close()
conn.close()

# ============================================================
# Summary
# ============================================================
# One layout shared by the table header and every data row:
# (check, technique, model, speed, feasibility).
summary_fmt = "  {:<25} {:<20} {:<12} {:<10} {}"
summary_rows = (
    ("1.取樣密度", "SQL + swift_face", "Apple Vision", "250fps", "✅ 已整合"),
    ("2.人臉驗證", "DeepFace analyze", "AgeNet", "0.2s/face", "✅ MIT license"),
    ("3.Embedding 品質", "numpy statistics", "None", "instant", "✅ 純計算"),
    ("4.時序衝突", "SQL JOIN", "None", "instant", "✅ 純查詢"),
    ("5.Speaker 一致性", "SQL + overlap", "None", "instant", "✅ 後續追加"),
)

print()
print("=" * 70)
print("選型建議總結")
print("=" * 70)
print()
print(summary_fmt.format("檢查", "技術", "模型", "速度", "可行性"))
print("  " + "-" * 70)
for summary_row in summary_rows:
    print(summary_fmt.format(*summary_row))
print()
print("  唯一需要外部模型的: Check 2 (DeepFace, MIT, 0.2s/face)")
print("  其他全為純 SQL/統計，可立即實作")