feat: trace quality agent selection report, identity clustering runner_v2 DB write, age/gender CoreML selection, updated experiment config UUID

This commit is contained in:
Warren
2026-05-06 14:41:48 +08:00
parent 74b6182eba
commit 65a1f77e65
1048 changed files with 103499 additions and 0 deletions

View File

@@ -0,0 +1,234 @@
#!/usr/bin/env python3
"""
Trace quality-check agent — technology selection experiment report.

Evaluates whether each trace meets the identity standard and detects
anomalous traces that need a rescan or a review.

Checks:
1. Sample density — traces with very few detections (the script flags
   fewer than 4) need a dense scan
2. Face verification — DeepFace vs. Apple Vision, to confirm the detection
   is a human face
3. Embedding quality — too much spread inside a trace suggests that several
   people were mixed into it
4. Temporal collision — two traces of the same identity appearing in the
   same frame need a split
"""
import json, sys, os, time, argparse, io
from collections import defaultdict
from pathlib import Path
# ------------------------------------------------------------
# Experiment configuration
# ------------------------------------------------------------
DB_URL = "postgresql://accusys@localhost:5432/momentry"
SCHEMA = "dev"
FILE_UUID = "417a7e93860d70c87aee6c4c1b715d70"
VIDEO_PATH = "/Users/accusys/test_video/Old_Time_Movie_Show_-_Charade_1963.HD.mov"
OUT_DIR = Path("/Users/accusys/momentry/output_dev/experiments/trace_quality")
# Extracted verification frames are written here; create it up front.
OUT_DIR.mkdir(parents=True, exist_ok=True)

import psycopg2
import psycopg2.extras
import numpy as np

# One connection/cursor is shared by every check below.
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()

# The header counts are read from the database instead of being hard-coded,
# so the report stays correct when FILE_UUID or the underlying data changes.
# COUNT(DISTINCT trace_id) ignores detections that have no trace assigned.
# NOTE(review): "Faces" counts every detection row of the file, traced or
# not — confirm this is the intended definition.
cur.execute(
    f"SELECT COUNT(DISTINCT trace_id), COUNT(*) "
    f"FROM {SCHEMA}.face_detections WHERE file_uuid=%s",
    (FILE_UUID,),
)
total_traces, total_faces = cur.fetchone()

# ============================================================
# Report Header
# ============================================================
print("=" * 70)
print("Trace 品質檢查 — 技術選型實驗報告")
print("=" * 70)
print(f"File: Charade (1963), {FILE_UUID}")
print(f"Traces: {total_traces}, Faces: {total_faces}")
print()
# ============================================================
# Check 1: Sample Density
# ============================================================
print("=" * 70)
print("Check 1: 取樣密度 (Sample Density)")
print("=" * 70)

# A trace with fewer detections than this is flagged for a dense rescan.
# The value lines up with the '2-3 frames' / '4-10 frames' bucket boundary.
DENSE_SCAN_MIN_FRAMES = 4

# Detection count per trace; shared by both queries below (one %s parameter).
per_trace_sql = (
    f"SELECT trace_id, COUNT(*) AS fc FROM {SCHEMA}.face_detections "
    "WHERE file_uuid=%s AND trace_id IS NOT NULL GROUP BY trace_id"
)

# Histogram of traces by detection count. MAX(fc) is returned so the
# "needs dense scan" decision is made on real counts rather than by parsing
# the bucket label (parsing only the first character turned '11+ frames'
# into 1 and flagged the densest bucket by mistake).
cur.execute(f"""
    SELECT
        CASE WHEN fc = 1 THEN '1 frame'
             WHEN fc <= 3 THEN '2-3 frames'
             WHEN fc <= 10 THEN '4-10 frames'
             ELSE '11+ frames'
        END AS density,
        COUNT(*) AS trace_count,
        MAX(fc) AS max_fc
    FROM ({per_trace_sql}) t
    GROUP BY 1 ORDER BY MIN(fc)
""", (FILE_UUID,))
density_rows = cur.fetchall()

# Every trace falls in exactly one bucket, so the bucket counts sum to the
# total number of traces for this file.
total_traces = sum(count for _, count, _ in density_rows)

for density, count, max_fc in density_rows:
    pct = count / total_traces * 100
    marker = " ← needs dense scan" if max_fc < DENSE_SCAN_MIN_FRAMES else ""
    print(f" {density:<15} {count:>6} traces ({pct:>5.1f}%){marker}")

# Exact number of traces below the threshold, independent of the bucketing.
cur.execute(
    f"SELECT COUNT(*) FROM ({per_trace_sql} HAVING COUNT(*) < %s) t",
    (FILE_UUID, DENSE_SCAN_MIN_FRAMES),
)
need_dense = cur.fetchone()[0]
# Guard the percentage for a file that has no traces at all.
need_dense_pct = need_dense / total_traces * 100 if total_traces else 0.0
print(f"\n 需 dense scan: {need_dense} traces ({need_dense_pct:.1f}%)")
print()
print(" 技術方案:")
print(" 方案A: swift_face --sample-interval 1 (Apple Vision, ~250fps)")
print(" 方案B: ffmpeg + DeepFace (Python, ~0.2s/face)")
print(" 建議: 方案A無需額外模型速度快已整合於 pipeline")
# ============================================================
# Check 2: Human Face Verification
# ============================================================
import subprocess

print()
print("=" * 70)
print("Check 2: 人臉驗證 (Human Face Verification)")
print("=" * 70)

# Frame numbers are converted to a seek time with this frame rate.
# NOTE(review): 59.94 fps is assumed for VIDEO_PATH — confirm against the file.
VIDEO_FPS = 59.94

# Sample up to 10 traces: the 5 with the lowest average confidence (possibly
# non-human) and the 5 with the highest (likely human).
cur.execute(f"""
    (SELECT trace_id, AVG(confidence)::numeric(4,3) AS c, AVG(width)::int AS w, AVG(height)::int AS h,
            MIN(frame_number) AS f
     FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL
     GROUP BY trace_id ORDER BY AVG(confidence) ASC LIMIT 5)
    UNION ALL
    (SELECT trace_id, AVG(confidence)::numeric(4,3) AS c, AVG(width)::int AS w, AVG(height)::int AS h,
            MIN(frame_number) AS f
     FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL
     GROUP BY trace_id ORDER BY AVG(confidence) DESC LIMIT 5)
""", (FILE_UUID, FILE_UUID))
samples = cur.fetchall()

# Test DeepFace
print(f" DeepFace 人臉驗證 ({len(samples)} samples):")
# Only the import is guarded: DeepFace is an optional dependency, and a wide
# try-block would hide unrelated ImportErrors raised further down.
try:
    from deepface import DeepFace
except ImportError:
    DeepFace = None

if DeepFace is None:
    print(" DeepFace not available")
else:
    import warnings
    warnings.filterwarnings("ignore")
    t0 = time.time()
    for tid, conf, w, h, frame in samples:
        sec = frame / VIDEO_FPS
        img_path = OUT_DIR / f"trace_{tid}_verify.jpg"
        # Best effort per sample: a failure on one trace is reported and the
        # loop moves on to the next one.
        try:
            if not img_path.exists():
                # Argument list instead of a shell string, so paths containing
                # spaces or shell metacharacters are passed through verbatim.
                # ffmpeg's log output (stderr) is discarded; a failed
                # extraction surfaces below when DeepFace cannot read the image.
                subprocess.run(
                    ["ffmpeg", "-y", "-ss", f"{sec:.1f}", "-i", VIDEO_PATH,
                     "-frames:v", "1", "-q:v", "3", str(img_path)],
                    stderr=subprocess.DEVNULL,
                    check=False,
                )
            # NOTE(review): the whole frame is analysed, not the trace's
            # bounding box, so another face in the same frame can be picked up.
            r = DeepFace.analyze(str(img_path), actions=['age', 'gender'],
                                 enforce_detection=False, detector_backend='opencv')
            if isinstance(r, list):
                r = r[0]
            age = r.get('age', 0)
            gender = r.get('dominant_gender', 'N/A')
            # With enforce_detection=False DeepFace still returns age/gender
            # when no face was found; it reports that case through
            # face_confidence == 0. Versions without the key keep the old rule.
            face_conf = r.get('face_confidence')
            face_found = face_conf is None or face_conf > 0
            is_human = face_found and age > 0 and gender in ('Man', 'Woman')
            print(f" trace {tid:>5}: conf={conf:.3f} {w}x{h} → age={age:.0f} gender={gender:<5} {'✅ human' if is_human else '⚠️ non-human?'}")
        except Exception as e:
            print(f" trace {tid:>5}: conf={conf:.3f} {w}x{h} → ERROR {str(e)[:60]}")
    dt = time.time() - t0
    # Average over the samples actually processed (the query can return < 10).
    per_face = dt / len(samples) if samples else 0.0
    print(f" Time: {dt:.1f}s ({per_face:.1f}s/face)")

# Statistical alternative on the stored detections (no ML model involved).
print()
print(" Statistical filter (no ML):")
print(" Rule: confidence < 0.5 OR aspect_ratio deviation > 0.3 → flag")
cur.execute(f"""
    SELECT COUNT(*) FROM {SCHEMA}.face_detections
    WHERE file_uuid=%s AND trace_id IS NOT NULL AND confidence < 0.5
""", (FILE_UUID,))
low_conf = cur.fetchone()[0]
print(f" Low confidence (<0.5): {low_conf} faces")
print(" Aspect ratio: all detections are square (Vision bbox), no filtering possible")
print()
print(" 建議: DeepFace verify for low-confidence traces only")
print(" 可選 gateway: conf < 0.6 才跑 DeepFace節省 90% 成本")
# ============================================================
# Check 3: Embedding Quality
# ============================================================
def _pairwise_cosine_stats(vectors):
    """Return ``(spread, min_sim)`` for a list of equal-length embeddings.

    ``spread`` is ``1 - mean`` of the pairwise cosine similarities and
    ``min_sim`` is the lowest pairwise similarity. Every distinct pair is
    used, including pairs whose similarity is zero or negative, because those
    are the strongest sign that a trace mixes several people.

    Requires at least two vectors.
    """
    mat = np.vstack(vectors)
    # The small epsilon keeps an all-zero embedding from dividing by zero.
    unit = mat / (np.linalg.norm(mat, axis=1, keepdims=True) + 1e-10)
    sims = unit @ unit.T
    # Strict upper triangle = each unordered pair once, diagonal excluded.
    pair_sims = sims[np.triu_indices(len(vectors), k=1)]
    return float(1.0 - pair_sims.mean()), float(pair_sims.min())


print()
print("=" * 70)
print("Check 3: Embedding Quality (嵌入品質)")
print("=" * 70)

# Check intra-trace embedding spread for the 10 largest traces.
cur.execute(f"""
    SELECT trace_id, COUNT(*) AS fc, AVG(confidence)::numeric(4,3) AS conf
    FROM {SCHEMA}.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL
    GROUP BY trace_id ORDER BY fc DESC LIMIT 10
""", (FILE_UUID,))
top_traces = cur.fetchall()

print(" Intra-trace embedding variance (top 10 traces by size):")
for tid, fc, conf in top_traces:
    cur.execute(f"""
        SELECT embedding FROM {SCHEMA}.face_detections
        WHERE file_uuid=%s AND trace_id=%s AND embedding IS NOT NULL
    """, (FILE_UUID, tid))
    embs = []
    for (raw,) in cur.fetchall():
        if raw is None:
            continue
        # NOTE(review): the column type is not visible here. A plain array
        # arrives as a list; a vector column without a registered adapter
        # would presumably arrive as '[..]' text — confirm against the schema.
        if isinstance(raw, str):
            raw = json.loads(raw)
        vec = np.asarray(raw, dtype=float)
        if vec.size:
            embs.append(vec)
    if len(embs) < 2:
        print(f" trace {tid:>5}: {fc:>3} faces, conf={conf:.3f} — not enough embeddings")
        continue

    var, min_sim = _pairwise_cosine_stats(embs)

    # NOTE(review): these label thresholds differ from the ones in the
    # recommendation printed below (0.2 / 0.4) — confirm which set is intended.
    if var < 0.3 and min_sim > 0.5:
        quality = "✅ good"
    elif var < 0.5 and min_sim > 0.3:
        quality = "⚠️ check"
    else:
        quality = "❌ split likely"
    print(f" trace {tid:>5}: {fc:>3} faces, conf={conf:.3f}, variance={var:.3f}, min_sim={min_sim:.3f}{quality}")
print()
print(" 建議: variance > 0.2 OR min_sim < 0.4 → 標記 split")
print(" 純統計方法,無需模型")
# ============================================================
# Check 4: Temporal Collision
# ============================================================
# Self-join of the detections on (file, frame): a row is produced whenever two
# different traces that resolve to the same identity appear in the same frame.
# Only the first 10 such rows, ordered by frame, are reported.
for banner_line in ("", "=" * 70, "Check 4: 時序衝突 (Temporal Collision)", "=" * 70):
    print(banner_line)

collision_query = f"""
SELECT i.name, a.trace_id, a.frame_number AS a_frame, b.trace_id AS b_trace, b.frame_number AS b_frame
FROM {SCHEMA}.face_detections a
JOIN {SCHEMA}.face_detections b ON a.file_uuid=b.file_uuid AND a.frame_number=b.frame_number AND a.trace_id<b.trace_id
JOIN {SCHEMA}.identities i ON a.identity_id=i.id AND b.identity_id=i.id
WHERE a.file_uuid=%s AND a.identity_id IS NOT NULL
ORDER BY a.frame_number LIMIT 10
"""
cur.execute(collision_query, (FILE_UUID,))
collisions = cur.fetchall()

if not collisions:
    print(" ✅ No temporal collisions detected")
else:
    print(" ⚠️ 同一 identity 的 trace 出現在同一幀:")
    # Both sides of the join share the frame number, so only one is printed.
    for identity_name, first_trace, shared_frame, second_trace, _ in collisions:
        print(f" {identity_name}: trace {first_trace} & {second_trace} at frame {shared_frame}")

print()
print(" 建議: 純 SQL 檢測,發現碰撞 → 自動 split into separate identities")

# All database work is finished; release the cursor, then the connection.
cur.close()
conn.close()
# ============================================================
# Summary
# ============================================================
# The recommendation table is kept as data and rendered through one shared
# row layout, so the header and every row use the same column widths.
summary_layout = " {:<25} {:<20} {:<12} {:<10} {}"
summary_rows = [
    ("1.取樣密度", "SQL + swift_face", "Apple Vision", "250fps", "✅ 已整合"),
    ("2.人臉驗證", "DeepFace analyze", "AgeNet", "0.2s/face", "✅ MIT license"),
    ("3.Embedding 品質", "numpy statistics", "None", "instant", "✅ 純計算"),
    ("4.時序衝突", "SQL JOIN", "None", "instant", "✅ 純查詢"),
    ("5.Speaker 一致性", "SQL + overlap", "None", "instant", "✅ 後續追加"),
]

print()
print("=" * 70)
print("選型建議總結")
print("=" * 70)
print()
print(summary_layout.format("檢查", "技術", "模型", "速度", "可行性"))
print(" " + "-" * 70)
for summary_row in summary_rows:
    print(summary_layout.format(*summary_row))
print()
print(" 唯一需要外部模型的: Check 2 (DeepFace, MIT, 0.2s/face)")
print(" 其他全為純 SQL/統計,可立即實作")