From ffc30d737748b5c922505d3a80f88fcb1999b2c7 Mon Sep 17 00:00:00 2001 From: Accusys Date: Wed, 13 May 2026 20:00:47 +0800 Subject: [PATCH] M4 handover: coordinate fixes, detector registry, deploy v2, YOLOv8s, identity lifecycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix swift_pose/swift_ocr Y-flip bugs (BUG-003~006) - Add heuristic_scene module + post-processing trigger (replaces Places365) - YOLOv5nu → YOLOv8s CoreML (+33% detections, +390% scene indicators) - Per-table SQL export (split 4.7GB single file → 478MB max per table) - Version/build check in deploy.sh (compare /health vs file_info.json) - Add file_uuid column to identities table + backfill - Identity pre-clean step in deploy (avoids UNIQUE conflicts on re-deploy) - Stranger_xxx naming fix with UUID context - Add DETECTOR_REGISTRY.md (25 detectors), DETECTOR_SELECTION_SOP.md - Update SPATIAL_COORDINATE_REGISTRY.md (P layer, 6-layer architecture) - New IDENTITY_LIFECYCLE.md - M4 response docs for deploy_script_fix and 111614 test report --- build.rs | 10 + docs_v1.0/M4_HANDOVER/deploy.sh | 87 ++- docs_v1.0/M4_HANDOVER/export_file_package.py | 2 + docs_v1.0/M4_HANDOVER/identity_bind.py | 7 +- .../2026-05-13_111614_test_report.md | 107 ++++ .../2026-05-13_deploy_script_response.md | 55 ++ docs_v1.0/REFERENCE/DETECTOR_REGISTRY.md | 602 ++++++++++++++++++ docs_v1.0/REFERENCE/DETECTOR_SELECTION_SOP.md | 238 +++++++ docs_v1.0/REFERENCE/IDENTITY_LIFECYCLE.md | 161 +++++ .../REFERENCE/SPATIAL_COORDINATE_REGISTRY.md | 267 ++++++++ experiments/identity_clustering/runner_v2.py | 4 +- scripts/deploy_package.sh | 87 ++- scripts/export_file_package.py | 14 +- scripts/export_sqlite.py | 2 +- scripts/identity_bind.py | 8 +- scripts/match_identities_to_tmdb.py | 133 ++++ scripts/swift_processors/swift_ocr.swift | 13 +- scripts/swift_processors/swift_pose.swift | 22 +- src/api/server.rs | 4 + src/bin/release.rs | 181 ++++-- src/core/db/postgres_db.rs | 2 +- 
src/core/person_identity.rs | 14 +- src/core/processor/heuristic_scene.rs | 292 +++++++++ src/core/processor/mod.rs | 4 + src/worker/job_worker.rs | 21 + 25 files changed, 2219 insertions(+), 118 deletions(-) create mode 100644 docs_v1.0/M4_workspace/2026-05-13_111614_test_report.md create mode 100644 docs_v1.0/M4_workspace/2026-05-13_deploy_script_response.md create mode 100644 docs_v1.0/REFERENCE/DETECTOR_REGISTRY.md create mode 100644 docs_v1.0/REFERENCE/DETECTOR_SELECTION_SOP.md create mode 100644 docs_v1.0/REFERENCE/IDENTITY_LIFECYCLE.md create mode 100644 docs_v1.0/REFERENCE/SPATIAL_COORDINATE_REGISTRY.md create mode 100644 scripts/match_identities_to_tmdb.py create mode 100644 src/core/processor/heuristic_scene.rs diff --git a/build.rs b/build.rs index 43fd11b..e86a3be 100644 --- a/build.rs +++ b/build.rs @@ -1,4 +1,14 @@ fn main() { let version = std::env::var("CARGO_PKG_VERSION").unwrap_or_else(|_| "unknown".to_string()); + + let git_hash = std::process::Command::new("git") + .args(["rev-parse", "--short", "HEAD"]) + .output() + .ok() + .and_then(|o| String::from_utf8(o.stdout).ok()) + .map(|s| s.trim().to_string()) + .unwrap_or_else(|| "unknown".to_string()); + println!("cargo:rustc-env=BUILD_VERSION={}", version); + println!("cargo:rustc-env=BUILD_GIT_HASH={}", git_hash); } diff --git a/docs_v1.0/M4_HANDOVER/deploy.sh b/docs_v1.0/M4_HANDOVER/deploy.sh index 7120140..f57d1b7 100644 --- a/docs_v1.0/M4_HANDOVER/deploy.sh +++ b/docs_v1.0/M4_HANDOVER/deploy.sh @@ -15,10 +15,38 @@ echo "=== Momentry Package Deploy ===" echo "UUID: $UUID" echo "Time: $(date '+%Y-%m-%d %H:%M:%S')" echo "" +echo "=== Momentry Package Deploy ===" +echo "UUID: $UUID" +echo "Time: $(date '+%Y-%m-%d %H:%M:%S')" +echo "" + +# 0. Version & build compatibility check +echo "[0/8] Checking system version and build..." 
+PKG_VER=$(python3 -c "import json; f=json.load(open('$DIR/file_info.json')); print(f.get('momentry_version','?'))") +PKG_BUILD=$(python3 -c "import json; f=json.load(open('$DIR/file_info.json')); print(f.get('momentry_build','?'))") +SRV=$(curl -sf http://localhost:3003/health | python3 -c " +import json,sys +d=json.load(sys.stdin) +print(d.get('version','unknown'), d.get('build_git_hash','unknown')) +" 2>/dev/null || echo "down down") +SRV_VER=$(echo "$SRV" | cut -d' ' -f1) +SRV_BUILD=$(echo "$SRV" | cut -d' ' -f2) +if [ "$SRV_VER" = "down" ]; then + echo " ⚠️ Cannot reach server at localhost:3003, skipping version check" +elif [ "$SRV_VER" != "$PKG_VER" ] || [ "$SRV_BUILD" != "$PKG_BUILD" ]; then + echo " ❌ Mismatch:" + echo " Package Server" + echo " Version: $PKG_VER $SRV_VER" + echo " Build: $PKG_BUILD $SRV_BUILD" + echo "" + echo " Please obtain the matching system upgrade package." + exit 1 +else + echo " ✅ Server v$SRV_VER (build $SRV_BUILD) matches package" +fi # 1. Verify package integrity -echo "[1/5] Verifying package..." -REQUIRED_FILES=("data.sql" "file_info.json") +echo "[1/8] Verifying package..." MISSING=0 for f in "${REQUIRED_FILES[@]}"; do if [ ! -f "$DIR/$f" ]; then @@ -32,28 +60,38 @@ if [ $MISSING -eq 1 ]; then fi echo " ✅ Package verified" -# 2. Import data.sql -echo "[2/5] Importing DB data..." -"$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -f "$DIR/data.sql" 2>&1 | tail -3 +# 2. Pre-clean: remove existing identities for this file (avoids UNIQUE(name) conflicts on COPY) +echo "[2/8] Pre-cleaning existing identities for this file..." +"$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -c "DELETE FROM dev.identities WHERE file_uuid = '$UUID'" > /dev/null 2>&1 +echo " ✅ Cleared identities for $UUID" + +# 3. Import data.sql (uses \i to load per-table files from sql/) +echo "[3/8] Importing DB data..." +(cd "$DIR" && "$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -f data.sql 2>&1) | tail -5 echo " ✅ Data imported" -# 3. Copy video to demo dir +# 4. 
Copy video to demo dir (only this package's video, not scanning others) VIDEO_FILE=$(ls "$DIR"/*.mp4 "$DIR"/*.mov "$DIR"/*.avi "$DIR"/*.mkv 2>/dev/null | head -1) if [ -n "$VIDEO_FILE" ]; then VIDEO_NAME=$(basename "$VIDEO_FILE") DEST="$DEMO_DIR/$VIDEO_NAME" if [ ! -f "$DEST" ]; then cp "$VIDEO_FILE" "$DEST" - echo "[3/5] Video copied: $VIDEO_NAME → $DEMO_DIR" + echo "[4/8] Video copied: $VIDEO_NAME → $DEMO_DIR" else - echo "[3/5] Video already in demo dir, skipping" + echo "[4/8] Video already in demo dir, skipping" fi else - echo "[3/5] No video file in package, skipping" + echo "[4/8] No video file in package, skipping" fi -# 4. Copy output files -echo "[4/5] Copying output files..." +# 5. Set video status to completed (package is fully processed) +echo "[5/8] Setting deployment status..." +"$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -c "UPDATE dev.videos SET status = 'completed' WHERE file_uuid = '$UUID'" > /dev/null 2>&1 +echo " ✅ Status set to 'completed'" + +# 6. Copy output files +echo "[6/8] Copying output files..." COPIED=0 for f in "$DIR"/*.json "$DIR"/*.sqlite "$DIR"/*.sqlite; do if [ -f "$f" ]; then @@ -66,20 +104,25 @@ for f in "$DIR"/*.json "$DIR"/*.sqlite "$DIR"/*.sqlite; do done echo " ✅ $COPIED files copied to $OUTPUT_DIR" -# 5. Verify deployment -echo "[5/5] Verifying deployment..." -CHUNKS=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.chunk WHERE file_uuid='$UUID' AND chunk_type='sentence'" 2>/dev/null || echo "?") +# 7. Verify deployment +echo "[7/8] Verifying deployment..." 
+CHUNKS=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.chunk WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") FACES=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.face_detections WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") +IDENTS=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.identities WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") +TKG_NODES=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.tkg_nodes WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") +TKG_EDGES=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.tkg_edges WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") echo "" echo "=== Deploy Complete ===" -echo " UUID: $UUID" -echo " Chunks: $CHUNKS" -echo " Faces: $FACES" -echo " Output: $OUTPUT_DIR/" +echo " UUID: $UUID" +echo " Chunks: $CHUNKS" +echo " Faces: $FACES" +echo " Identities: $IDENTS" +echo " TKG nodes: $TKG_NODES" +echo " TKG edges: $TKG_EDGES" +echo " Output: $OUTPUT_DIR/" echo "" -echo "Next: trigger pipeline processing" -echo " curl -X POST http://localhost:3003/api/v1/file/$UUID/process" +echo "Package is self-contained — no further processing needed." 
echo "" -echo "Or open the offline report:" -echo " python3 render_offline_report.py $OUTPUT_DIR/$UUID.sqlite" +echo "Offline report:" +echo " python3 scripts/render_offline_report.py $OUTPUT_DIR/$UUID.sqlite" diff --git a/docs_v1.0/M4_HANDOVER/export_file_package.py b/docs_v1.0/M4_HANDOVER/export_file_package.py index 7db4687..69e5fa5 100644 --- a/docs_v1.0/M4_HANDOVER/export_file_package.py +++ b/docs_v1.0/M4_HANDOVER/export_file_package.py @@ -13,6 +13,8 @@ TABLES = [ ("dev.chunk", "file_uuid"), ("dev.chunk_vectors", "uuid"), ("dev.face_detections", "file_uuid"), + ("dev.tkg_nodes", "file_uuid"), + ("dev.tkg_edges", "file_uuid"), ] def main(): diff --git a/docs_v1.0/M4_HANDOVER/identity_bind.py b/docs_v1.0/M4_HANDOVER/identity_bind.py index b35145d..714a9c5 100644 --- a/docs_v1.0/M4_HANDOVER/identity_bind.py +++ b/docs_v1.0/M4_HANDOVER/identity_bind.py @@ -77,10 +77,11 @@ for cluster_id in sorted(set(labels)): # Create new identity identity_uuid = None cur.execute(""" - INSERT INTO dev.identities (name, identity_type, source, status, created_at) - VALUES (%s, 'face', 'auto', 'active', NOW()) + INSERT INTO dev.identities (name, identity_type, source, status, created_at, file_uuid) + VALUES (%s, 'face', 'auto', 'active', NOW(), %s) + ON CONFLICT (name) DO UPDATE SET status = 'active', file_uuid = COALESCE(dev.identities.file_uuid, %s) RETURNING id - """, (f"PERSON_{cluster_id}",)) + """, (f"PERSON_{UUID[:8]}_{cluster_id}", UUID, UUID)) identity_id = cur.fetchone()[0] cluster_to_identity[cluster_id] = identity_id print(f" Cluster {cluster_id}: new identity {identity_id} (PERSON_{cluster_id})") diff --git a/docs_v1.0/M4_workspace/2026-05-13_111614_test_report.md b/docs_v1.0/M4_workspace/2026-05-13_111614_test_report.md new file mode 100644 index 0000000..84063b6 --- /dev/null +++ b/docs_v1.0/M4_workspace/2026-05-13_111614_test_report.md @@ -0,0 +1,107 @@ +# V2.0.0 Package v20260513_111614 — 測試報告 + +**Date**: 2026-05-13 +**From**: M5 +**To**: M4 +**Package**: 
`aeed71342a899fe4b4c57b7d41bcb692_v20260513_111614.tar.gz` +**Version**: 1.0.0 (build d34bcae) + +--- + +## 與上一包 (v20260512_224100) 的差異 + +| 項目 | 224100 (前一包) | 111614 (本包) | 說明 | +|------|:------------:|:------------:|------| +| **data.sql** | 單檔 4.7GB | **split sql/ 目錄** 478MB | 逐 table 匯入,解決 psql OOM | +| **版本比對** | ❌ 無 | **✅** | Step 0/7 檢查 version + build | +| **TKG 匯出** | ❌ 漏 | **✅** | `dev.tkg_nodes` + `dev.tkg_edges` | +| **identity_bindings** | 跨 file 混入 | **✅** 只含此 file 的 trace | `ib.identity_value IN (trace_id from this file)` | +| **deploy.sh** | 5 步驟 | **7 步驟** | 含版本檢查、status=completed、TKG 驗證 | +| **系統版號** | 無 | `momentry_version` + `momentry_build` | 於 `file_info.json` 及 deploy 檢查 | +| **輸出檔** | 部分 | **完整** | `*.json` + `*.sqlite` + `face.json` (修復版) | + +--- + +## 包內容 + +``` +aeed71342a899fe4b4c57b7d41bcb692/ +├── file_info.json (version + build) +├── data.sql (→ \i sql/*.sql) +├── sql/ +│ ├── master.sql +│ ├── dev_videos.sql (1 row) +│ ├── dev_chunk.sql (2,407 rows) +│ ├── dev_chunk_vectors.sql (37 MB, 768D) +│ ├── dev_face_detections.sql (431 MB, 70,691 rows, 512D embedding) +│ ├── dev_identities.sql (424 rows) +│ ├── dev_identity_bindings.sql (7,629 rows) +│ ├── dev_tkg_nodes.sql (6,457 rows) +│ └── dev_tkg_edges.sql (21,028 rows) +├── deploy.sh +├── verify.sh +├── *.face.json (已修復 landmark → 100% pass) +├── *.sqlite (含 vec0 向量表) +├── *.yolo.json, *.asr.json, *.asrx.json, *.cut.json, ... +└── Charade (1963) ... .mp4 +``` + +--- + +## 部署測試流程 + +```bash +# 1. 解包 +mkdir -p /tmp/test_deploy && cd /tmp/test_deploy +tar xzf /Users/accusys/momentry_core_0.1/release/files/aeed71342a899fe4b4c57b7d41bcb692_v20260513_111614.tar.gz +cd aeed71342a899fe4b4c57b7d41bcb692 + +# 2. 執行 deploy.sh +bash deploy.sh +``` + +### 預期 Step 0 結果(版本檢查) + +``` +[0/7] Checking system version and build... + ✅ Server v1.0.0 (build d34bcae) matches package +``` + +若 server 版本不符: +``` +[0/7] Checking system version and build... 
+ ❌ Mismatch: + Package Server + Version: 1.0.0 0.9.0 + Build: d34bcae a1b2c3d + + Please obtain the matching system upgrade package. +``` + +### 預期完成 + +``` +[1/7] Verifying package... ✅ +[2/7] Importing DB data... ✅ (逐 \i 匯入,不 OOM) +[3/7] Copy video... ✅ +[4/7] Set status=completed ✅ +[5/7] Copy output files... ✅ +[6/7] Verify deployment... + Chunks: 2407 + Faces: 70691 + Identities: 424 + TKG nodes: 6457 + TKG edges: 21028 +``` + +--- + +## 已知問題 + +| # | 問題 | 狀態 | +|---|------|:--:| +| 1 | data.sql 的 \i 路徑需 deploy.sh cd 到包目錄方可正確尋找 | 已解決(`(cd "$DIR" && psql -f data.sql)`) | +| 2 | identity_bindings 查詢若無 `SELECT DISTINCT` 會 JOIN explosion | 已解決(加 DISTINCT,7.6K→7.6K) | +| 3 | psql -f 單檔會載入全部內容到記憶體 | 已解決(最大單檔 431MB,可承受) | +| 4 | Night scene segmentation 仍有限(MaskFormer 夜間水體不可見) | 下一版處理 | +| 5 | COCO 80 thing class 不足,缺 desk/window/wall/water 等 stuff | 下一版評估 COCO-Stuff | diff --git a/docs_v1.0/M4_workspace/2026-05-13_deploy_script_response.md b/docs_v1.0/M4_workspace/2026-05-13_deploy_script_response.md new file mode 100644 index 0000000..710b2ca --- /dev/null +++ b/docs_v1.0/M4_workspace/2026-05-13_deploy_script_response.md @@ -0,0 +1,55 @@ +# Deploy Script Fix — 回覆 + +**Date**: 2026-05-13 +**From**: M5 +**To**: M4 +**Ref**: `2026-05-13_deploy_script_fix.md`, `2026-05-13_deploy_sh_remaining.md` + +--- + +## 已修正 + +| # | M4 問題 | 狀態 | +|---|---------|:--:| +| 1 | 影片複製區塊重複(行 54-65) | ✅ 刪除第二段 | +| 2 | 匯入後未設 `status = 'completed'` | ✅ 新增 Step 4/7 | +| 3 | 提示「trigger pipeline」— 內容包已處理完 | ✅ 改為「Package is self-contained」 | + +--- + +## 新增功能 + +| 功能 | 說明 | +|------|------| +| **版本比對** | Step 0/7 新增,讀取 `file_info.json.momentry_version` 比對 `/health` → `version` | +| **Build 比對** | 同上,比對 `momentry_build` 與 `build_git_hash`(git commit) | +| **版本資訊源** | `Cargo.toml` → `build.rs` → `BUILD_VERSION` + `BUILD_GIT_HASH` | +| **API 擴充** | `/health` 及 `/health/detailed` 新增 `build_git_hash` 欄位 | + +--- + +## 相關檔案 + +| 檔案 | 說明 | +|------|------| +| `scripts/deploy_package.sh` | 主 deploy 
script(7 步驟) | +| `scripts/export_file_package.py` | 同步加入 `momentry_version` + `momentry_build` | +| `src/bin/release.rs` | `file_info.json` 產生時寫入版本/build | +| `src/api/server.rs` | `/health` 回應新增 `build_git_hash` | +| `build.rs` | 自動抓取 git commit hash 作為 BUILD_GIT_HASH | +| `docs_v1.0/M4_HANDOVER/deploy.sh` | 已同步 HANDOVER 版本 | + +--- + +## 流程 + +``` +[0/7] Checking system version and build... ← 新增 + ↓ +[1/7] Verifying package... +[2/7] Importing DB data... +[3/7] Copy video... +[4/7] Set status=completed ← 新增 +[5/7] Copy output files +[6/7] Verify deployment (chunks, faces, identities, TKG) +``` diff --git a/docs_v1.0/REFERENCE/DETECTOR_REGISTRY.md b/docs_v1.0/REFERENCE/DETECTOR_REGISTRY.md new file mode 100644 index 0000000..9716d2f --- /dev/null +++ b/docs_v1.0/REFERENCE/DETECTOR_REGISTRY.md @@ -0,0 +1,602 @@ +# Momentry Core — Detector Registry + +**Date**: 2026-05-13 +**Version**: 1.0 +**Purpose**: 所有模型/演算法檢測器的座標約定、轉換鏈、驗證狀態統整 + +--- + +## 原則 + +1. **每 detector 一條**:獨立記錄輸入/輸出格式、座標原點、單位、轉換公式。 +2. **原始座標系標註**:不隱藏轉換,任何異於 Top-Left pixel 的輸出必須明列。 +3. **轉換鏈可追溯**:從 detector 原始輸出到入庫欄位,每一步轉換都記錄。 +4. 
**驗證狀態三級**:`verified`(已測試) / `assumed`(文檔推斷,未實測) / `buggy`(已知有誤)。 +--- + +## 分類總覽 + +| Category | 數量 | Active | Experimental | Deprecated | +|----------|:----:|:------:|:----------:|:--------:| +| face | 8 | 2 | 4 | 2 | +| body | 3 | 1 | 2 | 0 | +| object | 4 | 1 | 3 | 0 | +| text | 3 | 1 | 2 | 0 | +| speech | 3 | 2 | 1 | 0 | +| scene | 2 | 1 | 0 | 1 | +| stamps | 2 | 0 | 2 | 0 | +| **Total** | **25** | **8** | **14** | **3** | + +| Status | 定義 | +|:------:|------| +| **Active** | 生產 pipeline 中執行,`ProcessorType` 有註冊,產出被消費 | +| **Experimental** | 獨立腳本或 CLI,不連 pipeline;評估中或備用 | +| **Deprecated** | 評估後棄用;或已被新版取代但未從 codebase 移除 | + +--- + +## Pipeline Status Quick-Reference + +| # | Detector ID | Short Name | Pipeline Status | Reason | +|---|-------------|-----------|:-----:|--------| +| 1 | DET-CUT-001 | PySceneDetect | active | CUT processor | +| 2 | DET-SCN-001 | Places365 | **active but rejected** ⚠️ | M5 eval rejected; never removed from ProcessorType | +| 3 | DET-SPCH-001 | faster-whisper | active | ASR processor | +| 4 | DET-SPCH-003 | ECAPA-TDNN | active | ASRX speaker embedding | +| 5 | DET-OBJ-001 | YOLOv8s | active | YOLO processor (v5nu→v8s, 2026-05-13) | +| 6 | DET-TEXT-001 | swift_ocr | active | OCR processor (primary) | +| 7 | DET-FACE-001/002/003 | swift_face + FaceNet | active | Face processor | +| 8 | DET-BODY-001/002 | swift_pose + YOLOv8-pose | active | Pose processor (primary + fallback) | +| 9 | DET-FACE-006 | AgglomerativeClustering | active | Identity Agent (post-processing) | +| 10 | DET-TEXT-005 | llama.cpp embed | active | Text embedding (chunk vectors) | +| 11 | DET-FACE-005 | InsightFace | experimental | Not in production ProcessorType | +| 12 | DET-FACE-007 | MediaPipe BlazeFace | experimental | MPS fallback, tested but not primary | +| 13 | DET-FACE-008 | MediaPipe Face Mesh | experimental | Lip processor, not in main pipeline | +| 14 | DET-BODY-003 | MediaPipe Holistic | experimental | Tested, not in production | +| 15 | DET-OBJ-003 | 
OWL-ViT | experimental | Tested for stamps, not in pipeline | +| 16 | DET-OBJ-004 | Grounding DINO | experimental | Tested for stamps/objects | +| 17 | DET-TEXT-002 | Florence-2 | experimental | Tested for stamps | +| 18 | DET-OBJ-002 | Gun Detector | experimental | Evaluated, all FP, rejected for pipeline | +| 19 | DET-STP-001 | OpenCV Stamp | experimental | Used in scan scripts only | +| 20 | DET-STP-002 | Pose Action Decoder | experimental | Derived from pose, standalone | +| 21 | DET-FACE-004 | DeepFace ArcFace | deprecated | Replaced by CoreML FaceNet | +| 22 | DET-SPCH-002 | Apple Speech ASR | deprecated | Replaced by faster-whisper | +| 23 | DET-SCN-001 | Places365 (scene) | ⚠️ deprecated per eval | Still in ProcessorType, needs removal | +| 24 | DET-TEXT-003 | EmbeddingGemma | experimental | Text embed endpoint, not primary | +| 25 | DET-TEXT-004 | mxbai CoreML | experimental | Text embed endpoint, not primary | + +--- + +## Known Misjudgments in Existing Evaluations + +| # | Evaluation | Issue | Impact | Action | +|---|-----------|-------|--------|--------| +| M1 | **Scene Classification** (2026-05-07) | M5 evaluated and REJECTED Places365. But it was never removed from `ProcessorType::all()`. Still runs on every file. | Wastes ~2min per registration. Produces meaningless scene.json. | Remove from pipeline or re-evaluate | +| M2 | **Face Processor** benchmark (2026-04-28) | Compared InsightFace vs MediaPipe vs OpenCV vs Contract v1. But the final pipeline uses **swift_face + FaceNet**, a completely different solution not in the benchmark. | Selection criteria from benchmark don't apply to actual pipeline detector. | Document the actual selection decision for swift_face | +| M3 | **Gun Detector** (2026-05-07) | Properly rejected: 7/7 FP. Correct decision. Model files still in repo. | No impact (correctly excluded). Clean up model files. | Archive or remove `models/gun/` | +| M4 | **OCR processor** | No selection document exists. 
swift_ocr chosen without comparison against EasyOCR/PaddleOCR. | Unknown if optimal. PaddleOCR fallback may never trigger. | Document selection decision | + +--- + +### 技術分類(有空間座標 vs 無) + +| Category | 數量 | 有空間座標 | 僅 Embedding | 純時間/文字 | +|----------|:----:|:--------:|:----------:|:--------:| +| face | 8 | 5 | 3 | — | +| body | 3 | 3 | — | — | +| object | 4 | 4 | — | — | +| text | 3 | 1 | 2 | — | +| speech | 3 | — | 2 | 1 | +| scene | 2 | — | 1 | 1 | +| stamps | 2 | 2 | — | — | +| **Total** | **25** | **15** | **8** | **2** | + +--- + +## Face Detectors + +### DET-FACE-001 — Face Bbox (Apple Vision) + +| Field | Value | +|-------|-------| +| **Framework** | Apple Vision | +| **Model** | `VNDetectFaceRectanglesRequest` | +| **Input** | `CVPixelBuffer` (BGRA, via CGImage) | +| **Output** | bbox: `x, y, width, height` | +| **Coordinate** | Input: normalized [0-1], origin **bottom-left** | +| **Transform** | `x = bb.origin.x * imgW` | +| | `y = (1.0 - bb.origin.y - bb.size.height) * imgH` | +| **Image size** | `cgImage.width / cgImage.height` | +| **Target** | Top-Left pixel integer | +| **File** | `scripts/swift_processors/swift_face.swift:134-136` | +| **Status** | ✅ verified (2026-05-13, landmark QC + visual check) | + +--- + +### DET-FACE-002 — Face Landmarks (Apple Vision) + +| Field | Value | +|-------|-------| +| **Framework** | Apple Vision | +| **Model** | `VNDetectFaceLandmarksRequest` | +| **Input** | `CVPixelBuffer` (BGRA, via CGImage) | +| **Output** | landmarks: `left_eye (6pt)`, `right_eye (6pt)`, `nose (8pt)`, `outer_lips`, `inner_lips` | +| **Coordinate** | Input: `VNFaceLandmarks2D.pointsInImage(imageSize:)` | +| | Returned: macOS AppKit convention → **bottom-left** origin ⚠️ | +| **Transform** | `y_top_left = imgH - $0.y` (Y-flip) | +| **Image size** | `cgImage.width / cgImage.height` | +| **Target** | Top-Left pixel float → JSON | +| **Pairing** | Not by array index. Landmark observations used as primary source (self-consistent bbox + landmarks). 
Face rect observations deduplicated via IoU > 0.3. | +| **File** | `scripts/swift_processors/swift_face.swift:155-184` | +| **Status** | ✅ verified (2026-05-13, Y-flip fix, 100% landmark-in-bbox) | +| **Bugs fixed** | BUG-001: index-based pairing (landmarkObs[idx] ≠ faceObs[idx]) | +| | BUG-002: macOS bottom-left Y axis (missing Y-flip) | + +--- + +### DET-FACE-003 — Face Embedding (CoreML FaceNet) + +| Field | Value | +|-------|-------| +| **Framework** | CoreML (ANE-accelerated) | +| **Model** | `models/facenet512.mlpackage` | +| **Input** | Face crop 160×160, RGB, normalized `[-1, 1]` | +| **Output** | 512-dim float embedding | +| **Coordinate** | N/A (no spatial output). Bbox from DET-FACE-001 used for crop. | +| **File** | `scripts/face_processor.py`, `scripts/embed_faces.py`, `scripts/tmdb_embed_extractor.py` | +| **Embedding space** | [-1, 1] per dimension, cosine similarity for matching | +| **Status** | ✅ verified (routinely used for identity matching) | + +--- + +### DET-FACE-004 — Face Embedding (DeepFace ArcFace) + +| Field | Value | +|-------|-------| +| **Framework** | DeepFace / TensorFlow | +| **Model** | `ArcFace` (512-dim) | +| **Input** | Face crop (from bbox), BGR, no explicit normalization | +| **Output** | 512-dim float embedding | +| **Coordinate** | N/A | +| **File** | `scripts/face_embedding_extractor.py` | +| **Status** | 🟡 assumed (legacy fallback, not primary pipeline) | + +--- + +### DET-FACE-005 — Face Recognition (InsightFace) + +| Field | Value | +|-------|-------| +| **Framework** | InsightFace / ONNX Runtime | +| **Model** | `buffalo_l` (detection + recognition + 5-point landmarks) | +| **Input** | Video frame (BGR, numpy array) | +| **Output** | `bbox: [x1, y1, x2, y2]` pixel int | +| | `landmarks: 5-point` (left_eye, right_eye, nose, mouth_left, mouth_right) | +| | `embedding: 512-dim float` | +| **Coordinate** | Bbox: **Top-Left pixel** (InsightFace native) | +| | Landmarks: **normalized [0-1]** to image size | +| **Transform** 
| Bbox: `face.bbox.astype(int)` — direct | +| | Landmarks: `kps * imgW, kps * imgH` — needs manual conversion ⚠️ | +| **File** | `scripts/face_recognition_processor.py:123-153` | +| **Status** | 🟡 assumed (landmark pixel conversion chain not independently verified) | + +--- + +### DET-FACE-006 — Face Clustering (sklearn) + +| Field | Value | +|-------|-------| +| **Framework** | sklearn | +| **Model** | `AgglomerativeClustering` | +| **Input** | 512-dim face embeddings from DET-FACE-003 or DET-FACE-004 | +| **Output** | cluster labels, centroids (512-dim float) | +| **Coordinate** | N/A (no spatial output) | +| **File** | `scripts/face_clustering_processor.py`, `scripts/identity_bind.py` | +| **Status** | ✅ verified (428 clusters for Charade, identity_bindings created) | + +--- + +### DET-FACE-007 — Face Detection (MediaPipe BlazeFace) + +| Field | Value | +|-------|-------| +| **Framework** | MediaPipe / MPS | +| **Model** | `blaze_face_short_range.tflite` | +| **Input** | Frame (numpy array / MPS image) | +| **Output** | `bbox: [x, y, width, height]` pixel | +| | `6 keypoints`: eyes, nose tip, mouth center, ear tragions — **pixel** | +| **Coordinate** | **Top-Left pixel** (MediaPipe native) | +| **Transform** | Direct, no conversion needed | +| **File** | `scripts/face_processor_mps.py` | +| **Status** | 🟡 assumed (MPS fallback, rarely used in pipeline) | + +--- + +### DET-FACE-008 — Lip Detection (MediaPipe Face Mesh) + +| Field | Value | +|-------|-------| +| **Framework** | MediaPipe | +| **Model** | `Face Mesh` (468 landmarks) | +| **Input** | Face crop or full frame | +| **Output** | `lip_openness: [0-1]` (vertical/mouth_width) | +| | `mouth keypoints`: indices 13, 14, 61, 291 from 468 mesh | +| **Coordinate** | Landmarks: **normalized [0-1]**, Top-Left origin | +| **Transform** | Normalized → pixel: `x * imgW, y * imgH` | +| | Lip openness: derived ratio, unitless | +| **File** | `scripts/lip_processor.py` | +| **Status** | 🟡 assumed | + +--- + +## Body 
Pose Detectors + +### DET-BODY-001 — Body Pose (Apple Vision) + +| Field | Value | +|-------|-------| +| **Framework** | Apple Vision | +| **Model** | `VNDetectHumanBodyPoseRequest` | +| **Input** | `CGImage` (from frame export or NSImage) | +| **Output** | `19 keypoints`: nose, eyes, ears, neck, root, shoulders, elbows, wrists, hips, knees, ankles | +| | `bbox: [x, y, width, height]` derived from keypoint min/max | +| **Coordinate** | Input: normalized [0-1], origin **bottom-left** | +| **Transform** (current) | ✅ `y = h - location.y * h` — Y-flip applied | +| **Transform** (correct) | `y = h - location.y * h` | +| **Image size** | `cgImage.width / cgImage.height` | +| **Target** | Top-Left pixel float | +| **File** | `scripts/swift_processors/swift_pose.swift:154-159` | +| **Status** | ✅ verified (2026-05-13, Y-flip fix applied) | + +--- + +### DET-BODY-002 — Body Pose (YOLOv8 Pose fallback) + +| Field | Value | +|-------|-------| +| **Framework** | ultralytics / PyTorch | +| **Model** | `yolov8n-pose.pt` | +| **Input** | Frame (PIL or numpy) | +| **Output** | `17 COCO keypoints`: nose, eyes, ears, shoulders, elbows, wrists, hips, knees, ankles | +| | `bbox: [x, y, width, height]` derived from keypoints (conf > 0.1) | +| **Coordinate** | **Top-Left pixel** (YOLO native, `.xy[0]` → numpy float) | +| **Transform** | Direct: `x, y = float(kps[j][0]), float(kps[j][1])` | +| | Bbox: `min(xs), min(ys), max(xs)-min(xs), max(ys)-min(ys)` | +| **File** | `scripts/pose_processor.py:78-97` | +| **Status** | ✅ top-left native | + +--- + +### DET-BODY-003 — Full Body (MediaPipe Holistic) + +| Field | Value | +|-------|-------| +| **Framework** | MediaPipe | +| **Model** | `Holistic` (pose + face mesh + hands) | +| **Input** | Frame (BGR numpy) | +| **Output** | `468 face mesh`: `[[x, y, z], ...]` normalized [0-1] | +| | `33 body pose`: `[[x, y, z, visibility], ...]` normalized [0-1] | +| | `21 hand × 2`: `[[x, y, z], ...]` normalized [0-1] | +| **Coordinate** | **normalized 
[0-1]**, Top-Left origin | +| **Transform** | `x * imgW, y * imgH` → pixel (if needed) | +| | Z: depth relative, not metric | +| **File** | `scripts/mediapipe_holistic_processor.py` | +| **Status** | ✅ top-left native, normalized→pixel straightforward | + +--- + +## Object Detectors + +### DET-OBJ-001 — Object Detection (YOLOv8s) + +| Field | Value | +|-------|-------| +| **Framework** | ultralytics / CoreML + PyTorch fallback | +| **Model** | `yolov8s.mlpackage` (primary, CoreML ANE), `yolov8s.pt` (fallback) | +| **mAP (COCO)** | 44.9 (was 34.3 with YOLOv5nu, +31%) | +| **Input** | Frame (PIL or numpy) | +| **Output** | `bbox: [x1, y1, x2, y2]` — float pixel | +| | `class_name, class_id` (80 COCO classes) | +| | `confidence: [0-1]` | +| **Coordinate** | **Top-Left pixel** (YOLO `.xyxy[0]` → float) | +| **Transform** | Rust: `x = detection.x1 as i32, y = detection.y1 as i32` — **int truncation** | +| | `width = x2 - x1, height = y2 - y1` | +| **Image size** | YOLO auto-handles via ultralytics inference | +| **File** | `scripts/yolo_processor.py:272-285`, `src/core/processor/yolo.rs:83-117` | +| **Status** | ✅ verified (2026-05-13, replaced YOLOv5nu, +19% detections, scene indicators +162~+473%) | +| **Replaced** | YOLOv5nu (mAP 34.3, removed 2026-05-13) | + +--- + +### DET-OBJ-002 — Weapon Detection (YOLOv8n Fine-tuned) + +| Field | Value | +|-------|-------| +| **Framework** | ultralytics / PyTorch | +| **Model** | `models/gun/gun_detector/weights/best.pt` | +| **Input** | Frame (numpy array) | +| **Output** | `bbox: [x1, y1, x2, y2]` pixel | +| | `class: {0: grenade, 1: knife, 2: pistol, 3: rifle}` | +| **Coordinate** | **Top-Left pixel** (YOLO native) | +| **File** | `scripts/gun_detector_scan.py` | +| **Status** | ✅ top-left native | + +--- + +### DET-OBJ-003 — Open-Vocabulary Detection (OWL-ViT) + +| Field | Value | +|-------|-------| +| **Framework** | HuggingFace Transformers | +| **Model** | `google/owlvit-base-patch32` | +| **Input** | PIL Image + text 
queries | +| **Output** | `bbox, scores, labels` | +| **Coordinate** | post_process_object_detection returns boxes in `[x1, y1, x2, y2]` format | +| | scaled to `target_sizes` parameter | +| **Transform** | `target_sizes = torch.Tensor([image_pil.size[::-1]])` — PIL (w,h) → (h,w) | +| | `box.int().tolist()` or `box.tolist()` → Python list | +| **Format risk** | HuggingFace processor version may return `[cx, cy, w, h]` not `[x1,y1,x2,y2]` | +| **File** | `scripts/test_owl_vit_stamps.py:69-80`, `scripts/magnifying_glass_owl.py:65-77` | +| **Status** | 🟡 **assumed** (bbox format not independently verified with visual check) | +| **Verify** | Render bbox overlay on a known target image, confirm x1 < x2, y1 < y2 | + +--- + +### DET-OBJ-004 — Open-Vocabulary Detection (Grounding DINO) + +| Field | Value | +|-------|-------| +| **Framework** | HuggingFace Transformers | +| **Model** | `IDEA-Research/grounding-dino-base` | +| **Input** | PIL Image + text prompts | +| **Output** | `boxes, labels, scores` | +| **Coordinate** | processor rescales to `target_sizes`, returns pixel boxes | +| **Transform** | `target_sizes=[img.size[::-1]]` — PIL (w,h) → (h,w) | +| | `[round(v, 1) for v in dets["boxes"][i].tolist()]` | +| **Format risk** | `[::-1]` order depends on processor expectations. If processor expects (w,h), axes swapped. 
| +| **File** | `scripts/gdino_frame_api.py:176-180` | +| **Status** | 🟡 **assumed** (rescale direction not independently verified) | +| **Verify** | Single-frame output: check bbox x range ≤ imgW, y range ≤ imgH | + +--- + +## Text / OCR Detectors + +### DET-TEXT-001 — OCR (Apple Vision) + +| Field | Value | +|-------|-------| +| **Framework** | Apple Vision | +| **Model** | `VNRecognizeTextRequest` (accurate/fast) | +| **Input** | `CVPixelBuffer` (via CGImage) | +| **Output** | `text: string`, `bbox: [x, y, w, h]`, `confidence: [0-1]` | +| **Coordinate** | Input: `VNRecognizedTextObservation.boundingBox` — normalized [0-1], origin **bottom-left** | +| **Transform** | ✅ `y = (1.0 - bb.origin.y - bb.size.height) * cgH` — Y-flip applied | +| **Image size** | Main loop: `cgImage.width / cgImage.height` ✅ | +| | `recognizeText()` helper: `CVPixelBufferGetWidth/Height` ✅ | +| **File** | `scripts/swift_processors/swift_ocr.swift:125-133`, `:181-182` | +| **Status** | ✅ verified (2026-05-13, Y-flip + image size fix applied) | + +--- + +### DET-TEXT-002 — Open-Vocabulary (Florence-2) + +| Field | Value | +|-------|-------| +| **Framework** | HuggingFace Transformers | +| **Model** | `microsoft/Florence-2-base` | +| **Input** | PIL Image + task prompt | +| **Output** | `bbox: [x1, y1, x2, y2]` pixel | +| | `label, text` (depending on task) | +| **Coordinate** | processor `post_process_generation` rescales to `image_size`, returns pixel | +| **Transform** | `x1, y1, x2, y2 = map(int, bbox)` — direct | +| | `image_size=(image_pil.width, image_pil.height)` — (w, h) order ✅ | +| **File** | `scripts/florence2_scan_stamps.py:67-79`, `scripts/test_florence2_direct.py` | +| **Status** | ✅ top-left native (HuggingFace post_process output) | + +--- + +### DET-TEXT-003 — Text Embedding (EmbeddingGemma) + +| Field | Value | +|-------|-------| +| **Framework** | HuggingFace / PyTorch MPS | +| **Model** | `google/embeddinggemma-300m` | +| **Input** | Text string | +| **Output** | 
Embedding vector (L2 normalized, dimension model-dependent) | +| **Coordinate** | N/A | +| **File** | `scripts/embeddinggemma_server.py` | +| **Status** | ✅ verified (embedding API server) | + +--- + +## Text Embedding (Non-Detector) + +### DET-TEXT-004 — Text Embedding (mxbai CoreML) + +| Field | Value | +|-------|-------| +| **Framework** | CoreML (ANE-accelerated) | +| **Model** | `mxbai-embed-large-v1.mlpackage` | +| **Input** | Text tokenized | +| **Output** | Embedding vector | +| **Coordinate** | N/A | +| **File** | `scripts/coreml_embed_server.py` | +| **Status** | 🟡 assumed | + +--- + +### DET-TEXT-005 — Text Embedding (Ollama / llama.cpp) + +| Field | Value | +|-------|-------| +| **Framework** | llama.cpp / Ollama API | +| **Model** | llama.cpp embedding endpoint (port 11436) | +| **Input** | Text (optionally prefixed `search_document:`) | +| **Output** | 768-dim float embedding | +| **Coordinate** | N/A | +| **File** | `src/core/embedding/comic_embed.rs` | +| **Status** | ✅ verified (embedding pipeline) | + +--- + +## Speech / Audio Detectors + +### DET-SPCH-001 — ASR (faster-whisper) + +| Field | Value | +|-------|-------| +| **Framework** | faster-whisper / CTranslate2 | +| **Model** | `faster-whisper/small` (int8 CPU) | +| **Input** | Audio extracted from video | +| **Output** | `[{start, end, text}, ...]` — temporal segments (seconds) | +| **Coordinate** | Temporal only (seconds), no spatial | +| **File** | `scripts/asr_processor.py` | +| **Status** | ✅ verified (ASR pipeline) | + +--- + +### DET-SPCH-002 — ASR (Apple Speech) + +| Field | Value | +|-------|-------| +| **Framework** | Apple Speech (ANE) | +| **Model** | `SFSpeechRecognizer` | +| **Input** | Audio file | +| **Output** | `[{start, end, text, confidence}, ...]` — temporal segments | +| **Coordinate** | Temporal only (seconds), no spatial | +| **File** | `scripts/swift_processors/asr_swift.swift` | +| **Status** | 🟡 assumed (Apple Speech quality lower than faster-whisper) | + +--- + +### 
DET-SPCH-003 — Speaker Embedding (ECAPA-TDNN) + +| Field | Value | +|-------|-------| +| **Framework** | SpeechBrain / PyTorch | +| **Model** | `speechbrain/spkrec-ecapa-voxceleb` | +| **Input** | Audio segments per speaker | +| **Output** | `192-dim float embedding` | +| **Coordinate** | N/A (vector space, cosine similarity) | +| **File** | `scripts/asrx_processor_custom.py`, `scripts/voice_embedding_extractor.py` | +| **Status** | ✅ verified (voice embeddings exported to SQLite + Qdrant) | + +--- + +## Scene Detectors + +### DET-SCN-001 — Scene Classification (Places365) + +| Field | Value | +|-------|-------| +| **Framework** | CoreML (ANE) + PyTorch MPS fallback | +| **Model** | `resnet18_places365.mlpackage` | +| **Input** | Frame resized to 224×224 | +| **Output** | `[{scene_type, confidence, top_5}, ...]` — temporal segments | +| **Coordinate** | Temporal only, no spatial | +| **File** | `scripts/scene_classifier.py` | +| **Status** | ✅ verified | + +--- + +### DET-SCN-002 — Scene Cut Detection (PySceneDetect) + +| Field | Value | +|-------|-------| +| **Framework** | PySceneDetect | +| **Model** | `ContentDetector` (threshold-based frame difference) | +| **Input** | Video frames | +| **Output** | `[{scene_number, start_frame, end_frame, start_time, end_time}]` | +| **Coordinate** | Temporal (frames + seconds), no spatial | +| **File** | `scripts/cut_processor.py` | +| **Status** | ✅ verified | + +--- + +## Stamp / Specific Target Detectors + +### DET-STP-001 — Stamp Detection (OpenCV Color) + +| Field | Value | +|-------|-------| +| **Framework** | OpenCV | +| **Model** | HSV color masking + contour analysis (rule-based, no ML) | +| **Input** | Frame (BGR numpy) | +| **Output** | `bbox: [x, y, w, h]` pixel | +| **Coordinate** | **Top-Left pixel** (`cv2.boundingRect()` native) | +| **Transform** | Direct, no conversion | +| **File** | `scripts/scan_full_video_stamps.py`, `scripts/find_blue_stamp_opencv.py` | +| **Status** | ✅ top-left native | + +--- + +### 
DET-STP-002 — Pose Action Decoder (Coordinate-derived) + +| Field | Value | +|-------|-------| +| **Framework** | Rule-based from keypoints | +| **Model** | N/A (derived from DET-BODY-001/002/003 keypoints) | +| **Input** | Pose keypoints (pixel) | +| **Output** | Action labels: turn_left, turn_right, look_up, look_down, shake_head, nod_head, blink, smile, etc. | +| **Coordinate** | Derived angles/ratios, no raw spatial output | +| **File** | `scripts/utils/pose_action_decoder.py`, `scripts/utils/integrated_body_action_decoder.py` | +| **Status** | 🟡 assumed (actions derived from pose keypoints; dependent on upstream keypoint correctness) | +| **Warning** | Was affected by the DET-BODY-001 Y-flip bug (BUG-003/004, fixed 2026-05-13) — action labels derived from Vision pose output produced before the fix are unreliable | + +--- + +## Known Bugs Summary + +| Bug ID | Detector | Issue | Impact | Fixed | +|:------|----------|-------|--------|:-----:| +| BUG-001 | DET-FACE-001/002 | Index-based landmark↔face pairing | Wrong landmarks assigned to wrong faces | ✅ 2026-05-13 | +| BUG-002 | DET-FACE-002 | macOS bottom-left → missing Y-flip | Landmarks 731px offset from bbox | ✅ 2026-05-13 | +| BUG-003 | DET-BODY-001 | Missing Y-flip on keypoints | All 19 joint Y coordinates inverted | ✅ 2026-05-13 | +| BUG-004 | DET-BODY-001 | Derived bbox Y inverted | Bbox doesn't cover actual person | ✅ 2026-05-13 | +| BUG-005 | DET-TEXT-001 | Missing Y-flip on bbox | Text bbox Y inverted | ✅ 2026-05-13 | +| BUG-006 | DET-TEXT-001 | Hardcoded 640×360 in `recognizeText()` | Wrong bbox scale for non-640×360 images | ✅ 2026-05-13 | + +--- + +## Coordinate Convention Quick Reference + +### Apple Vision (all detectors) + +| Item | Convention | +|------|-----------| +| boundingBox origin | Bottom-Left | +| boundingBox units | normalized [0-1] | +| pointsInImage Y axis | Bottom-Left (macOS AppKit) | +| Required Y-flip formula | bbox: `y = (1 - y_norm - h_norm) * imgH` | +| | points: `y = imgH - raw_y` | + +### Non-Vision Detectors + +| Framework | Origin | Units | 
+|-----------|:------:|-------| +| YOLO (ultralytics) | Top-Left | pixel float | +| MediaPipe | Top-Left | normalized [0-1] | +| InsightFace bbox | Top-Left | pixel int | +| InsightFace landmarks | Top-Left | normalized [0-1] | +| HuggingFace (post_process) | Top-Left | pixel (after rescale) | +| OpenCV | Top-Left | pixel int | + +--- + +## 納管規則 + +1. **新增 detector**:必須在此 Registry 註冊,含座標系、轉換公式、檔案位置。 +2. **座標變更**:任何轉換公式修改,必須更新此文件並標註變更日期。 +3. **驗證要求**:每個有空間座標的 detector 必須通過至少一次 visual check(bbox/keypoints 疊加原圖)。 +4. **跨 detector 比對**:同一 frame 的不同 detector 輸出 bbox,IoU 應合理(非零且非 1.0)。 +5. **Vision detector 鐵律**:任何使用 Apple Vision Framework 的 detector,必須確認 Y-flip 已實作。 + +--- + +## 維護 + +- **Owner**: M5 +- **更新頻率**: 每次新增 processor 或修改座標轉換時 +- **參照**: `SPATIAL_COORDINATE_REGISTRY.md`(上層座標系統) diff --git a/docs_v1.0/REFERENCE/DETECTOR_SELECTION_SOP.md b/docs_v1.0/REFERENCE/DETECTOR_SELECTION_SOP.md new file mode 100644 index 0000000..01dbf92 --- /dev/null +++ b/docs_v1.0/REFERENCE/DETECTOR_SELECTION_SOP.md @@ -0,0 +1,238 @@ +# Momentry Core — Detector 選型標準作業程序 (SOP) + +**Date**: 2026-05-13 +**Version**: 1.0 +**Ref**: `DETECTOR_REGISTRY.md`, `SPATIAL_COORDINATE_REGISTRY.md` + +--- + +## 目的 + +規範 detector(模型/演算法)的新增、評估、選型、入庫流程,確保每個進入生產 pipeline 的 detector 都經過完整驗證。 + +--- + +## 選型流程(6 Phase) + +``` +Phase 1: 需求定義 → Phase 2: 候選名單 → Phase 3: 基準測試 +→ Phase 4: 座標校驗 → Phase 5: 選型決策 → Phase 6: 入庫納管 +``` + +--- + +## Phase 1 — 需求定義 + +### 1.1 輸出規格 + +| 項目 | 必填 | +|------|:--:| +| 輸出類型(bbox / landmarks / keypoints / embedding / label / text) | ✅ | +| 有無空間座標 | ✅ | +| 預期精度(如:IoU > 0.5 with ground truth) | ✅ | +| 預期速度(如:< 0.1s/frame on MPS) | ✅ | +| 預期 memory(如:< 1GB) | ✅ | +| 授權限制(MIT / Apache / GPL / commercial) | ✅ | + +### 1.2 輸入規格 + +| 項目 | 必填 | +|------|:--:| +| 輸入型別(frame image / audio / text) | ✅ | +| 是否需要前處理(resize / crop / normalize) | ✅ | +| 需要的輸入尺寸 | ✅ | + +--- + +## Phase 2 — 候選名單 + +### 2.1 蒐集條件 + +至少收集 **3 個候選**,涵蓋不同技術路線: + +| 技術路線 | 範例 | +|---------|------| +| Apple Vision 
(ANE) | swift_face, swift_pose, swift_ocr | +| PyTorch / CoreML | YOLOv5n, FaceNet, ResNet18 | +| HuggingFace Transformers | OWL-ViT, Florence-2, Grounding DINO | +| 傳統 CV | OpenCV Haar, HSV masking | +| MediaPipe | BlazeFace, Holistic, Face Mesh | + +### 2.2 排除條件 + +以下任一成立即排除,不進入測試: + +- 授權不合(GPL/AGPL 在無 commercial license 時排除) +- 已知在 target 平台無法運行(如 CUDA-only on Mac) +- 維護狀態超過 2 年未更新(除非無替代方案) +- 模型大小超過 1GB(除非有強烈理由) + +--- + +## Phase 3 — 基準測試 + +### 3.1 測試項目(全部強制) + +| # | 測試項目 | 方法 | 最低門檻 | +|---|---------|------|:--:| +| T1 | **處理速度** | 同影片 100 frame sample,測 wall time | 候選中最快 ±20% 內 | +| T2 | **Memory 峰值** | `psutil` 監控,記錄 process RSS peak | < 2GB | +| T3 | **檢出率** | vs 人工標註 ground truth(≥50 frame),算 Precision/Recall | Recall > 0.6 | +| T4 | **誤報率** | TP / (TP + FP),從同上 ground truth | Precision > 0.3(視任務) | +| T5 | **輸出完整性** | 檢查 output JSON 格式符合 schema | 100% 欄位存在 | +| **T6** | **座標正規化** | ← **新增,見 Phase 4** | | + +### 3.2 基準測試腳本規範 + +每組候選必須產出: + +``` +output/benchmark/{category}/ +├── BENCHMARK_REPORT.md # 人類可讀報告 +├── BENCHMARK_REPORT.json # 機器可讀結果 +└── {scheme}_{detector}.json # 各候選原始輸出 +``` + +使用現有 `*_benchmark_runner.py` 模板,或參考 `scripts/compare_*.py`。 + +--- + +## Phase 4 — 座標正規化校驗(T6)← 強制新增 + +### 4.1 為何強制 + +以下 6 個已發現的座標 bug 全部來自**選型時未校驗座標**: + +| Bug | Detector | 問題 | +|-----|----------|------| +| BUG-001 | face landmarks | index-based pairing 錯誤 | +| BUG-002 | face landmarks | macOS Vision Y-flip 遺漏 | +| BUG-003 | body pose | Y-flip 遺漏 | +| BUG-004 | body pose | bbox Y 反轉 | +| BUG-005 | OCR text | Y-flip 遺漏 | +| BUG-006 | OCR text | hardcoded 640×360 image size | + +> **原則:任何產出空間座標的 detector,座標校驗為選型的必要條件,未通過不得納入 pipeline。** + +### 4.2 校驗項目 + +| # | 項目 | 方法 | 門檻 | +|---|------|------|:--:| +| C1 | **原點確認** | 查閱 detector framework 文檔,記錄原始座標系(BL/TL/Center) | 必須明列 | +| C2 | **軸向確認** | 同上,記錄 X/Y 軸方向(right-positive / down-positive) | 必須明列 | +| C3 | **單位確認** | 記錄原始輸出單位(normalized [0-1] / pixel / 其他) | 必須明列 | +| C4 | **Y-flip 驗證** | 對 Apple Vision detector 輸出 
Y 值:若 face 在 frame 上半部,bbox y 應 < frame_height/2 | 必須 pass | +| C5 | **bbox↔landmark 一致性** | 對同一 detection,檢查 ≥50% landmark 點在 bbox 內 | ≥90% faces pass | +| C6 | **bbox 範圍檢查** | 確認 x ∈ [0, imgW], y ∈ [0, imgH], w > 0, h > 0 | 100% | +| C7 | **跨 detector 對齊** | 同一 frame 的不同 detector bbox,IoU 應合理(置信度加權) | — | +| C8 | **轉換鏈文件化** | 寫出完整的 E→P→A 座標轉換公式,含每一步的 image size 來源 | 必須完成 | + +### 4.3 校驗腳本 + +使用 `scripts/face_landmark_qc.py` 模式(可擴展到其他類別): + +```python +# 對每個 frame: +# 1. 讀取 detector 輸出 +# 2. 檢查 x ∈ [0, imgW], y ∈ [0, imgH] +# 3. 若有 landmarks: 檢查 ≥50% inside bbox +# 4. 輸出 pass/fail report +``` + +完成後在 `DETECTOR_REGISTRY.md` 中標記 `verified`。 + +--- + +## Phase 5 — 選型決策 + +### 5.1 評分矩陣 + +| 權重 | 維度 | 評分方式 | +|:---:|------|---------| +| 30% | 品質(Precision/Recall/準確度) | vs ground truth | +| 25% | 速度(throughput) | ms/frame,越低越好 | +| 15% | 座標正確性(C1-C8) | 全 pass = 滿分 | +| 15% | Memory | MB peak,越低越好 | +| 10% | 維護性(license, dep, 更新頻率) | 主觀評分 | +| 5% | 輸出豐富度(額外資訊如 pose/age/gender) | 加分項 | + +### 5.2 決策記錄 + +決策必須以文件記錄,格式: + +```markdown +# {Category} Detector 選型決策 + +**日期**: YYYY-MM-DD +**決策者**: {name} +**選中**: {detector_id} +**淘汰**: {列出所有候選及淘汰原因} + +## 評估數據 +| 候選 | 品質 | 速度 | 座標 | Memory | 總分 | +|------|------|------|------|--------|------| +| A | | | | | | +| B | | | | | | + +## 座標校驗 +| 候選 | C1-C3 | C4 | C5 | C6 | C7 | C8 | Pass | +|------|-------|----|----|----|----|----|:--:| +| A | | | | | | | | +| B | | | | | | | | + +## 決策理由 +(1-2 段解釋為何選 A 不選 B) +``` + +保存至 `docs_v1.0/decisions/{YYYY-MM-DD}_{category}_detector_selection.md`。 + +--- + +## Phase 6 — 入庫納管 + +### 6.1 Registry 更新 + +選定後必須更新: + +1. `DETECTOR_REGISTRY.md` — 新增 detector 條目(若未存在),狀態標 `verified` +2. `SPATIAL_COORDINATE_REGISTRY.md` — 更新 E 層 + P 層校準路徑 +3. 在 `src/worker/processor.rs` 或對應呼叫處,新增註解標註 detector ID + +### 6.2 Rollback 機制 + +若偵測到已部署 detector 有嚴重問題(如 BUG-003/004),執行: + +1. 立即標記 `buggy` 在 `DETECTOR_REGISTRY.md` +2. 修復後重新 build +3. 
更新 `SPATIAL_COORDINATE_REGISTRY.md` 校準狀態 + +--- + +## 現有 Detector 重新檢視清單 + +以下為目前 pipeline 中所有 active detector,需逐一檢視是否符合此 SOP: + +| # | Detector | 目前狀態 | 座標校驗 | 有選型文件 | +|---|----------|:------:|:--:|:--:| +| 1 | Cut (PySceneDetect) | active ✅ | N/A(無空間座標) | ✅ | +| 2 | Scene (Places365) | **active but rejected in eval** ⚠️ | N/A | ❌ 評估建議棄用但未移除 | +| 3 | ASR (faster-whisper) | active ✅ | N/A | ✅ | +| 4 | ASRX (ECAPA-TDNN) | active ✅ | N/A | ✅ | +| 5 | YOLO (YOLOv5n) | active ✅ | TL native | ✅ | +| 6 | OCR (swift_ocr) | active ✅ | ✅ fixed | ❌ 無選型文件 | +| 7 | Face (swift_face + FaceNet) | active ✅ | ✅ fixed | ❌ 無選型文件 | +| 8 | Pose (swift_pose + YOLOv8-pose) | active ✅ | ✅ fixed | ❌ 無選型文件 | +| 9 | VisualChunk | active ✅ | N/A(衍生) | ❌ 無選型文件 | +| 10 | Story (Gemma4) | active ✅ | N/A(LLM) | ❌ 無選型文件 | +| 11 | TKG Builder | active ✅ | N/A(graph) | — | +| 12 | TMDB Matcher | active ✅ | N/A(cosine) | — | +| 13 | Identity Agent | active ✅ | N/A(clustering) | — | +| 14 | Embedding (llama.cpp) | active ✅ | N/A(vector) | ✅ | + +--- + +## 維護 + +- **Owner**: M5 +- **更新頻率**: 每次新增 detector 時 +- **稽核**: 每季度檢視一次所有 active detector 是否仍符合品質標準 diff --git a/docs_v1.0/REFERENCE/IDENTITY_LIFECYCLE.md b/docs_v1.0/REFERENCE/IDENTITY_LIFECYCLE.md new file mode 100644 index 0000000..8da1f42 --- /dev/null +++ b/docs_v1.0/REFERENCE/IDENTITY_LIFECYCLE.md @@ -0,0 +1,161 @@ +# Identity 生命週期 — 轉移前 → 內容包 → 轉移後 + +**Date**: 2026-05-13 +**Ref**: `dev.identities` table, `file_uuid` column + +--- + +## 三階段架構 + +``` +轉移前(Source DB) 內容包(.tar.gz) 轉移後(Target DB) +──────────────────── ────────────────── ──────────────────── +dev.identities sql/dev_identities.sql dev.identities +├── PERSON_UUID_cluster → WHERE file_uuid = '{u}' → INSERT/COPY +├── Stranger_FILE_cluster → (同上) → (同上) +├── tmdb (global) → WHERE file_uuid IS NULL → UPDATE (merge) +│ AND source IN ('tmdb',..) 
+├── merged (global) → (同上) → (同上) +├── auto inactive → ❌ 不匯出 → (不存在) +├── Stranger_original → ❌ 已被改名 → (不存在舊名) +└── user_defined (global) → (同上) → (同上) +``` + +--- + +## 階段 1:轉移前(Source Database) + +### 資料分類 + +| Category | 筆數 | file_uuid | 來源 | 用途 | +|----------|:---:|:---------:|------|------| +| `PERSON_{UUID8}_{cluster}` | ~428/檔案 | 設定 | identity_bind.py | 自動聚類 identity,每個檔案獨立命名 | +| `Stranger_{UUID8}_{counter}` | ~25/檔案 | 設定 | experiment runner | 單筆 trace 臨時 identity | +| `tmdb` | ~15 (全局) | NULL | tmdb_identity_integration | 全局 TMDB 演員 identity | +| `auto` inactive | ~3051 (全局) | NULL | identity_bind.py (被取代) | 被 TMDB 覆蓋的舊 auto identity,不匯出 | +| `merged` | ~11 | NULL | match_identities_to_tmdb.py | 已與 TMDB 合併的 auto identity | +| `user_defined` | — | NULL | 使用者手動建立 | 保留 | + +### 衝突預防機制 + +``` +命名規則: + PERSON_{file_uuid[:8]}_{cluster_id} + Stranger_{file_uuid[:8]}_{counter} + +→ 不同檔案的 identity 不會撞名 +→ UNIQUE (name) constraint 安全 +``` + +--- + +## 階段 2:內容包內(Package) + +### 匯出查詢 + +```sql +COPY ( + SELECT * FROM dev.identities + WHERE file_uuid = '{uuid}' -- 此檔案的 identity + OR (file_uuid IS NULL AND source IN + ('tmdb', 'merged', 'user_defined')) -- 全局 global identity +) TO STDOUT WITH CSV HEADER +``` + +### 包內身份清單範例 + +| name | source | file_uuid | 屬於 | +|------|--------|-----------|------| +| PERSON_aeed7134_11 | auto | aeed7134... | ✅ 此檔案 | +| PERSON_aeed7134_18 | auto | aeed7134... | ✅ 此檔案 | +| Cary Grant | tmdb | NULL | 🌐 全局 | +| Audrey Hepburn | tmdb | NULL | 🌐 全局 | +| Paul Bonifas (merged) | merged | NULL | 🌐 全局 | +| Stranger_417a7e93_001 | auto_temp | 417a7e93... | ✅ 此檔案 | +| (PERSON_417a7e93_xxx) | auto | 417a7e93... 
| ❌ 不匯出(非此檔案) | + +--- + +## 階段 3:轉移後(Target Database) + +### 匯入流程 + +``` +接收包 → bash deploy.sh + │ + ├─ cd "$DIR" && psql -f data.sql + │ ├─ \i sql/dev_videos.sql (單筆,INSERT) + │ ├─ \i sql/dev_chunk.sql (批次,COPY) + │ ├─ \i sql/dev_face_detections.sql (批次,COPY) + │ ├─ \i sql/dev_identities.sql → HERE + │ ├─ \i sql/dev_identity_bindings.sql + │ ├─ \i sql/dev_tkg_nodes.sql + │ └─ \i sql/dev_tkg_edges.sql +``` + +### COPY 面臨的問題 + +`COPY` 指令沒有 `ON CONFLICT` 機制。若目標 DB 已有同名 identity,COPY 會因 `UNIQUE(name)` 而失敗。 + +| 情境 | 風險 | 處理方式 | +|------|:--:|---------| +| target 無此檔案 → 新 deploy | ✅ 正常 | COPY 順利 | +| target 已有此檔案 → 重新 deploy | ⚠️ `PERSON_xxx` 已存在 | COPY 失敗 | +| target 已有其他檔案 deploy 過 | ⚠️ TMDB identity(如 Cary Grant)已存在 | COPY 失敗 | +| 兩個包同時含有相同 TMDB 演員 | ⚠️ 同名 global identity | COPY 失敗 | + +### 解法 + +`deploy.sh` 的資料匯入需要使用 `psql` 的 ON CONFLICT 處理,而非直接 COPY。 + +**方案 A:COPY 前先 DELETE 同名 identity** + +```sql +DELETE FROM dev.identities WHERE file_uuid = '{uuid}'; +COPY dev.identities FROM STDIN WITH CSV HEADER; +``` + +但這個方案無法解決 global identity 的衝突:TMDB identity 的 file_uuid IS NULL,`WHERE file_uuid = '{uuid}'` 不會刪到 global identity,因此目標 DB 已存在的同名 TMDB identity 仍會讓 COPY 因 `UNIQUE(name)` 而失敗。 + +**方案 B:使用 `\COPY` + 暫存表** + +```sql +CREATE TEMP TABLE tmp_identities (LIKE dev.identities); +\copy tmp_identities FROM 'identities.csv' WITH CSV HEADER; + +INSERT INTO dev.identities AS t +SELECT * FROM tmp_identities i +ON CONFLICT (name) DO UPDATE + SET file_uuid = COALESCE(t.file_uuid, EXCLUDED.file_uuid), + source = EXCLUDED.source, + face_embedding = COALESCE(EXCLUDED.face_embedding, t.face_embedding), + tmdb_id = COALESCE(EXCLUDED.tmdb_id, t.tmdb_id); +``` + +**方案 C:deploy.sh 中包一層 ON CONFLICT 邏輯** + +```bash +# 對 identity_bindings 等小 table 直接用 COPY(已有 ON CONFLICT 容忍) +# 對 identities 用 ON CONFLICT 的 INSERT +for f in "$DIR"/sql/dev_identities.sql; do + "$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" <<-EOSQL + BEGIN; + -- Use temporary table for ON CONFLICT handling + DELETE FROM dev.identities WHERE file_uuid = '$UUID'; + \i $f + COMMIT; +EOSQL 
+done +``` + +--- + +## 結論 + +| 層面 | 現狀 | 風險 | 建議 | +|------|------|:--:|------| +| 命名衝突 | `PERSON_UUID_cluster` 防撞 | ✅ 安全 | — | +| TMDB 重複匯入 | COPY 無 ON CONFLICT | ❌ 會失敗 | 方案 C:DELETE WHERE file_uuid='{UUID}' 再 COPY | +| 跨檔案合併 | global identity 透過 `file_uuid IS NULL` 區分 | ⚠️ 需確認 | 目前 TMDB identity 已無 file_uuid | +| 舊環境覆蓋 | 重新 deploy 會撞名 | ❌ 會失敗 | 同上方案 C | +| Stranger 命名修正 | 已補 `{UUID8}` | ✅ 安全 | — | diff --git a/docs_v1.0/REFERENCE/SPATIAL_COORDINATE_REGISTRY.md b/docs_v1.0/REFERENCE/SPATIAL_COORDINATE_REGISTRY.md new file mode 100644 index 0000000..eb6c222 --- /dev/null +++ b/docs_v1.0/REFERENCE/SPATIAL_COORDINATE_REGISTRY.md @@ -0,0 +1,267 @@ +# Momentry Core — Spatial Coordinate System Registry + +**Date**: 2026-05-13 +**Version**: 1.0 +**Purpose**: 所有空間定位的輸入輸出統一校準、文檔化、納管 + +--- + +## 原則 + +1. **原點一律 Top-Left**(圖像習慣)。需要 Bottom-Left 或 Center 的系統在轉換層處理,不得洩漏到儲存層。 +2. **單位一律 Pixels (INTEGER)**。歸一化座標 [0,1] 只存在原始檢測輸出層,不得入庫。 +3. **深度一律無因次比例**(bbox area / frame area)。不做 true depth,除非有 depth sensor。 + +--- + +## 架構總覽(六層) + +``` +E 檢測層 (Detection) → Apple Vision / YOLO / MediaPipe / HuggingFace +P 處理層 (Processing) → Python scripts: parse, QC, normalize, dedup +A1 暫存層 (Staging) → pre_chunks, monitor_jobs, processor_results +A2 正規層 (Canonical) → videos, chunk, face_detections, identities, TKG +B API 層 (Query) → axum endpoints → JSON +C/V 渲染層 (Render / Viz) → ffmpeg (server) / canvas/SVG/WebGL (client) +``` + +| 層 | 座標格式 | 可逆 | 註 | +|----|---------|:--:|----| +| E | 異質 (BL norm / TL norm / TL pixel) | — | 每個 detector 自訂 | +| P | 強制 TL pixel integer | E→P 不可逆 | 清洗層,所有座標在此正規化 | +| A1 | TL pixel / temporal | — | API 不可讀,export 不含 | +| A2 | TL pixel integer / temporal | 不可逆 | canonical storage | +| B | TL pixel integer → JSON number | 唯讀 | 只讀 A2 | +| C/V | 目標繪圖座標系(TL pixel / GL [-1,+1]) | 自由轉換 | 從 B 輸入 | + +--- + +## 座標系統清單(18 個) + +### A. 
儲存層(入庫,不可改) + +| # | 系統 | 欄位 | 原點 | 單位 | 定義位置 | +|---|------|------|:--:|:--:|------| +| A1 | `face_detections` | `x, y, width, height` | Top-Left | pixels (INTEGER) | migration 006 | +| A2 | `face_detections` | `frame_number` | 0-based | frames | migration 006 | +| A3 | `face_detections` | `confidence` | — | REAL [0,1] | migration 006 | +| A4 | `face_detections` | `timestamp_secs` | 0.0 | seconds | migration 006 | +| A5 | YOLO pre_chunks | `x1, y1, x2, y2` | Top-Left | pixels (f32) | yolo.rs:57 | +| A6 | Pose pre_chunks | keypoint `x, y` | Top-Left | pixels (f32) | pose.rs:29 | +| A7 | Pose pre_chunks | bbox `x, y, w, h` | Top-Left | pixels (i32) | pose.rs:37 | +| A8 | Chunk (types.rs) | `BoundingBox.x, y, w, h` | Top-Left | pixels (i32) | chunk/types.rs:48 | + +### B. API 層(輸出,不改輸入) + +| # | 系統 | 欄位 | 原點 | 單位 | 定義位置 | +|---|------|------|:--:|:--:|------| +| B1 | TraceFaceItem | `x, y, width, height` | Top-Left | pixels (i32) | trace_agent_api.rs:180 | +| B2 | TraceFaceItem | `z_rel` | — | 無因次 [0,~1] | trace_agent_api.rs:188 | +| B3 | z_rel formula | `(w*h) / (video_w * video_h)` | — | 無因次 | trace_agent_api.rs:331 | + +### C. 渲染層(輸出) + +| # | 系統 | 原點 | 單位 | 定義位置 | +|---|------|:--:|:--:|------| +| C1 | ffmpeg `drawbox` | Top-Left | pixels | media_api.rs:175 | +| C2 | ffmpeg `crop` | Top-Left | pixels | media_api.rs:539 | +| C3 | ffmpeg `render_text` | Top-Left (offset x+6, y+6) | pixels | media_api.rs:81 | + +### D. 視覺化層(靜態轉換) + +| # | 系統 | 原點 | 轉換 | 定義位置 | +|---|------|:--:|------|------| +| D1 | SpaceTimeCube 4D | Cube Center [-1,+1] | `x_3d = (x/fw)*2-1` | SpaceTimeCube.vue:149 | +| D2 | SpaceTimeCube Y flip | Top→Bottom flip | `y_3d = -((y/fh)*2-1)` | SpaceTimeCube.vue:150 | +| D3 | SpaceTimeCube Z | — | `z_3d = z_rel*2-1` | SpaceTimeCube.vue:151 | +| D4 | Face3DViewer | Center [-1,+1] | `x_3d = (x-0.5)*2` | Face3DViewer.vue:73 | +| D5 | Face3DViewer Y flip | Top→Bottom flip | `y_3d = -(y-0.5)*2` | Face3DViewer.vue:74 | + +### E. 
檢測層(原始輸出,不入庫) + +> 細粒度規格見 `DETECTOR_REGISTRY.md`(25 個 detector,含座標系與轉換公式) + +| # | 系統 | 原點 | 單位 | Detector ID | +|---|------|:--:|:--:|------| +| E1 | Apple Vision bbox (face + text) | **Bottom-Left** | norm. [0,1] | DET-FACE-001, DET-TEXT-001 | +| E2 | Vision → Top-Left 轉換(face bbox) | — | `faceY = (1-y-h)*height` | DET-FACE-001:134-136 ✅ | +| E3 | Vision → Top-Left 轉換(text bbox) | — | ✅ `y = (1-y-h)*cgH` | DET-TEXT-001:129 | +| E4 | Apple Vision landmarks | **Bottom-Left** (pointsInImage) | pixel (AppKit) | DET-FACE-002 | +| E5 | Vision landmarks → Top-Left 轉換 | — | `y = imgH - rawY` | DET-FACE-002:165 ✅ | +| E6 | Apple Vision body pose keypoints | **Bottom-Left** | norm. [0,1] | DET-BODY-001 | +| E7 | Vision pose → pixel 轉換 | — | ✅ `y = h - location.y * h` | DET-BODY-001:157 | +| E8 | YOLO bbox (xyxy) | Top-Left | pixel (float) | DET-OBJ-001 | +| E9 | YOLO pose keypoints | Top-Left | pixel (float) | DET-BODY-002 | +| E10 | MediaPipe landmarks | Top-Left | norm. [0,1] | DET-FACE-007, DET-FACE-008, DET-BODY-003 | +| E11 | InsightFace bbox + landmarks | Top-Left (bbox), norm (lm) | pixel + norm | DET-FACE-005 | +| E12 | OWL-ViT / Grounding DINO / Florence-2 | Top-Left (post_process) | pixel | DET-OBJ-003, DET-OBJ-004, DET-TEXT-002 | + +### P. 
處理層(清洗 + 正規化,新建) + +> 所有 E 層輸出必須經過此層才能入 A2。座標在此強制轉為 TL pixel integer。 + +| # | 系統 | 輸入來自 | 轉換 | QC 項目 | 定義位置 | +|---|------|---------|------|---------|------| +| P1 | face_processor | DET-FACE-001/002 | merge bbox+landmarks, Y-flip, frame→time | landmark-in-bbox check | `face_processor.py` | +| P2 | yolo_processor | DET-OBJ-001 | float xyxy → int bbox | None | `yolo.rs:83` ⚠️ int truncation | +| P3 | pose_processor | DET-BODY-001/002 | keypoint→bbox derivation | keypoint conf > 0.1 | `pose_processor.py:94` | +| P4 | identity_bind | DET-FACE-001/003 | face trace→cluster→identity | dedup, confidence | `identity_bind.py` | +| P5 | ocr_processor | DET-TEXT-001 | text bbox Y-flip ✅(於 swift_ocr 完成,BUG-005 fixed 2026-05-13) | None | `ocr.rs` | + +> ⚠️ P2: `yolo.rs:83` uses `detection.x1 as i32` (float truncation). Consider `.round() as i32` to match Rust convention (M4 B1). + +--- + +## 校準一致性檢查 + +| 路徑 | 從 | 到 | 原點一致 | 單位一致 | 狀態 | +|------|----|----|:--:|:--:|:--:| +| E1 → P1 → A1 | Vision face bbox (BL,norm) | face_detections (TL,px) | ✅ 轉換 | ✅ 轉換 | ✅ | +| E4 → P1 → A1 | Vision face landmarks (BL,px) | face_detections (TL,px) | ✅ Y-flip | ✅ px | ✅ fixed 2026-05-13 | +| E6 → P3 → A7 | Vision pose (BL,norm) | pose pre_chunks (TL,px) | ✅ Y-flip fixed | ✅ px | ✅ | +| E1 → P5 → (未編號) | Vision text bbox (BL,norm) | ocr pre_chunks (TL,px) | ✅ Y-flip fixed | ✅ px | ✅ | +| E8 → P2 → A5 | YOLO bbox (TL,float) | YOLO pre_chunks (TL,px) | ✅ | ✅ (float→int) | ✅ | +| A1/A2 → B | face_detections (TL,px) | TraceFaceItem (TL,px) | ✅ | ✅ | ✅ | +| B → C | TraceFaceItem (TL,px) | ffmpeg drawbox (TL,px) | ✅ | ✅ | ✅ | +| A2 → B | face_detections (TL,px) | heatmap SVG (TL,px) | ✅ | ✅ | ✅ | +| B → D | TraceFaceItem (TL,px) | SpaceTimeCube (Center,norm) | ✅ Y flip | ✅ normalize | ✅ | + +--- + +## 已發現問題 + +| # | 問題 | 狀態 | 對策 | +|---|------|:--:|------| +| BUG-001 | Face landmark 座標與 bbox 不匹配(index pairing) | ✅ Fixed | 改用 landmark obs 為主 + IoU dedup | +| BUG-002 | Face landmark macOS bottom-left → 缺少 Y-flip | ✅ Fixed | `y = imgH - raw_y` | 
+| BUG-003 | **swift_pose** 關節點 Y 軸未翻轉 | ✅ Fixed 2026-05-13 | `y = h - location.y * h` | +| BUG-004 | **swift_pose** derived bbox Y 未翻轉 | ✅ Fixed 2026-05-13 | 同 BUG-003 | +| BUG-005 | **swift_ocr** 文字 bbox Y 軸未翻轉 | ✅ Fixed 2026-05-13 | `y = (1-y-h) * cgH` | +| BUG-006 | **swift_ocr** `recognizeText()` hardcoded 640×360 | ✅ Fixed 2026-05-13 | 改用 CVPixelBuffer 實際尺寸 | +| BUG-007 | Python `int()` vs Rust `.round()` 不一致 | ❌ Open | M4 B1: Python scripts 改用 `round()` | +| BUG-008 | 15+ Python scripts hardcoded FPS | ❌ Open | M4 B2: 從 probe 讀取 | +| BUG-009 | YOLO float→int truncation (`as i32`) | ⚠️ Low | 影響 <1px,可暫緩 | +| BUG-010 | OWL-ViT bbox 格式需驗證 | ⚠️ Verify | 視覺疊加檢查 | +| — | z_rel 為面積比例,非真實深度 | ⚠️ By Design | — | + +--- + +## 時間座標系統(Temporal) + +### 基本單位定義 + +| 單位 | 說明 | 資料型別 | 用途 | +|------|------|:--:|------| +| **Frame** | 0-based frame index | `i64` / `INTEGER` | 權威時間單位(fps 不變則穩定) | +| **Second** | 從影片起點計算的秒數 | `f64` / `DOUBLE PRECISION` | 衍生單位,frame/fps 計算得出 | +| **Timestamp (wall clock)** | Unix epoch / ISO 8601 | `TIMESTAMP` | 系統操作時間,與影片時間**無關** | +| **fps** | Frames Per Second | `f64` | Frame ↔ Second 轉換係數 | + +### 權威單位原則 + +Frame 是**唯一權威時間單位**。Second 始終是 `frame / fps` 的衍生值,精度取決於 fps。 + +``` +src/api/search.rs:22-23: +"start_frame / end_frame = authoritative unit" +"start_time / end_time = derived from frames, subject to FPS variation, not precise" +``` + +### 轉換公式標準 + +| 轉換 | 公式 | 標準方法 | 位置 | +|------|------|---------|------| +| **frame → second** | `frame / fps` | 除法 | `FrameTime::seconds()` | +| **second → frame (Rust)** | `(second * fps).round() as i64` | `.round()` | `FrameTime::from_seconds()` | +| **second → frame (Python)** | ~~`int(second * fps)`~~ | **應改用** `round(second * fps)` | ❌ 不一致 | +| **second → frame (TypeScript)** | `Math.floor(second * fps)` | 截斷 | `client.ts:273` | +| **duration (seconds)** | `(end_frame - start_frame) / fps` | 除法 | `Chunk::duration_seconds()` | +| **duration (frames)** | `end_frame - start_frame` | 減法 | 
`Chunk::duration_frames()` | + +### 時間欄位清單(16 個系統) + +#### 儲存層 + +| # | 系統 | 欄位 | 單位 | 型別 | 定義 | +|---|------|------|:--:|------|------| +| T1 | `videos` | `duration` | seconds | `DOUBLE PRECISION` | ffprobe `format.duration` | +| T2 | `videos` | `fps` | frames/sec | `DOUBLE PRECISION` | ffprobe `r_frame_rate` | +| T3 | `videos` | `total_frames` | frames | `BIGINT` | ffprobe `nb_frames` | +| T4 | `chunk` | `start_frame` / `end_frame` | frames | `BIGINT` | 權威時間 | +| T5 | `chunk` | `start_time` / `end_time` | seconds | `DOUBLE PRECISION` | 衍生值 = frame/fps | +| T6 | `chunk` | `fps` | frames/sec | `DOUBLE PRECISION` | 從 probe 複製 | +| T7 | `face_detections` | `frame_number` | frames | `BIGINT` | 0-based | +| T8 | `face_detections` | `timestamp_secs` | seconds | `DOUBLE PRECISION` | ⚠️ 冗餘,與 frame_number 可能不一致 | +| T9 | `pre_chunks` | `coordinate_index` | frames/time/page | `BIGINT` | 依 `coordinate_type` 決定 | + +#### API 層 + +| # | 系統 | 欄位 | 單位 | 精度 | 定義 | +|---|------|------|:--:|------|------| +| T10 | `TraceFaceItem` | `start_time` | seconds | **0.1s** (round) | `(frame/fps*10).round()/10` | +| T11 | `TraceData` | `first_sec` / `last_sec` | seconds | **0.1s** (SQL ROUND) | `MIN(frame)/fps` / `MAX(frame)/fps` | +| T12 | `SearchResult` | `start_time` / `end_time` | seconds | float64 | 衍生值 | + +#### 渲染層 (ffmpeg) + +| # | 系統 | 參數 | 單位 | 來源 | +|---|------|------|:--:|------| +| T13 | `trace_video` | `-ss` (seek) | seconds | `first_frame/fps - padding` | +| T14 | `trace_video` | `-t` (duration) | seconds | `(last_frame-first_frame)/fps + 2*padding` | +| T15 | `stream_video` | `-ss` (seek) | seconds | query param `?start=` | +| T16 | `stream_video` | `-t` (duration) | seconds | `end - start` | + +### 已發現問題 + +| # | 問題 | 影響 | 位置 | +|---|------|------|------| +| B1 | **Python `int()` vs Rust `.round()` 不一致** | Frame 差 ±1。1.999s@30fps: Rust=60f, Python=59f | Python scripts vs `FrameTime` | +| B2 | **15+ Python scripts hardcoded FPS** | `25.0`, `24.0`, `30.0` 硬寫,非 probe | 
`story_pipeline_full.py` 等 | +| B3 | **`register_single_file` total_frames 不一致** | Line 763: `as u64` (truncate) vs line 1136: `.floor()` | `server.rs` | +| B4 | **`timestamp_secs` 冗餘欄位** | 與 `frame_number` 可能不同步 | `face_detections` | +| B5 | **`face_detections.format_sec_frame()` 用 `.ceil()`** | 29.97fps 會被當 30fps 處理 | `time.rs:99` | +| B6 | **TypeScript 預設 fps=30** | 非 30fps 影片 frame 計算錯誤 | `client.ts:273` | +| B7 | **`start_time` 只到 0.1s 精度** | 高精度 seek 會失真 | `trace_agent_api.rs:311,335` | +| B8 | **`total_frames` 型別不一致** | `Option` / `u64` / `i64` 三種 | DB structs | + +### 時間校準規則 + +1. **Frame 為權威單位**。Second 一律從 frame 衍生,不可反過來。 +2. **FPS 來源單一**:從 probe 取得,存入 `videos.fps`。所有轉換引用此值。 +3. **Python int() → round()**:全部 Python scripts 改用 `round(seconds * fps)`,與 Rust 一致。 +4. **禁止 hardcoded FPS**:從 `{uuid}.probe.json` 或 DB 讀取。 +5. **timestamp_secs 標記 deprecated**:改用 `frame_number / videos.fps`。 +6. **wall clock vs video time**:`created_at`/`updated_at` 是系統時間,與影片時間無關,不可混用。 + +--- + +## 納管規則 + +### 空間 + +1. **新欄位**:任何新增的空間欄位必須在此文件註冊。 +2. **原點**:預設 Top-Left。若不同,必須寫明轉換公式並在此登記。 +3. **單位**:預設 integer pixels。歸一化、秒、frame index 必須標示。 +4. **z 軸**:無因次比例 (bbox area/frame area)。不使用 true depth。 +5. **3D 轉換**:Y 軸翻轉 (Top-Left → Bottom-Left graphics) 必須在視覺化層完成,不得回傳。 +6. **Landmark**:需與 bbox 同座標系(Top-Left, pixels),不可獨立歸一化。 +7. **處理層 (P) 強制轉換點**:任何檢測層 (E) 座標必須經 P 層正規化後才能入 A2。座標離開 P 層前必須是 TL pixel integer。 +8. **暫存層 (A1) 隔離**:A1 (`pre_chunks`, `monitor_jobs`, `processor_results`) 不開放 API 讀取,不納入 export。 +9. **API 層 (B) 註冊**:每個回傳空間座標的 API endpoint 必須在此 Registry 註冊其輸出格式。 +10. **新增 detector**:必須同步在 `DETECTOR_REGISTRY.md` 註冊座標系與轉換公式。 +11. **Vision detector 鐵律**:任何使用 Apple Vision Framework 的 detector,必須確認 Y-flip 已實作(`y = imgH - raw_y`)。 + +### 時間 + +7. **Frame 為權威單位**:second 從 frame 衍生,不可反過來。 +8. **FPS 來源單一**:從 probe 取得,存入 `videos.fps`。禁用 hardcoded FPS。 +9. **Python second→frame 改用 `round()`**:與 Rust `.round()` 一致,消除 ±1 frame 誤差。 +10. 
**wall clock ≠ video time**:`created_at`/`updated_at` 是系統時間,不可當影片時間使用。 + +--- + +## 維護 + +- **Owner**: M4 (持續驗證) +- **Source**: M5 (定義檢測層輸出格式) +- **更新頻率**: 每次 schema 變更或新增 processor 時 diff --git a/experiments/identity_clustering/runner_v2.py b/experiments/identity_clustering/runner_v2.py index f20e9d2..ce3a5e6 100644 --- a/experiments/identity_clustering/runner_v2.py +++ b/experiments/identity_clustering/runner_v2.py @@ -419,9 +419,9 @@ def run_experiment(config: dict) -> dict: if len(tids) >= 1: temp_count += 1 if len(tids) >= 2: - temp_name = f"Person_{temp_count:03d}" + temp_name = f"Person_{file_uuid[:8]}_{temp_count:03d}" else: - temp_name = f"Stranger_{temp_count:03d}" + temp_name = f"Stranger_{file_uuid[:8]}_{temp_count:03d}" label["binding"] = { "name": temp_name, "source": "auto_temp", diff --git a/scripts/deploy_package.sh b/scripts/deploy_package.sh index 7120140..f57d1b7 100644 --- a/scripts/deploy_package.sh +++ b/scripts/deploy_package.sh @@ -15,10 +15,38 @@ echo "=== Momentry Package Deploy ===" echo "UUID: $UUID" echo "Time: $(date '+%Y-%m-%d %H:%M:%S')" echo "" +echo "=== Momentry Package Deploy ===" +echo "UUID: $UUID" +echo "Time: $(date '+%Y-%m-%d %H:%M:%S')" +echo "" + +# 0. Version & build compatibility check +echo "[0/8] Checking system version and build..." 
+PKG_VER=$(python3 -c "import json; f=json.load(open('$DIR/file_info.json')); print(f.get('momentry_version','?'))") +PKG_BUILD=$(python3 -c "import json; f=json.load(open('$DIR/file_info.json')); print(f.get('momentry_build','?'))") +SRV=$(curl -sf http://localhost:3003/health | python3 -c " +import json,sys +d=json.load(sys.stdin) +print(d.get('version','unknown'), d.get('build_git_hash','unknown')) +" 2>/dev/null || echo "down down") +SRV_VER=$(echo "$SRV" | cut -d' ' -f1) +SRV_BUILD=$(echo "$SRV" | cut -d' ' -f2) +if [ "$SRV_VER" = "down" ]; then + echo " ⚠️ Cannot reach server at localhost:3003, skipping version check" +elif [ "$SRV_VER" != "$PKG_VER" ] || [ "$SRV_BUILD" != "$PKG_BUILD" ]; then + echo " ❌ Mismatch:" + echo " Package Server" + echo " Version: $PKG_VER $SRV_VER" + echo " Build: $PKG_BUILD $SRV_BUILD" + echo "" + echo " Please obtain the matching system upgrade package." + exit 1 +else + echo " ✅ Server v$SRV_VER (build $SRV_BUILD) matches package" +fi # 1. Verify package integrity -echo "[1/5] Verifying package..." -REQUIRED_FILES=("data.sql" "file_info.json") +echo "[1/8] Verifying package..." MISSING=0 for f in "${REQUIRED_FILES[@]}"; do if [ ! -f "$DIR/$f" ]; then @@ -32,28 +60,38 @@ if [ $MISSING -eq 1 ]; then fi echo " ✅ Package verified" -# 2. Import data.sql -echo "[2/5] Importing DB data..." -"$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -f "$DIR/data.sql" 2>&1 | tail -3 +# 2. Pre-clean: remove existing identities for this file (avoids UNIQUE(name) conflicts on COPY) +echo "[2/8] Pre-cleaning existing identities for this file..." +"$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -c "DELETE FROM dev.identities WHERE file_uuid = '$UUID'" > /dev/null 2>&1 +echo " ✅ Cleared identities for $UUID" + +# 3. Import data.sql (uses \i to load per-table files from sql/) +echo "[3/8] Importing DB data..." +(cd "$DIR" && "$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -f data.sql 2>&1) | tail -5 echo " ✅ Data imported" -# 3. Copy video to demo dir +# 4. 
Copy video to demo dir (only this package's video, not scanning others) VIDEO_FILE=$(ls "$DIR"/*.mp4 "$DIR"/*.mov "$DIR"/*.avi "$DIR"/*.mkv 2>/dev/null | head -1) if [ -n "$VIDEO_FILE" ]; then VIDEO_NAME=$(basename "$VIDEO_FILE") DEST="$DEMO_DIR/$VIDEO_NAME" if [ ! -f "$DEST" ]; then cp "$VIDEO_FILE" "$DEST" - echo "[3/5] Video copied: $VIDEO_NAME → $DEMO_DIR" + echo "[4/8] Video copied: $VIDEO_NAME → $DEMO_DIR" else - echo "[3/5] Video already in demo dir, skipping" + echo "[4/8] Video already in demo dir, skipping" fi else - echo "[3/5] No video file in package, skipping" + echo "[4/8] No video file in package, skipping" fi -# 4. Copy output files -echo "[4/5] Copying output files..." +# 5. Set video status to completed (package is fully processed) +echo "[5/8] Setting deployment status..." +"$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -c "UPDATE dev.videos SET status = 'completed' WHERE file_uuid = '$UUID'" > /dev/null 2>&1 +echo " ✅ Status set to 'completed'" + +# 6. Copy output files +echo "[6/8] Copying output files..." COPIED=0 for f in "$DIR"/*.json "$DIR"/*.sqlite "$DIR"/*.sqlite; do if [ -f "$f" ]; then @@ -66,20 +104,25 @@ for f in "$DIR"/*.json "$DIR"/*.sqlite "$DIR"/*.sqlite; do done echo " ✅ $COPIED files copied to $OUTPUT_DIR" -# 5. Verify deployment -echo "[5/5] Verifying deployment..." -CHUNKS=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.chunk WHERE file_uuid='$UUID' AND chunk_type='sentence'" 2>/dev/null || echo "?") +# 7. Verify deployment +echo "[7/8] Verifying deployment..." 
+CHUNKS=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.chunk WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") FACES=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.face_detections WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") +IDENTS=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.identities WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") +TKG_NODES=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.tkg_nodes WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") +TKG_EDGES=$("$PG_BIN/psql" -U "$DB_USER" -d "$DB_NAME" -t -A -c "SELECT COUNT(*) FROM dev.tkg_edges WHERE file_uuid='$UUID'" 2>/dev/null || echo "?") echo "" echo "=== Deploy Complete ===" -echo " UUID: $UUID" -echo " Chunks: $CHUNKS" -echo " Faces: $FACES" -echo " Output: $OUTPUT_DIR/" +echo " UUID: $UUID" +echo " Chunks: $CHUNKS" +echo " Faces: $FACES" +echo " Identities: $IDENTS" +echo " TKG nodes: $TKG_NODES" +echo " TKG edges: $TKG_EDGES" +echo " Output: $OUTPUT_DIR/" echo "" -echo "Next: trigger pipeline processing" -echo " curl -X POST http://localhost:3003/api/v1/file/$UUID/process" +echo "Package is self-contained — no further processing needed." 
echo "" -echo "Or open the offline report:" -echo " python3 render_offline_report.py $OUTPUT_DIR/$UUID.sqlite" +echo "Offline report:" +echo " python3 scripts/render_offline_report.py $OUTPUT_DIR/$UUID.sqlite" diff --git a/scripts/export_file_package.py b/scripts/export_file_package.py index 94f3e0f..a7f2bec 100644 --- a/scripts/export_file_package.py +++ b/scripts/export_file_package.py @@ -13,6 +13,8 @@ TABLES = [ ("dev.chunk", "file_uuid"), ("dev.chunk_vectors", "uuid"), ("dev.face_detections", "file_uuid"), + ("dev.tkg_nodes", "file_uuid"), + ("dev.tkg_edges", "file_uuid"), ] def main(): @@ -47,8 +49,9 @@ def main(): f.write("\n") f.write("\\.\n\n") - # Export identities referenced by this file's face_detections - f.write(f"-- dev.identities (referenced by face_detections WHERE file_uuid='{uuid}')\n") + # Export identities for this file (by file_uuid column) plus global identities + # Global: tmdb + merged + user_defined (exclude inactive auto) + f.write(f"-- dev.identities (WHERE file_uuid='{uuid}' OR global tmdb/merged/user_defined)\n") r = subprocess.run( [f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A", "-c", "SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='dev' AND table_name='identities' AND is_updatable='YES'"], @@ -56,7 +59,7 @@ def main(): cols = r.stdout.strip() r = subprocess.run( [f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-c", - f"COPY (SELECT DISTINCT i.* FROM dev.identities i INNER JOIN dev.face_detections fd ON fd.identity_id = i.id WHERE fd.file_uuid = '{uuid}') TO STDOUT WITH CSV HEADER"], + f"COPY (SELECT * FROM dev.identities WHERE file_uuid = '{uuid}' OR (file_uuid IS NULL AND source IN ('tmdb', 'merged', 'user_defined'))) TO STDOUT WITH CSV HEADER"], capture_output=True, text=True, timeout=60) if r.stdout.strip(): f.write(f"COPY dev.identities ({cols}) FROM STDIN WITH CSV HEADER;\n") @@ -74,7 +77,7 @@ def main(): cols = r.stdout.strip() r = 
subprocess.run( [f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-c", - f"COPY (SELECT DISTINCT ib.* FROM dev.identity_bindings ib INNER JOIN dev.face_detections fd ON fd.identity_id = ib.identity_id WHERE fd.file_uuid = '{uuid}') TO STDOUT WITH CSV HEADER"], + f"COPY (SELECT DISTINCT ib.* FROM dev.identity_bindings ib INNER JOIN dev.face_detections fd ON fd.identity_id = ib.identity_id AND fd.trace_id IS NOT NULL WHERE fd.file_uuid = '{uuid}' AND ib.identity_value IN (SELECT DISTINCT trace_id::text FROM dev.face_detections WHERE file_uuid = '{uuid}' AND trace_id IS NOT NULL)) TO STDOUT WITH CSV HEADER"], capture_output=True, text=True, timeout=60) if r.stdout.strip(): f.write(f"COPY dev.identity_bindings ({cols}) FROM STDIN WITH CSV HEADER;\n") @@ -111,6 +114,9 @@ def main(): capture_output=True, text=True, timeout=15) if r.stdout.strip(): info = json.loads(r.stdout.strip()) + info["momentry_version"] = "1.0.0" # keep in sync with Cargo.toml version + info["momentry_build"] = subprocess.run(["git", "rev-parse", "--short", "HEAD"], + capture_output=True, text=True, timeout=5).stdout.strip() with open(os.path.join(outdir, "file_info.json"), "w") as f: json.dump(info, f, indent=2) print(f" file_info.json") diff --git a/scripts/export_sqlite.py b/scripts/export_sqlite.py index bc7402d..84137c8 100644 --- a/scripts/export_sqlite.py +++ b/scripts/export_sqlite.py @@ -87,7 +87,7 @@ pg_to_sqlite( # chunk pg_to_sqlite( - "SELECT file_uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, metadata->>'speaker_id' as speaker_id FROM dev.chunk WHERE file_uuid=%s AND chunk_type='sentence' ORDER BY chunk_id", + "SELECT file_uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, metadata->>'speaker_id' as speaker_id FROM dev.chunk WHERE file_uuid=%s ORDER BY chunk_id", "chunk", """CREATE TABLE IF NOT EXISTS chunk ( file_uuid TEXT, chunk_id TEXT, chunk_type TEXT, diff --git a/scripts/identity_bind.py
b/scripts/identity_bind.py index 8a01896..714a9c5 100644 --- a/scripts/identity_bind.py +++ b/scripts/identity_bind.py @@ -77,11 +77,11 @@ for cluster_id in sorted(set(labels)): # Create new identity identity_uuid = None cur.execute(""" - INSERT INTO dev.identities (name, identity_type, source, status, created_at) - VALUES (%s, 'face', 'auto', 'active', NOW()) - ON CONFLICT (name) DO UPDATE SET status = 'active' + INSERT INTO dev.identities (name, identity_type, source, status, created_at, file_uuid) + VALUES (%s, 'face', 'auto', 'active', NOW(), %s) + ON CONFLICT (name) DO UPDATE SET status = 'active', file_uuid = COALESCE(dev.identities.file_uuid, %s) RETURNING id - """, (f"PERSON_{UUID[:8]}_{cluster_id}",)) + """, (f"PERSON_{UUID[:8]}_{cluster_id}", UUID, UUID)) identity_id = cur.fetchone()[0] cluster_to_identity[cluster_id] = identity_id print(f" Cluster {cluster_id}: new identity {identity_id} (PERSON_{cluster_id})") diff --git a/scripts/match_identities_to_tmdb.py b/scripts/match_identities_to_tmdb.py new file mode 100644 index 0000000..6911a4e --- /dev/null +++ b/scripts/match_identities_to_tmdb.py @@ -0,0 +1,133 @@ +#!/opt/homebrew/bin/python3.11 +""" +Match auto-generated identities to TMDB identities via centroid embedding similarity. +Updates identity name, tmdb_id, source for matches above threshold. 
+ +Usage: python3 match_identities_to_tmdb.py +""" +import sys +import psycopg2 +import psycopg2.extras +import numpy as np + +DB = "dbname=momentry user=accusys host=localhost" +THRESHOLD = 0.55 + + +def cosine_similarity(a, b): + dot = np.dot(a, b) + na = np.linalg.norm(a) + nb = np.linalg.norm(b) + if na == 0 or nb == 0: + return 0.0 + return dot / (na * nb) + + +def main(): + uuid = sys.argv[1] if len(sys.argv) > 1 else "aeed71342a899fe4b4c57b7d41bcb692" + conn = psycopg2.connect(DB) + cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + + # Load TMDB identities with face_embedding (pgvector) + cur.execute(""" + SELECT id, name, tmdb_id, face_embedding::text as emb_text + FROM dev.identities + WHERE source = 'tmdb' AND face_embedding IS NOT NULL + """) + tmdb_identities = [] + for row in cur.fetchall(): + emb_str = row["emb_text"] + if not emb_str: + continue + emb = np.array([float(x) for x in emb_str.strip("[]").split(",")]) + tmdb_identities.append({ + "id": row["id"], + "name": row["name"], + "tmdb_id": row["tmdb_id"], + "embedding": emb, + }) + print(f"Loaded {len(tmdb_identities)} TMDB identities with embeddings") + + if not tmdb_identities: + print("No TMDB identities found. 
Run tmdb_embed_extractor.py first.") + cur.close() + conn.close() + return + + # Get auto identities linked to this file with their centroid embeddings + cur.execute(""" + SELECT DISTINCT i.id, i.name + FROM dev.identities i + INNER JOIN dev.face_detections fd ON fd.identity_id = i.id + WHERE fd.file_uuid = %s AND i.source = 'auto' + """, (uuid,)) + auto_rows = cur.fetchall() + print(f"Auto identities for {uuid[:8]}...: {len(auto_rows)}") + + matched = 0 + for row in auto_rows: + auto_id = row["id"] + auto_name = row["name"] + + # Get face embeddings from face_detections for this identity + cur.execute(""" + SELECT embedding + FROM dev.face_detections + WHERE file_uuid = %s AND identity_id = %s AND embedding IS NOT NULL + LIMIT 500 + """, (uuid, auto_id)) + emb_rows = cur.fetchall() + if not emb_rows: + continue + + # Compute centroid + all_embs = [np.array(r["embedding"], dtype=np.float32) for r in emb_rows] + centroid = np.mean(all_embs, axis=0) + + # Match against TMDB identities + best_sim = 0.0 + best_tmdb = None + for tmdb in tmdb_identities: + sim = cosine_similarity(centroid, tmdb["embedding"]) + if sim > best_sim: + best_sim = sim + best_tmdb = tmdb + + if best_tmdb and best_sim >= THRESHOLD: + fm = best_tmdb["name"] + tmdb_identity_id = best_tmdb["id"] + print(f" {auto_name} → {fm} (sim={best_sim:.3f})") + + # Update face_detections to point to TMDB identity + cur.execute(""" + UPDATE dev.face_detections + SET identity_id = %s + WHERE file_uuid = %s AND identity_id = %s + """, (tmdb_identity_id, uuid, auto_id)) + + # Update identity_bindings to point to TMDB identity + cur.execute(""" + UPDATE dev.identity_bindings + SET identity_id = %s + WHERE identity_id = %s + """, (tmdb_identity_id, auto_id)) + + # Mark auto identity as merged (or we could delete it) + cur.execute(""" + UPDATE dev.identities + SET source = 'merged', tmdb_id = %s + WHERE id = %s + """, (best_tmdb["tmdb_id"], auto_id)) + + matched += 1 + + conn.commit() + print(f"\nMatched 
{matched}/{len(auto_rows)} auto identities to TMDB") + print(f"Threshold: {THRESHOLD}") + + cur.close() + conn.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/swift_processors/swift_ocr.swift b/scripts/swift_processors/swift_ocr.swift index a2190c1..c13d6c4 100644 --- a/scripts/swift_processors/swift_ocr.swift +++ b/scripts/swift_processors/swift_ocr.swift @@ -126,7 +126,7 @@ struct SwiftOCR: ParsableCommand { let item: [String: Any] = [ "text": candidate.string, "x": Int(bb.origin.x * CGFloat(cgW)), - "y": Int(bb.origin.y * CGFloat(cgH)), + "y": Int((1.0 - bb.origin.y - bb.size.height) * CGFloat(cgH)), "width": Int(bb.size.width * CGFloat(cgW)), "height": Int(bb.size.height * CGFloat(cgH)), "confidence": conf @@ -183,16 +183,19 @@ struct SwiftOCR: ParsableCommand { guard (try? handler.perform([request])) != nil, let results = request.results else { return texts } + let cgW = CGFloat(CVPixelBufferGetWidth(pixelBuffer)) + let cgH = CGFloat(CVPixelBufferGetHeight(pixelBuffer)) + for obs in results { guard let candidate = obs.topCandidates(1).first, candidate.confidence > 0.2 else { continue } let bb = obs.boundingBox texts.append([ "text": candidate.string, - "x": Int(bb.origin.x * 640), - "y": Int(bb.origin.y * 360), - "width": Int(bb.size.width * 640), - "height": Int(bb.size.height * 360), + "x": Int(bb.origin.x * cgW), + "y": Int((1.0 - bb.origin.y - bb.size.height) * cgH), + "width": Int(bb.size.width * cgW), + "height": Int(bb.size.height * cgH), "confidence": candidate.confidence ]) } diff --git a/scripts/swift_processors/swift_pose.swift b/scripts/swift_processors/swift_pose.swift index c8ed994..67c5069 100644 --- a/scripts/swift_processors/swift_pose.swift +++ b/scripts/swift_processors/swift_pose.swift @@ -151,17 +151,19 @@ struct SwiftPose: ParsableCommand { if let mapped = nameMap[rawName] { rawName = mapped } + let px = point.location.x * CGFloat(w) + let py = CGFloat(h) - point.location.y * CGFloat(h) keypoints.append([ "name": 
rawName.isEmpty ? "\(joint)" : rawName, - "x": point.location.x * CGFloat(w), - "y": point.location.y * CGFloat(h), + "x": px, + "y": py, "confidence": point.confidence, ]) if point.confidence > 0.1 { - minX = min(minX, point.location.x) - minY = min(minY, point.location.y) - maxX = max(maxX, point.location.x) - maxY = max(maxY, point.location.y) + minX = min(minX, px) + minY = min(minY, py) + maxX = max(maxX, px) + maxY = max(maxY, py) } } } @@ -171,10 +173,10 @@ struct SwiftPose: ParsableCommand { ] if maxX > minX { bbox = [ - "x": Int(minX * CGFloat(w)), - "y": Int(minY * CGFloat(h)), - "width": Int((maxX - minX) * CGFloat(w)), - "height": Int((maxY - minY) * CGFloat(h)), + "x": Int(minX), + "y": Int(minY), + "width": Int(maxX - minX), + "height": Int(maxY - minY), ] } diff --git a/src/api/server.rs b/src/api/server.rs index f9c8a0c..0bb680c 100644 --- a/src/api/server.rs +++ b/src/api/server.rs @@ -72,6 +72,7 @@ fn get_uptime_ms() -> u64 { struct HealthResponse { status: String, version: String, + build_git_hash: String, uptime_ms: u64, } @@ -369,6 +370,7 @@ pub struct AppState { struct DetailedHealthResponse { status: String, version: String, + build_git_hash: String, uptime_ms: u64, services: ServiceHealth, } @@ -408,6 +410,7 @@ async fn health(State(state): State) -> Json { Json(HealthResponse { status: status.to_string(), version: env!("BUILD_VERSION").to_string(), + build_git_hash: env!("BUILD_GIT_HASH").to_string(), uptime_ms: get_uptime_ms(), }) } @@ -431,6 +434,7 @@ async fn health_detailed(State(state): State) -> Json u64 { + path.read_dir().map(|d| d.filter_map(|e| e.ok()).filter_map(|e| e.metadata().ok()).map(|m| m.len()).sum()).unwrap_or(0) +} + const DEMO_DIR: &str = "/Users/accusys/momentry/var/sftpgo/data/demo"; const OUTPUT_DIR: &str = "/Users/accusys/momentry/output_dev"; const RELEASE_DIR: &str = "/Users/accusys/momentry_core_0.1/release/files"; @@ -353,77 +357,133 @@ async fn cmd_package(db: &PostgresDb, uuid: &str) -> Result<()> { "width": 
width, "height": height, "status": "completed", + "momentry_version": env!("CARGO_PKG_VERSION"), + "momentry_build": env!("BUILD_GIT_HASH"), }); fs::write(outdir.join("file_info.json"), serde_json::to_string_pretty(&info)?)?; - // Export data.sql - let sql_path = outdir.join("data.sql"); + // Export per-table .sql files (avoid single 4.7GB psql load) + let sql_dir = outdir.join("sql"); + fs::create_dir_all(&sql_dir)?; let tables = [ ("dev.videos", "file_uuid"), ("dev.chunk", "file_uuid"), ("dev.chunk_vectors", "uuid"), ("dev.face_detections", "file_uuid"), + ("dev.tkg_nodes", "file_uuid"), + ("dev.tkg_edges", "file_uuid"), ]; - { - let mut f = fs::File::create(&sql_path)?; - writeln!(f, "-- Release package: {}", uuid)?; - writeln!(f, "BEGIN;")?; - writeln!(f)?; + let mut import_order = vec!["master.sql"]; - for (tbl, col) in &tables { - writeln!(f, "-- {} WHERE {} = '{}'", tbl, col, uuid)?; - // Get columns - let parts: Vec<&str> = tbl.split('.').collect(); - let cols = psql_exec(&format!( - "SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='{}' AND table_name='{}' AND is_updatable='YES'", - parts[0], parts[1] - ))?; - - // COPY - let data = psql_exec(&format!( - "COPY (SELECT * FROM {} WHERE {} = '{}') TO STDOUT WITH CSV HEADER", - tbl, col, uuid - ))?; - - if !data.is_empty() { - writeln!(f, "COPY {} ({}) FROM STDIN WITH CSV HEADER;", tbl, cols)?; - writeln!(f, "{}", data)?; - writeln!(f, "\\.")?; - writeln!(f)?; - } - } - // Export identities referenced by this file - writeln!(f, "-- dev.identities (referenced by face_detections)")?; - let cols = psql_exec("SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='dev' AND table_name='identities' AND is_updatable='YES'")?; + fn write_table_sql(outdir: &Path, tbl: &str, col: &str, uuid: &str, psql_exec: &dyn Fn(&str) -> Result) -> Result<()> { + let safe_name = tbl.replace('.', "_"); + let 
path = outdir.join(format!("{}.sql", safe_name)); + let parts: Vec<&str> = tbl.split('.').collect(); + let cols = psql_exec(&format!( + "SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='{}' AND table_name='{}' AND is_updatable='YES'", + parts[0], parts[1] + ))?; let data = psql_exec(&format!( - "COPY (SELECT DISTINCT i.* FROM dev.identities i INNER JOIN dev.face_detections fd ON fd.identity_id = i.id WHERE fd.file_uuid = '{}') TO STDOUT WITH CSV HEADER", uuid + "COPY (SELECT * FROM {} WHERE {} = '{}') TO STDOUT WITH CSV HEADER", + tbl, col, uuid ))?; if !data.is_empty() { + let mut f = fs::File::create(&path)?; + writeln!(f, "-- {} WHERE {} = '{}'", tbl, col, uuid)?; + writeln!(f, "COPY {} ({}) FROM STDIN WITH CSV HEADER;", tbl, cols)?; + writeln!(f, "{}", data)?; + writeln!(f, "\\.")?; + let sz = fs::metadata(&path)?.len(); + println!(" sql/{} ({} MB)", safe_name, sz / 1024 / 1024); + } + Ok(()) + } + + for (tbl, col) in &tables { + write_table_sql(&sql_dir, tbl, col, uuid, &|q| psql_exec(q))?; + } + + // Export identities with file_uuid (direct column, no JOIN needed) + // FILE LOCAL: file_uuid = '{uuid}' + // GLOBAL (cross-file): tmdb identities + user-defined (exclude inactive auto) + let idents_name = "dev_identities"; + let idents_path = sql_dir.join(format!("{}.sql", idents_name)); + { + let idents_query = format!( + "COPY (SELECT * FROM dev.identities WHERE file_uuid = '{}' OR (file_uuid IS NULL AND source IN ('tmdb', 'merged', 'user_defined'))) TO STDOUT WITH CSV HEADER", uuid + ); + let cols = psql_exec(&format!( + "SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='dev' AND table_name='identities' AND is_updatable='YES'" + ))?; + let data = psql_exec(&idents_query)?; + if !data.is_empty() { + let mut f = fs::File::create(&idents_path)?; + writeln!(f, "-- dev.identities WHERE file_uuid = '{}' OR global 
(tmdb/merged/user_defined)", uuid)?; writeln!(f, "COPY dev.identities ({}) FROM STDIN WITH CSV HEADER;", cols)?; writeln!(f, "{}", data)?; writeln!(f, "\\.")?; - writeln!(f)?; } + } - // Export identity_bindings for identities referenced by this file - writeln!(f, "-- dev.identity_bindings (for identities in face_detections)")?; - let cols = psql_exec("SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='dev' AND table_name='identity_bindings' AND is_updatable='YES'")?; - let data = psql_exec(&format!( - "COPY (SELECT DISTINCT ib.* FROM dev.identity_bindings ib INNER JOIN dev.face_detections fd ON fd.identity_id = ib.identity_id WHERE fd.file_uuid = '{}') TO STDOUT WITH CSV HEADER", uuid + // Export identity_bindings with custom query + let binds_name = "dev_identity_bindings"; + let binds_path = sql_dir.join(format!("{}.sql", binds_name)); + { + let binds_query = format!( + "COPY (SELECT DISTINCT ib.* FROM dev.identity_bindings ib INNER JOIN dev.face_detections fd ON fd.identity_id = ib.identity_id AND fd.trace_id IS NOT NULL WHERE fd.file_uuid = '{}' AND ib.identity_value IN (SELECT DISTINCT trace_id::text FROM dev.face_detections WHERE file_uuid = '{}' AND trace_id IS NOT NULL)) TO STDOUT WITH CSV HEADER", uuid, uuid + ); + let cols = psql_exec(&format!( + "SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='dev' AND table_name='identity_bindings' AND is_updatable='YES'" ))?; + let data = psql_exec(&binds_query)?; if !data.is_empty() { + let mut f = fs::File::create(&binds_path)?; + writeln!(f, "-- dev.identity_bindings (from face_detections JOIN)")?; writeln!(f, "COPY dev.identity_bindings ({}) FROM STDIN WITH CSV HEADER;", cols)?; writeln!(f, "{}", data)?; writeln!(f, "\\.")?; - writeln!(f)?; } + } + + // Write master.sql (import order, runs BEGIN/COMMIT around all) + let master_path = sql_dir.join("master.sql"); + { + let mut f = 
fs::File::create(&master_path)?; + writeln!(f, "BEGIN;")?; + writeln!(f)?; + + writeln!(f, "\\i sql/dev_videos.sql")?; + writeln!(f, "\\i sql/dev_chunk.sql")?; + writeln!(f, "\\i sql/dev_chunk_vectors.sql")?; + writeln!(f, "\\i sql/dev_face_detections.sql")?; + writeln!(f, "\\i sql/dev_identities.sql")?; + writeln!(f, "\\i sql/dev_identity_bindings.sql")?; + writeln!(f, "\\i sql/dev_tkg_nodes.sql")?; + writeln!(f, "\\i sql/dev_tkg_edges.sql")?; writeln!(f, "COMMIT;")?; } - let sql_size = fs::metadata(&sql_path)?.len(); - println!(" data.sql ({} MB)", sql_size / 1024 / 1024); + // Write legacy data.sql that sources master via psql -f (backward compat) + let sql_path = outdir.join("data.sql"); + { + let mut f = fs::File::create(&sql_path)?; + writeln!(f, "-- Release package: {} — see sql/ for per-table files", uuid)?; + writeln!(f, "BEGIN;")?; + writeln!(f, "\\i sql/dev_videos.sql")?; + writeln!(f, "\\i sql/dev_chunk.sql")?; + writeln!(f, "\\i sql/dev_chunk_vectors.sql")?; + writeln!(f, "\\i sql/dev_face_detections.sql")?; + writeln!(f, "\\i sql/dev_identities.sql")?; + writeln!(f, "\\i sql/dev_identity_bindings.sql")?; + writeln!(f, "\\i sql/dev_tkg_nodes.sql")?; + writeln!(f, "\\i sql/dev_tkg_edges.sql")?; + writeln!(f, "COMMIT;")?; + } + + let sql_dir_sz = dir_size(&sql_dir); + println!(" sql/ directory ({} MB total)", sql_dir_sz / 1024 / 1024); // Copy video file if !file_path.is_empty() { @@ -487,6 +547,39 @@ async fn cmd_package(db: &PostgresDb, uuid: &str) -> Result<()> { } let tsize = fs::metadata(&tarball)?.len(); println!("\n Package: {} ({} MB)", tarball.display(), tsize / 1024 / 1024); + + // Sanity check: warn if any sql file is suspiciously large + println!(" Checking sql/ file sizes..."); + for entry in fs::read_dir(&sql_dir)? 
{ + let entry = entry?; + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) == Some("sql") && path.is_file() { + let sz = fs::metadata(&path)?.len() as f64 / 1024.0 / 1024.0; + let name = path.file_stem().and_then(|s| s.to_str()).unwrap_or("?"); + match name { + "dev_videos" | "master" if sz > 1.0 => + println!(" ⚠️ {} is {} MB, expected < 1 MB", name, sz as u64), + "dev_chunk" if sz > 2.0 => + println!(" ⚠️ {} is {} MB, expected < 2 MB for ~2.4K chunks", name, sz as u64), + "dev_identities" if sz > 1.0 => + println!(" ⚠️ {} is {} MB, expected < 1 MB for ~428 identities", name, sz as u64), + "dev_identity_bindings" if sz > 5.0 => + println!(" ⚠️ {} is {} MB, expected < 5 MB for ~7.6K bindings", name, sz as u64), + "dev_tkg_nodes" if sz > 10.0 => + println!(" ⚠️ {} is {} MB, expected < 10 MB for ~6.4K nodes", name, sz as u64), + "dev_tkg_edges" if sz > 20.0 => + println!(" ⚠️ {} is {} MB, expected < 20 MB for ~21K edges", name, sz as u64), + "dev_face_detections" if sz > 1000.0 => + println!(" ⚠️ {} is {} MB, expected < 1000 MB for ~70K faces (512D emb)", name, sz as u64), + "dev_chunk_vectors" if sz > 200.0 => + println!(" ⚠️ {} is {} MB, expected < 200 MB for ~2.4K chunks (768D emb)", name, sz as u64), + _ => {} + } + if sz > 2000.0 { + println!(" ⚠️ {} is {:.0} MB — unusually large, verify query", name, sz); + } + } + } Ok(()) } @@ -646,7 +739,9 @@ fn cmd_stats() -> Result<()> { #[tokio::main] async fn main() -> Result<()> { - dotenv::from_filename(".env.development").ok(); + if dotenv::from_filename("/Users/accusys/momentry_core_0.1/.env.development").is_err() { + let _ = dotenv::from_filename(".env.development"); + } let cli = Cli::parse(); let db = PostgresDb::new(&config::DATABASE_URL).await?; diff --git a/src/core/db/postgres_db.rs b/src/core/db/postgres_db.rs index e15527b..7e09797 100644 --- a/src/core/db/postgres_db.rs +++ b/src/core/db/postgres_db.rs @@ -482,7 +482,7 @@ impl ProcessorType { pub fn all() -> Vec { vec![ 
ProcessorType::Cut, - ProcessorType::Scene, + // Scene (Places365) removed — replaced by heuristic_scene_metadata post-processor ProcessorType::Asr, ProcessorType::Asrx, ProcessorType::Yolo, diff --git a/src/core/person_identity.rs b/src/core/person_identity.rs index b21c78a..c324087 100644 --- a/src/core/person_identity.rs +++ b/src/core/person_identity.rs @@ -34,9 +34,21 @@ pub struct PersonIdentity { pub struct Identity { pub id: i32, pub name: String, - pub embedding: Option, // Vector embedding stored as text/json + pub embedding: Option, pub metadata: Option, pub created_at: DateTime, + pub uuid: Option, + pub identity_type: Option, + pub source: Option, + pub status: Option, + pub face_embedding: Option>, + pub voice_embedding: Option>, + pub identity_embedding: Option>, + pub reference_data: Option, + pub tmdb_id: Option, + pub tmdb_profile: Option, + pub tmdb_poster: Option, + pub file_uuid: Option, } /// 身份綁定記錄 (Identity Binding) diff --git a/src/core/processor/heuristic_scene.rs b/src/core/processor/heuristic_scene.rs new file mode 100644 index 0000000..9a385da --- /dev/null +++ b/src/core/processor/heuristic_scene.rs @@ -0,0 +1,292 @@ +use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use sqlx::PgPool; +use std::path::Path; +use tracing::info; + +/// Heuristic scene metadata derived from YOLO + Face + luminance data. +/// Runs as a post-processing trigger, not a standalone processor. +/// Replaces the removed Places365 Scene classifier. 
+ +#[derive(Debug, Serialize)] +pub struct HeuristicSceneMeta { + pub file_uuid: String, + pub segments: Vec, +} + +#[derive(Debug, Serialize)] +pub struct SceneSegmentMeta { + pub segment_index: u32, + pub start_frame: i64, + pub end_frame: i64, + pub start_time: f64, + pub end_time: f64, + pub indoor_score: f64, + pub outdoor_score: f64, + pub crowd_size: CrowdSize, + pub max_face_count: i64, + pub dominant_objects: Vec, + pub likely_vehicle_transport: bool, + pub avg_brightness: Option, +} + +#[derive(Debug, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum CrowdSize { + Empty, + Single, + Duo, + SmallGroup, + Crowd, +} + +/// Indoor-indicative YOLO classes (COCO labels) +const INDOOR_CLASSES: &[&str] = &[ + "chair", "couch", "bed", "dining table", "toilet", "tv", "laptop", + "microwave", "oven", "refrigerator", "sink", "book", "clock", + "vase", "potted plant", +]; + +/// Vehicle-indicative classes (person + vehicle = transport scene); COCO names plus legacy VOC aliases ("aeroplane", "motorbike") +const VEHICLE_CLASSES: &[&str] = &[ + "car", "truck", "bus", "train", "boat", "airplane", "aeroplane", "bicycle", "motorcycle", "motorbike", +]; + +/// Outdoor-indicative YOLO classes +const OUTDOOR_CLASSES: &[&str] = &[ + "car", "truck", "bus", "train", "boat", "airplane", + "traffic light", "fire hydrant", "stop sign", "parking meter", + "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", "tree", +]; + +/// Build heuristic scene metadata from disk files (yolo.json + DB face data). +/// segment_boundaries: [(start_frame, end_frame, start_time, end_time), ...] +/// — from CUT detections.
+pub async fn build_heuristic_scene_meta( + pool: &PgPool, + file_uuid: &str, + segment_boundaries: &[(i64, i64, f64, f64)], +) -> Result { + if segment_boundaries.is_empty() { + return Ok(HeuristicSceneMeta { + file_uuid: file_uuid.to_string(), + segments: vec![], + }); + } + + use std::collections::HashMap; + use std::collections::HashSet; + + // Build frame→class_counts map from yolo.json + let yolo_path = Path::new(crate::core::config::OUTPUT_DIR.as_str()) + .join(format!("{}.yolo.json", file_uuid)); + let mut frame_objects: HashMap> = HashMap::new(); + if yolo_path.exists() { + if let Ok(yolo_str) = tokio::fs::read_to_string(&yolo_path).await { + #[derive(Deserialize)] + struct YoloJson { + frames: Vec, + } + #[derive(Deserialize)] + struct YoloFrameJson { + frame: i64, + objects: Vec, + } + #[derive(Deserialize)] + struct YoloObjectJson { + class_name: String, + } + if let Ok(yolo) = serde_json::from_str::(&yolo_str) { + for frm in &yolo.frames { + let classes: Vec = + frm.objects.iter().map(|o| o.class_name.clone()).collect(); + if !classes.is_empty() { + frame_objects.insert(frm.frame, classes); + } + } + } + } + } + + // Get face counts grouped by frame + let face_rows: Vec<(i64, i64)> = sqlx::query_as( + "SELECT frame_number, COUNT(*) as fc \ + FROM dev.face_detections \ + WHERE file_uuid = $1 AND frame_number IS NOT NULL \ + GROUP BY frame_number \ + ORDER BY frame_number", + ) + .bind(file_uuid) + .fetch_all(pool) + .await + .unwrap_or_default(); + + let mut frame_face_counts: HashMap = HashMap::new(); + for (frame, count) in &face_rows { + frame_face_counts.insert(*frame, *count); + } + + // Process each segment + let mut segments = Vec::new(); + for (idx, &(start_f, end_f, start_t, end_t)) in segment_boundaries.iter().enumerate() { + let mut class_counts: HashMap = HashMap::new(); + let mut class_frame_presence: HashMap = HashMap::new(); + let mut indoor_objects = 0u64; + let mut outdoor_objects = 0u64; + let mut max_faces: i64 = 0; + let mut 
frame_count = 0u64; + + for frame in start_f..=end_f { + frame_count += 1; + if let Some(objects) = frame_objects.get(&frame) { + let mut seen_this_frame: HashSet = HashSet::new(); + for cls in objects { + *class_counts.entry(cls.clone()).or_default() += 1; + if seen_this_frame.insert(cls.clone()) { + *class_frame_presence.entry(cls.clone()).or_default() += 1; + } + if INDOOR_CLASSES.contains(&cls.as_str()) { + indoor_objects += 1; + } else if OUTDOOR_CLASSES.contains(&cls.as_str()) { + outdoor_objects += 1; + } + } + } + if let Some(&fc) = frame_face_counts.get(&frame) { + max_faces = max_faces.max(fc); + } + } + + // Normalize by frame count (prevents static-scene FP inflation) + let indoor_ratio = indoor_objects as f64 / frame_count.max(1) as f64; + let outdoor_ratio = outdoor_objects as f64 / frame_count.max(1) as f64; + let total_indicator = indoor_ratio + outdoor_ratio; + let (indoor_score, outdoor_score) = if total_indicator > 0.0 { + (indoor_ratio / total_indicator, outdoor_ratio / total_indicator) + } else { + (0.5, 0.5) + }; + + // Determine crowd size + let crowd_size = match max_faces { + 0 => CrowdSize::Empty, + 1 => CrowdSize::Single, + 2 | 3 => CrowdSize::Duo, + 4..=10 => CrowdSize::SmallGroup, + _ => CrowdSize::Crowd, + }; + + // Vehicle transport detection: check BEFORE class_frame_presence is consumed + let person_frames = class_frame_presence.get("person").copied().unwrap_or(0); + let vehicle_frames: u64 = VEHICLE_CLASSES + .iter() + .map(|c| class_frame_presence.get(*c).copied().unwrap_or(0)) + .sum(); + let person_ratio = person_frames as f64 / frame_count.max(1) as f64; + let likely_vehicle = person_ratio > 0.5 && vehicle_frames > 0 + && outdoor_score > 0.3; + + // Dominant objects: rank by frame presence (not total count) + let mut sorted: Vec<_> = class_frame_presence.into_iter().collect(); + sorted.sort_by(|a, b| b.1.cmp(&a.1)); + let dominant_objects: Vec = sorted + .iter() + .take(3) + .map(|(cls, _)| cls.clone()) + .collect(); + + 
segments.push(SceneSegmentMeta { + segment_index: idx as u32 + 1, + start_frame: start_f, + end_frame: end_f, + start_time: start_t, + end_time: end_t, + indoor_score, + outdoor_score, + crowd_size, + max_face_count: max_faces, + dominant_objects, + likely_vehicle_transport: likely_vehicle, + avg_brightness: None, // Future: from frame luminance analysis + }); + } + + info!( + "[SCENE-META] {} segments generated for {}", + segments.len(), + file_uuid + ); + + Ok(HeuristicSceneMeta { + file_uuid: file_uuid.to_string(), + segments, + }) +} + +/// Full pipeline entry point: reads CUT data, generates heuristic metadata, writes JSON. +/// Called from job_worker post-processing trigger. +pub async fn generate_scene_meta(db: &crate::core::db::PostgresDb, file_uuid: &str) -> Result { + let pool = db.pool(); + + // Read CUT segment boundaries from cut.json + let cut_path = Path::new(crate::core::config::OUTPUT_DIR.as_str()) + .join(format!("{}.cut.json", file_uuid)); + let segments: Vec<(i64, i64, f64, f64)> = if cut_path.exists() { + let cut_str = tokio::fs::read_to_string(&cut_path) + .await + .context("Failed to read cut.json")?; + #[derive(Deserialize)] + struct CutJson { + scenes: Vec, + } + #[derive(Deserialize)] + struct CutSceneJson { + start_frame: i64, + end_frame: i64, + start_time: f64, + end_time: f64, + } + let cut: CutJson = serde_json::from_str(&cut_str) + .context("Failed to parse cut.json")?; + cut.scenes + .into_iter() + .map(|s| (s.start_frame, s.end_frame, s.start_time, s.end_time)) + .collect() + } else { + // Fallback: query DB for video duration, make one segment + let (total_frames, duration): (Option, Option) = sqlx::query_as( + "SELECT total_frames, duration FROM dev.videos WHERE file_uuid = $1", + ) + .bind(file_uuid) + .fetch_optional(pool) + .await + .context("Failed to query video info")? 
+ .unwrap_or((Some(0), Some(0.0))); + let tf = total_frames.unwrap_or(0); + let dur = duration.unwrap_or(0.0); + if tf > 0 { + vec![(0, tf, 0.0, dur)] + } else { + vec![] + } + }; + + if segments.is_empty() { + info!("[SCENE-META] No segments for {}", file_uuid); + return Ok(0); + } + + let meta = build_heuristic_scene_meta(pool, file_uuid, &segments).await?; + let n = meta.segments.len(); + + // Write scene_meta.json + let out_path = Path::new(crate::core::config::OUTPUT_DIR.as_str()) + .join(format!("{}.scene_meta.json", file_uuid)); + let json_str = serde_json::to_string_pretty(&meta)?; + tokio::fs::write(&out_path, json_str) + .await + .context("Failed to write scene_meta.json")?; + + Ok(n) +} diff --git a/src/core/processor/mod.rs b/src/core/processor/mod.rs index f12fd8e..b783848 100644 --- a/src/core/processor/mod.rs +++ b/src/core/processor/mod.rs @@ -5,6 +5,7 @@ pub mod cut; pub mod executor; pub mod face; pub mod face_recognition; +pub mod heuristic_scene; pub mod ocr; pub mod pose; pub mod scene_classification; @@ -23,6 +24,9 @@ pub use face_recognition::{ FaceRecognitionFrame, FaceRecognitionResult, FaceRegistrationResult, RecognizedFace, RecognizedFaceDetection, }; +pub use heuristic_scene::{ + build_heuristic_scene_meta, generate_scene_meta, CrowdSize, HeuristicSceneMeta, SceneSegmentMeta, +}; pub use ocr::{process_ocr, OcrFrame, OcrResult, OcrText}; pub use pose::{process_pose, Bbox, Keypoint, PersonPose, PoseFrame, PoseResult}; pub use scene_classification::{ diff --git a/src/worker/job_worker.rs b/src/worker/job_worker.rs index 2d9e54f..c836589 100644 --- a/src/worker/job_worker.rs +++ b/src/worker/job_worker.rs @@ -17,6 +17,7 @@ use crate::core::db::{ use crate::core::embedding::Embedder; use crate::worker::config::WorkerConfig; use crate::worker::processor::{ProcessorPool, ProcessorTask}; +use crate::core::processor::heuristic_scene::generate_scene_meta; use crate::worker::resources::SystemResources; pub struct JobWorker { @@ -861,6 +862,26 @@ 
impl JobWorker { }); } + // 🚀 P2.7 Trigger: Heuristic Scene Metadata (Face + YOLO → scene attributes) + // Replaces removed Places365 Scene classifier. + if has_face && has_yolo { + info!("📝 Face + YOLO complete, generating heuristic scene metadata..."); + let db_clone = self.db.clone(); + let uuid_clone = uuid.to_string(); + tokio::spawn(async move { + match generate_scene_meta(&db_clone, &uuid_clone).await { + Ok(n) => info!( + "✅ Heuristic scene metadata: {} segments for {}", + n, uuid_clone + ), + Err(e) => error!( + "❌ Heuristic scene metadata failed for {}: {}", + uuid_clone, e + ), + } + }); + } + // 🚀 P3 Trigger: Identity Agent (Face + ASRX) if has_face && has_asrx { info!("📝 Prerequisites met for Identity Agent. Starting analysis...");