diff --git a/.env.development b/.env.development index b457003..fd5b6b2 100644 --- a/.env.development +++ b/.env.development @@ -67,4 +67,8 @@ REDIS_CACHE_TTL_VIDEO_META=3600 # 多個同義詞檔案(逗號分隔),會覆蓋 MOMENTRY_SYNONYM_FILE # MOMENTRY_SYNONYM_FILES=/path/to/first.json,/path/to/second.json # -# 示例檔案:docs/examples/custom_synonyms.json \ No newline at end of file +# 示例檔案:docs/examples/custom_synonyms.json + +# TMDb Integration (probe phase - auto-create identities from movie metadata) +TMDB_API_KEY=your_tmdb_api_key_here +MOMENTRY_TMDB_PROBE_ENABLED=true \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 30d7872..6877ce4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -357,6 +357,12 @@ cargo run --features player --bin momentry_player -- -o - `MOMENTRY_CUT_TIMEOUT` - CUT timeout in seconds (default: 3600) - `MOMENTRY_DEFAULT_TIMEOUT` - Default timeout (default: 7200) +### TMDb Integration (Face Clustering) +- `TMDB_API_KEY` - TMDb API key for movie metadata lookup (required for `MOMENTRY_TMDB_PROBE_ENABLED=true`) +- `MOMENTRY_TMDB_PROBE_ENABLED` - Enable TMDb probe during registration (default: `false`) + - Register phase: searches TMDb by filename, creates identities with tmdb_id/tmdb_profile + - Post-process phase: matches detected faces against TMDb identities via cosine similarity + ### Synonym Expansion - `MOMENTRY_SYNONYM_FILES` - Comma-separated paths to synonym JSON files (e.g., `data/english_synonyms.json,data/llm_synonyms.json`) - `MOMENTRY_SYNONYM_FILE` - Single synonym JSON file path (deprecated, use above) @@ -372,6 +378,7 @@ cargo run --features player --bin momentry_player -- -o - Monitor directory is a separate system (not Rust) - PythonExecutor provides unified script execution with timeout support - Redis 1.0.x for improved performance +- FaceNet CoreML model (`models/facenet512.mlpackage`) replaces InsightFace for embedding extraction (MIT license, ANE-accelerated) ### LLM Synonym Generation diff --git a/Cargo.lock b/Cargo.lock index 
293490d..5200c3a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2380,6 +2380,7 @@ dependencies = [ "tempfile", "thiserror 1.0.69", "tokio", + "tokio-util", "tower 0.4.13", "tower-http 0.5.2", "tracing", diff --git a/Cargo.toml b/Cargo.toml index f7be791..957288c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,6 +80,7 @@ crossterm = "0.28" # Terminal atty = "0.2" +tokio-util = { version = "0.7.18", features = ["io"] } # System diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/API_DICTIONARY_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/API_DICTIONARY_V1.0.0.md index 5f5be85..ab20311 100644 --- a/docs_v1.0/API_V1.0.0/INTERNAL/API_DICTIONARY_V1.0.0.md +++ b/docs_v1.0/API_V1.0.0/INTERNAL/API_DICTIONARY_V1.0.0.md @@ -2,8 +2,8 @@ document_type: "reference_doc" service: "MOMENTRY_CORE" title: "Momentry Core API 字典 V1.0.0" -date: "2026-05-01" -version: "V1.0" +date: "2026-05-06" +version: "V1.3" status: "active" owner: "Warren" created_by: "OpenCode" @@ -24,160 +24,150 @@ ai_query_hints: related_documents: - "API_V1.0.0/MOMENTRY_CORE_API_V1.0.0.md" - "API_V1.0.0/API_USAGE_DEMO_V1.0.0.md" - - "API_V1.0.0/API_REFERENCE_v1.0.0.20260501md.md" - "API_V1.0.0/CHUNK_DEFINITION_V1.0.0.md" - "API_V1.0.0/VECTOR_SPEC_V1.0.0.md" --- -# Momentry Core API 字典級全量文件 V1.0.0 +# Momentry Core API 字典 V1.0.0 ## 關鍵術語定義 | 術語 | 定義 | |------|------| -| Public API | 供前端與外部系統使用的標準介面(58 個端點) | -| Internal API | 系統內部流程或狀態查詢用(5 個端點) | -| Admin API | 管理員專用(5 個端點) | -| file_uuid | 32 碼 SHA256 檔案識別碼 | -| RESTful | 以資源為中心的 API 設計風格 | +| Public API | 供前端與外部系統使用的標準介面 | +| Internal API | 系統內部流程或狀態查詢用 | +| Admin API | 管理員專用 | +| file_uuid | 32 碼 birth UUID(MAC + time + path + filename) | +| identity_uuid | 32 碼 UUIDv5(source + external_id) | +| RESTful | 以資源為中心的 API 設計風格,collection 複數、resource 單數 | -## 📊 端點統計 (Endpoint Statistics) +## 端點統計 | 分類 | 數量 | 說明 | |---|---|---| -| ✅ **Public** | 58 | 供前端與外部系統使用的標準介面 | -| ⚠️ **Internal** | 5 | 系統內部流程或狀態查詢 (如 Probe, SFTPGo) | -| 🔒 **Admin** | 5 | 管理員專用 (如 Resources, Config Cache) | -| 
**總計** | **67** | 所有已註冊路由 (`gen-traces` 已移除) | +| Public | 46 | 供前端與外部系統使用的標準介面 | +| Internal | 5 | 系統內部流程或狀態查詢 | +| Admin | 1 | 管理員專用 | +| Health | 2 | 服務健康檢查 | +| **總計** | **54** | 所有已註冊路由 | -| 項目 | 內容 | -|------|------| -| 建立者 | OpenCode | -| 建立時間 | 2026-05-01 | -| 端點總數 | **68** | -| 文件版本 | V1.1 (Route Fixes + Arch Notes) | +## 設計原則 + +### 1. RESTful 命名規範 +- Collection(複數): `/api/v1/files`, `/api/v1/identities` +- Resource(單數): `/api/v1/file/:file_uuid`, `/api/v1/identity/:identity_uuid` +- Action on resource: `/api/v1/identity/:identity_uuid/bind` + +### 2. File-Centric +- 每個媒體檔案由 32 碼 UUID (`file_uuid`) 唯一標識 +- File 是所有資料的根節點,Chunk、Job 隸屬於特定 File + +### 3. Global Identity +- Identity 跨檔案關聯,不受單一檔案限制 +- 透過 bind/unbind/mergeinto 管理 Face → Identity 的直接 FK 綁定(V4.0) --- -## 🚀 設計原則 (Design Principles) +## 1. 系統與認證 -### 1. Clear API (介面清晰化) -* **去蕪存菁**: 嚴格區分 **Public** (公開) 與 **Internal** (內部) 端點。舊版冗餘路徑(如 `/api/v1/videos`, `/api/v1/probe`)已全面移除或合併。 -* **標準化回應**: 所有列表型 API 均回傳統一結構 `{ "success": true, "data": [...], "total": N }`。 -* **命名規範**: 採用 RESTful 風格,資源以複數名詞或明確動作命名(如 `files`, `identities`)。 - -### 2. File-Centric (以檔案為核心) -* **唯一識別**: 每個媒體檔案(影片/圖片/音訊)均由 **32 碼 UUID** (`file_uuid`) 唯一標識。 -* **生命週期**: `File` 是所有資料的根節點。所有的 `Chunk` (片段), `Snapshot` (快照), `Jobs` (任務) 皆隸屬於特定的 `File`。 -* **操作模式**: 前端應優先呼叫 `GET /api/v1/files` 取得清單,再透過 `POST /api/v1/files/:uuid/snapshots/migrate` 載入詳細資源。 - -### 4. Trace Aggregation (軌跡聚合獨立化) -* **架構**: `trace_face` 聚合由獨立 Python 腳本 `scripts/trace_face_aggregator.py` 處理,**不**內嵌於 Rust DB 層。 -* **流程**: Face Processor (Python) 輸出離散幀級資料到 `face_detections` 表 → Rust Worker 排程 `trace_face_aggregator.py` → 該腳本讀取 DB、按 `face_id` 分組聚合、寫入 `pre_chunks` (source_type=`trace_face`)。 -* **設計理由**: 保持 Rust 排程層輕量化,軌跡聚合邏輯留在 Python 層統一維護,便於未來調整聚合演算法 (如 IOU 門檻、時間間隔合併等) 而無需重新編譯 Rust。 - -### 5. 
Global Identity (全域身份識別) -* **跨檔案關聯**: `Identity` 代表一個獨立的人物或角色,不受單一檔案限制。 -* **綁定機制 (Binding)**: 透過 `POST /api/v1/identities/bind`,我們可以將多個檔案中偵測到的臉部 (`face`) 或聲音 (`speaker`) 聚合到同一個 `Identity` 下。 -* **資料聚合**: 查詢某個 `Identity` 即可看到該人物在所有歷史檔案中的軌跡 (`/api/v1/identities/:uuid/files`)。 - ---- - -## 1. 系統與認證 (System & Auth) | 方法 | 路徑 | 狀態 | -|---|---|---| -| `GET` | `/health` | ✅ Public | -| `GET` | `/health/detailed` | ✅ Public | -| `POST` | `/api/v1/auth/login` | ✅ Public | -| `POST` | `/api/v1/auth/logout` | ✅ Public | +|------|------|------| +| `GET` | `/health` | Health | +| `GET` | `/health/detailed` | Health | +| `POST` | `/api/v1/auth/login` | Public | +| `POST` | `/api/v1/auth/logout` | Public | -## 2. 檔案管理 (Files & Assets) -| 方法 | 路徑 | 狀態 | -|---|---|---| -| `GET` | `/api/v1/files` | ✅ Public | -| `GET` | `/api/v1/files/scan` | ✅ Public | -| `POST` | `/api/v1/files/register` | ✅ Public | -| `POST` | `/api/v1/unregister` | ✅ Public | -| `GET` | `/api/v1/files/:file_uuid` | ✅ Public | -| `GET` | `/api/v1/files/:file_uuid/identities` | ✅ Public | -| `GET` | `/api/v1/files/:file_uuid/snapshots` | ✅ Public | -| `GET` | `/api/v1/files/:file_uuid/snapshots/status` | ✅ Public | -| `POST` | `/api/v1/files/:file_uuid/snapshots/migrate` | ✅ Public | -| `POST` | `/api/v1/files/:file_uuid/snapshots/teardown` | ✅ Public | +## 2. 檔案管理 (Files) -## 3. 
影片與任務 (Videos & Jobs) | 方法 | 路徑 | 狀態 | -|---|---|---| -| `DELETE` | `/api/v1/videos/:file_uuid` | ✅ Public | -| `GET` | `/api/v1/videos/:file_uuid/details` | ✅ Public | -| `GET` | `/api/v1/videos/:file_uuid/pre_chunks` | ✅ Public | -| `GET` | `/api/v1/progress/:file_uuid` | ✅ Public | -| `GET` | `/api/v1/jobs` | ✅ Public | -| `GET` | `/api/v1/jobs/:job_id` | ✅ Public | -| `GET` | `/api/v1/rules/:rule/status` | ✅ Public | -| `GET` | `/api/v1/files/:file_uuid/probe` | ✅ Public | -| `POST` | `/api/v1/files/:file_uuid/process` | ✅ Public | -| `GET` | `/api/v1/assets/:uuid/status` | ⚠️ Internal | -| `POST` | `/api/v1/resources/register` | 🔒 Internal | -| `POST` | `/api/v1/resources/heartbeat` | 🔒 Internal | -| `GET` | `/api/v1/resources` | 🔒 Internal | +|------|------|------| +| `GET` | `/api/v1/files` | Public | +| `GET` | `/api/v1/files/scan` | Public | +| `POST` | `/api/v1/files/register` | Public | +| `POST` | `/api/v1/files/unregister` | Public | +| `GET` | `/api/v1/file/:file_uuid` | Public | +| `GET` | `/api/v1/file/:file_uuid/probe` | Public | +| `POST` | `/api/v1/file/:file_uuid/process` | Public | +| `GET` | `/api/v1/file/:file_uuid/identities` | Public | +| `GET` | `/api/v1/file/:file_uuid/chunks` | Public | +| `GET` | `/api/v1/file/:file_uuid/thumbnail?frame=&x=&y=&w=&h=` | Public | +| `POST` | `/api/v1/file/:file_uuid/face_trace/sortby` | Public | + +## 3. 管線與任務 (Pipeline & Jobs) + +| 方法 | 路徑 | 狀態 | +|------|------|------| +| `GET` | `/api/v1/progress/:file_uuid` | Public | +| `GET` | `/api/v1/jobs` | Public | +| `GET` | `/api/v1/job/:job_id` | Public | +| `GET` | `/api/v1/rule/:rule_id/status` | Public | +| `POST` | `/api/v1/resource/register` | Internal | +| `POST` | `/api/v1/resource/heartbeat` | Internal | +| `GET` | `/api/v1/resources` | Internal | ## 4. 
搜尋 (Search) -| 方法 | 路徑 | 狀態 | -|---|---|---| -| `POST` | `/api/v1/search` | ✅ Public | -| `POST` | `/api/v1/search/bm25` | ✅ Public | -| `POST` | `/api/v1/search/hybrid` | ✅ Public | -| `POST` | `/api/v1/search/visual` | ✅ Public | -| `POST` | `/api/v1/search/visual/class` | ✅ Public | -| `POST` | `/api/v1/search/visual/density` | ✅ Public | -| `POST` | `/api/v1/search/visual/combination` | ✅ Public | -| `POST` | `/api/v1/search/visual/stats` | ✅ Public | -## 5. 身份與綁定 (Identity & Binding) | 方法 | 路徑 | 狀態 | -|---|---|---| -| `GET` | `/api/v1/identities` | ✅ Public | -| `GET` | `/api/v1/identities/:uuid` | ✅ Public | -| `GET` | `/api/v1/identities/:uuid/files` | ✅ Public | -| `GET` | `/api/v1/identities/:uuid/chunks` | ✅ Public | -| `GET` | `/api/v1/identities/:identity_id/faces` | ✅ Public | -| `POST` | `/api/v1/identities/from-person` | ✅ Public | -| `POST` | `/api/v1/identities/from-face` | ✅ Public | -| `POST` | `/api/v1/identities/bind` | ✅ Public | -| `POST` | `/api/v1/identities/unbind` | ✅ Public | +|------|------|------| +| `POST` | `/api/v1/search` | Public | +| `POST` | `/api/v1/search/bm25` | Public | +| `POST` | `/api/v1/search/hybrid` | Public | +| `POST` | `/api/v1/search/smart` | Public | +| `POST` | `/api/v1/search/universal` | Public | +| `POST` | `/api/v1/search/frames` | Public | +| `POST` | `/api/v1/search/visual` | Public | +| `POST` | `/api/v1/search/visual/class` | Public | +| `POST` | `/api/v1/search/visual/density` | Public | +| `POST` | `/api/v1/search/visual/combination` | Public | +| `POST` | `/api/v1/search/visual/stats` | Public | + +## 5. 身份管理 (Identity) -## 6. 
臉部 (Face) | 方法 | 路徑 | 狀態 | -|---|---|---| -| `GET` | `/api/v1/face/list` | ✅ Public | -| `GET` | `/api/v1/face/:face_id` | ✅ Public | -| `DELETE` | `/api/v1/face/:face_id` | ✅ Public | -| `POST` | `/api/v1/face/recognize` | ✅ Public | -| `POST` | `/api/v1/face/register` | ✅ Public | -| `POST` | `/api/v1/face/search` | ✅ Public | -| `GET` | `/api/v1/faces/candidates` | ✅ Public | -| `GET` | `/api/v1/files/:file_uuid/faces/:face_id/thumbnail` | ✅ Public | -| `GET` | `/api/v1/signals/unbound` | ✅ Public | -| `GET` | `/api/v1/signals/:uuid/:binding_type/:binding_value/timeline` | ✅ Public | +|------|------|------| +| `GET` | `/api/v1/identities` | Public | +| `POST` | `/api/v1/identity` | Public | +| `GET` | `/api/v1/identity/:identity_uuid` | Public | +| `DELETE` | `/api/v1/identity/:identity_uuid` | Public | +| `GET` | `/api/v1/identity/:identity_uuid/files` | Public | +| `GET` | `/api/v1/identity/:identity_uuid/chunks` | Public | +| `POST` | `/api/v1/identity/:identity_uuid/bind` | Public | +| `POST` | `/api/v1/identity/:identity_uuid/unbind` | Public | +| `POST` | `/api/v1/identity/:from_uuid/mergeinto` | Public | + +## 6. 臉部 (Faces) + +| 方法 | 路徑 | 狀態 | +|------|------|------| +| `GET` | `/api/v1/faces/candidates` | Public | ## 7. 代理人 (Agents) -| 方法 | 路徑 | 狀態 | -|---|---|---| -| `POST` | `/api/v1/agents/translate` | ✅ Public | -| `POST` | `/api/v1/agents/5w1h/analyze` | ✅ Public | -| `POST` | `/api/v1/agents/5w1h/batch` | ✅ Public | -| `GET` | `/api/v1/agents/5w1h/status` | ✅ Public | -| `POST` | `/api/v1/agents/identity/analyze` | ✅ Public | -| `POST` | `/api/v1/agents/identity/suggest` | ✅ Public | -| `GET` | `/api/v1/agents/identity/status` | ✅ Public | -| `POST` | `/api/v1/agents/suggest/merge` | ✅ Public | -## 8. 
狀態與統計 (Stats) | 方法 | 路徑 | 狀態 | -|---|---|---| -| `GET` | `/api/v1/stats/ingest` | ✅ Public | -| `GET` | `/api/v1/stats/sftpgo` | ⚠️ Internal | -| `GET` | `/api/v1/stats/inference` | ⚠️ Internal | -| `POST` | `/api/v1/config/cache` | 🔒 Internal | -| `GET` | `/api/v1/lookup` | ✅ Public | +|------|------|------| +| `POST` | `/api/v1/agents/translate` | Public | +| `POST` | `/api/v1/agents/identity/analyze` | Public | +| `POST` | `/api/v1/agents/identity/suggest` | Public | +| `GET` | `/api/v1/agents/identity/status` | Public | +| `POST` | `/api/v1/agents/suggest/merge` | Public | +| `POST` | `/api/v1/agents/5w1h/analyze` | Public | +| `POST` | `/api/v1/agents/5w1h/batch` | Public | +| `GET` | `/api/v1/agents/5w1h/status` | Public | + +## 8. 狀態與管理 (Stats & Admin) + +| 方法 | 路徑 | 狀態 | +|------|------|------| +| `GET` | `/api/v1/stats/sftpgo` | Internal | +| `GET` | `/api/v1/stats/inference` | Internal | +| `POST` | `/api/v1/config/cache` | Admin | + +--- + +## 變更歷史 + +| 版本 | 日期 | 作者 | 說明 | +|------|------|------|------| +| V1.3 | 2026-05-06 | OpenCode | 新增 `face_thumbnail` ffmpeg 即時裁切端點 + `face_trace/sortby` 端點;portal 修復 hardcoded URL/API key/legacy endpoints | +| V1.1 | 2026-05-01 | OpenCode | Route fixes + arch notes | +| V1.0 | 2026-04 | OpenCode | 初始版本 | diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/CHILD_DETECTION_AGE_BENCHMARK_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/CHILD_DETECTION_AGE_BENCHMARK_V1.0.0.md new file mode 100644 index 0000000..ee2a44c --- /dev/null +++ b/docs_v1.0/API_V1.0.0/INTERNAL/CHILD_DETECTION_AGE_BENCHMARK_V1.0.0.md @@ -0,0 +1,148 @@ +--- +document_type: "experiment_report" +service: "MOMENTRY_CORE" +title: "兒童偵測與年齡估算模型選型報告" +date: "2026-05-06" +version: "V1.0" +status: "completed" +owner: "Warren" +created_by: "OpenCode" +--- + +# 兒童偵測與年齡估算模型選型報告 + +## 1. 實驗目標 + +在 Momentry Core 的 Face Trace 資料中,尋找「非主要演員中的兒童角色」並評估三種年齡估算方案的可行性: +1. **DeepFace AgeNet** — 深度學習年齡估算(MIT License) +2. **Apple Vision 頭肩比** — 用頭寬/肩寬比例推測年齡(系統內建) +3. 
**MiVOLO** — HuggingFace 年齡模型(Apache 2.0) + +## 2. 實驗環境 + +| 項目 | 內容 | +|------|------| +| 測試影片 | Charade (1963), 113 min, 24fps | +| Face detections | 6182 faces, 2347 traces | +| Face 偵測 | Apple Vision `VNDetectFaceRectanglesRequest` (swift_face) | +| Face 嵌入 | CoreML FaceNet512 | +| 取樣間隔 | 60 幀 (2.5 秒) | +| 體態偵測 | Apple Vision `VNDetectHumanBodyPoseRequest` | + +## 3. 實驗方法 + +### 3.1 主要角色年齡估算 + +從 2347 個 trace 中挑選 face_count ≥ 5 的 12 個主要 trace,提取中間幀進行 DeepFace 年齡估算 + Apple Vision 頭肩比計算。 + +### 3.2 非主要角色搜尋 + +搜尋小臉(< 60px)、低 face_count(≤ 2)的 trace,找出群眾演員(可能包含兒童)。 + +### 3.3 滑雪場水槍場景 + +Charade 開場 Megève 滑雪場有一名男孩用水槍噴灑女主角的場景。對此場景進行密集幀掃描(30 幀間隔)搜尋兒童臉。 + +## 4. 模型選型結果 + +### 4.1 模型可用性 + +| 方案 | 可用 | 速度/face | License | 結論 | +|------|------|----------|---------|------| +| **DeepFace AgeNet** | ✓ | 0.2s(快取後) | MIT | **推薦** | +| Apple Vision 年齡 | ✗ | — | 系統內建 | Vision 無年齡 API | +| Apple Vision 頭肩比 | ✓ | 即時 | 系統內建 | 僅成人/兒童分類 | +| MiVOLO | ✗ | — | Apache 2.0 | 模型不可用(HuggingFace 不存在) | + +### 4.2 DeepFace 年齡估算(12 主要角色取樣) + +| Trace | Faces | 出現時間 | 臉寬 | DeepFace 年齡 | 性別 | 情緒 | +|-------|-------|----------|------|-------------|------|------| +| 0 | 45 | 35s | 160px | 35 | Man | sad | +| 24 | 6 | 708s | 100px | 34 | Man | neutral | +| 26 | 5 | 728s | 100px | 31 | Woman | neutral | +| 39 | 14 | 760s | 120px | 30 | Man | sad | +| 43 | 12 | 765s | 120px | 25 | Man | sad | +| 45 | 8 | 775s | — | 36 | Woman | neutral | +| 46 | 9 | 795s | — | 29 | Woman | neutral | +| 48 | 6 | 818s | 140px | 50 | Man | angry | +| 76 | 13 | 908s | — | 29 | Man | sad | +| 87 | 5 | 972s | — | 35 | Man | sad | +| 103 | 7 | 1022s | — | 35 | Woman | neutral | +| 132 | 5 | 1158s | — | 27 | Man | surprise | + +**年齡範圍:25–50 歲,全成人。** + +### 4.3 Apple Vision 頭肩比 + +| Frame | 臉寬 | 肩寬 | 頭肩比 | DeepFace 年齡 | 場景 | +|-------|------|------|--------|-------------|------| +| 840 | 160px | 407px | **0.39** | 35 | 滑雪場(主角) | +| 17460 | 100px | 354px | **0.28** | 31 | 中段場景 | +| 18360 | 120px | 306px | **0.39** | 25 | 中段場景 
| +| 19620 | 140px | 425px | **0.33** | 50 | 最年長角色 | +| 27780 | 110px | 381px | **0.29** | 27 | 後段場景 | + +**頭肩比範圍:0.28–0.39(全成人範圍)。兒童預期 > 0.6。** + +### 4.4 非主要演員(群眾) + +| Trace | Faces | 臉寬 | DeepFace 年齡 | 性別 | 頭肩比 | 場景 | +|-------|-------|------|-------------|------|--------|------| +| 129 | 1 | 42px | 37 | Man | 0.13 | 遠景群眾 | +| 172 | 2 | 51px | 31 | Man | 0.22 | 遠景群眾 | +| 304 | 2 | 47px | 41 | Man | 0.14 | 遠景群眾 | +| 57 | 1 | 52px | 35 | Woman | — | 遠景群眾 | +| 322 | 1 | 52px | 34 | Man | 0.18 | 遠景群眾 | + +**全成人。遠景群眾頭肩比更低 (0.13–0.22),因相機距離影響 > 體型差異。** + +## 5. 水槍場景搜尋結果 + +**成功找到小孩,但無法可靠估算年齡。** + +| 參數 | 數值 | +|------|------| +| 影片 | Charade (1963) | +| 場景 | Megève 滑雪場戶外餐廳 | +| 時間 | Frame 2450 (102 秒 / 1:42) | +| 臉部尺寸 | **29 × 29 px** | +| Swift Face 偵測 | ✓ 已偵測(trace_id 未分配,單幀) | +| DeepFace 年齡 | 33 Man ❌ **誤判**(解析度不足) | +| Apple Vision 頭肩比 | 無法計算(身體被遮擋) | + +### 誤判原因 + +29×29px 遠低於年齡估算模型的最低解析度需求(一般需 ≥ 50×50px)。在遠景中,兒童的臉太小,神經網路無法提取足夠的年齡特徵,導致: +- DeepFace 將兒童誤判為成人 +- 頭肩比受距離影響大於實際年齡 + +## 6. 結論與建議 + +| 發現 | 說明 | +|------|------| +| Charade 無兒童主要角色 | 全卡司成人,DeepFace 年齡範圍 25–50 | +| 水槍小孩已找到 | Frame 2450,102 秒,但 29px 太小無法估齡 | +| DeepFace 可行 | MIT license,0.2s/face,適合 ≥ 50px 臉部 | +| Apple Vision 頭肩比 | 僅適合作近景成人/兒童分類(非精確年齡) | +| MiVOLO | 不可用(HuggingFace 模型不存在) | + +### 建議 + +1. **整合 DeepFace** 年齡估算入 `face_processor.py` pipeline,對 ≥ 50px 的臉進行年齡標記 +2. **保留頭肩比** 做為輔助驗證(成人/兒童二元分類) +3. **降低取樣間隔** 從 60 幀降至 10–15 幀以捕捉更多短暫出現的角色 +4. 
**若需測試兒童年齡**:使用片庫中的 `Alice Comedies (1926)`,該片有近景小女孩(Virginia Davis,6–8 歲),臉部可達 150px+ + +--- + +## 附錄:測試資料 + +| 檔案 | 路徑 | +|------|------| +| DeepFace 年齡 JSON | `output_dev/experiments/age_benchmark/age_benchmark_report.json` | +| 頭肩比 JSON | `output_dev/experiments/head_shoulder/head_shoulder_report.json` | +| 水槍場景幀 | `output_dev/experiments/head_shoulder/child_f2450.jpg` | +| 年齡基準腳本 | `scripts/age_benchmark.py` | +| 頭肩比腳本 | `scripts/head_shoulder_quick.py` | +| Face trace 排序 API | `POST /api/v1/file/:file_uuid/face_trace/sortby` | diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/CHUNK_DEFINITION_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/CHUNK_DEFINITION_V1.0.0.md index fa8ec73..2fe9b83 100644 --- a/docs_v1.0/API_V1.0.0/INTERNAL/CHUNK_DEFINITION_V1.0.0.md +++ b/docs_v1.0/API_V1.0.0/INTERNAL/CHUNK_DEFINITION_V1.0.0.md @@ -1,8 +1,8 @@ --- document_type: "spec" service: "MOMENTRY_CORE" -title: "Chunk 定義 V1.0.0" -date: "2026-05-01" +title: "Story Parent-Child Chunk Rules V1.0" +date: "2026-05-05" version: "V1.0" status: "active" owner: "Warren" @@ -11,188 +11,288 @@ tags: - "momentry" - "core" - "chunk" - - "v1.0.0" - - "chunk-type" - - "pre-chunk" + - "story" - "parent-child" - - "data-structure" + - "v1.0" ai_query_hints: - - "chunk 的定義與結構" - - "pre_chunk 與 chunk 的關係" - - "parent_chunk 與 child_chunk 的關係" - - "ChunkType 包含哪些類型(Sentence/Cut/Visual/Trace/Story)" - - "chunk 的巢狀結構與 Rule 組合規則" - - "chunk 如何對應到 file_uuid 與幀區間" - - "chunk 的搜尋用途與向量儲存方式" - - "chunk 與 pre_chunk 的雙層資料架構" + - "Story parent-child chunk generation rules" + - "CUT scene → parent chunk, ASR sentence → child chunk" + - "boundary overlap: partial match enriches child context" + - "parent_summary template + child_summary template" + - "children per parent distribution" related_documents: - - "PROCESSOR_SELECTION_V1.0.0.md" - - "VECTOR_SPEC_V1.0.0.md" - - "PROCESSORS/ASR_V1.0.0.md" - - "PROCESSORS/CUT_V1.0.0.md" - - "PROCESSORS/FACE_V1.0.0.md" + - "../CHUNK_DEFINITION_V1.0.0.md" + - 
"../DUAL_EMBEDDING_PIPELINE_V1.0.0.md" + - "../PROCESSORS/ASR_V1.0.0.md" + - "../PROCESSORS/CUT_V1.0.0.md" --- -# Chunk 定義 V1.0.0 +# Story Parent-Child Chunk Rules V1.0 -| 項目 | 內容 | +## 核心概念 + +- **Parent chunk** = CUT 場景邊界內的所有對話 → 一個場景敘述 +- **Child chunk** = 單一 ASR sentence → 一句對白 +- **Boundary overlap** = 場景邊界重疊的句子 → 同時歸屬前後 parent + +## 匹配規則 + +### Rule 1: Fully-Contained Matching + +``` +ASR sentence 完全在 CUT 場景時間範圍內 + → seg.start >= scene.start_time AND seg.end <= scene.end_time + → 加入該 scene 的 children 列表 +``` + +### Rule 2: Boundary Overlap (所有 parent) + +``` +對於每個 parent chunk(V2.1:即使沒有任何 fully-contained child): + → 找出與 scene 時間範圍有 partial overlap 的 ASR sentence + → seg.start < scene.end_time AND seg.end > scene.start_time + → AND 未被 Rule 1 匹配(不是 fully-contained) + → 加入該 scene 的 children 列表 +``` + +邊界 overlap 讓 child chunk 可以同時歸屬前後兩個 parent,提供更多上下文。 + +### Rule 3: Scene Filter(僅 V1.0;V2.1 已移除此過濾) + +``` +CUT scene duration < 1s → 跳過(場景太短無意義) +``` + +## Parent Summary 模板 + +``` +[{start}s-{end}s, {duration}s] +Cast: {character_list} +Total dialogue: N lines, W words +Speakers: {name} (N lines): "sample text..." 
+``` + +## Child Summary 模板 + +``` +[{start}s-{end}s] {speaker_name}: "{asr_text}" +``` + +### Embedding Target + +Child summary text → Ollama nomic-embed-text-v2-moe → 768D vector → pgvector + +## 數據實例:Charade (1963) — 長片 113min + +### 輸入 + +| 來源 | 數量 | 說明 | +|------|------|------| +| ASR segments | **1,629** | Whisper small 英文字幕 | +| ASR with text | 1,629 | 全部有文字 | +| ASR total duration | 6,760s (113 min) | | +| CUT scenes | **1,331** | PySceneDetect 場景切割 | +| CUT scenes ≥ 1s | 1,200 | 過濾後有效場景 | +| CUT mean duration | 5.2s | 平均場景長度 | +| CUT scene gap (unmatched) | 131 | < 1s 場景被過濾 | + +### 輸出 (V2.1 — boundary overlap for ALL scenes, duration filter removed) + +| 指標 | 數值 | |------|------| -| 建立者 | OpenCode | -| 建立時間 | 2026-05-01 | -| 文件版本 | V1.0 | +| **Parent chunks** | **1,313** (all CUT scenes ≥ 0s) | +| **Child chunks** (total in DB) | **2,927** (1,629 unique + 1,298 overlaps) | +| **Unique children** | **1,629** (100% ASR coverage) | +| DB duplicates (shared) | 1,298 (ON CONFLICT merge) | +| Children per parent | 1 ~ 43, avg **2.2** | +| Unmatched | **0** | -## 名詞定義 - -| 名詞 | 定義 | 範例 | -|------|------|------| -| **Processor JSON** | Processor 腳本的第一層產出檔案 | `384b0ff44aaaa1f14cb2cd63b3fea966.face.json` | -| **pre_chunk** | 從 Processor JSON 匯入 DB 的最低層元件(`pre_chunks` 表) | 單幀 face detection、單句 ASR text | -| **chunk** | 可搜尋單位(`chunks` 表),由 Rule 組合 pre_chunks 產出,`start_frame` ~ `end_frame` 定義區間 | sentence chunk, visual chunk, scene chunk | -| **parent_chunk** | chunk 的一種,包含 `child_chunk_ids`,其區間涵蓋多個 child_chunks,由 Summary Agent 產出統整描述 | scene chunk, story chunk | -| **child_chunk** | chunk 的一種,被 parent_chunk 參照為子元素 | sentence chunk, visual chunk | - ---- - -## Chunk 結構 - -```rust -Chunk { - uuid: String, // file_uuid (32-char hex) - chunk_id: String, // "{uuid}_{chunk_index}" - chunk_index: u32, // 0-based 序號 - chunk_type: ChunkType, // Sentence | Cut | Visual | Trace | Story - rule: ChunkRule, // Rule1 (直接組合) | Rule2 (聚合) - start_frame: i64, // 起始幀(0-based,唯一時間參考) - 
end_frame: i64, // 結束幀(exclusive) - fps: f64, // 該區間的 fps - content: JSON, // 主要內容 - text_content: Option, // 純文字內容(供搜尋用) - metadata: Option, // speaker, face_ids, yolo_objects 等 - pre_chunk_ids: Vec, // 來源 pre_chunks(原始元件追溯) - parent_chunk_id: Option, // 父 chunk ID(如存在) - child_chunk_ids: Vec, // 子 chunk IDs(如為 parent_chunk) - vector_id: Option, // 向量儲存參考 -} -``` - ---- - -## ChunkType - -| 類型 | 說明 | 範例 | -|------|------|------| -| `Sentence` | ASR 句子 chunk | 一句話對應一個 chunk | -| `Cut` | 場景切換 chunk | PySceneDetect 輸出的場景邊界 | -| `Visual` | 視覺物件 chunk | YOLO/OCR/Face/Pose 聚合 | -| `Trace` | 追蹤 chunk | face_trace / yolo_trace | -| `Story` | 敘事 chunk(parent) | 5W1H Agent 產出的統整描述 | - ---- - -## Chunk 特性 - -- **區間定義**: `start_frame` / `end_frame`(frames 為唯一時間座標) -- **可重疊**: 不同類型的 chunk 可以覆蓋相同區間 -- **可不連續**: chunk 之間不需要連續 -- **巢狀**: parent_chunk 包含 child_chunk_ids,子區間不須填滿父區間 -- **單幀 chunk**: `start_frame == end_frame`(如 frame-level detection) - ---- - -## 資料流 +### 分佈 ``` -Processor JSON ({file_uuid}.{type}.json) - │ - ▼ 匯入 -pre_chunks (原始元件, start_frame / end_frame / data) - │ - ▼ Rule 組合 (Rule1 / Rule2 / Rule3) -chunks (可搜尋單位) - ├── child_chunk (基礎搜尋單位) - │ └── 5W1H: 該 chunk 的摘要描述(3~5 句話) - │ - └── parent_chunk (較大區間, Summary Agent 產出) - ├── child_chunk_ids: [內含的所有 child_chunks] - └── summary: (child_chunks 的 5W1H + parent_chunk 補充描述) - via Summary Agent (如 5W1H Agent) - summary 為 3~5 句話,統整區間內所有內容 - 用於 embedding 成向量,確保搜尋時涵蓋足夠語意 +Children per parent: + 1: 128 parents (獨白/短場景) + 2: 58 parents + 3: 0 parents ← 邊界 overlap 後 3 被 2/4 吸收 + 4-9: 64 parents (中等對話場景) + 10-27: 50 parents (多人對話場景) ``` ---- +### 已匹配率 -## 與 pre_chunk 的關係 +| 指標 | 數值 | +|------|------| +| ASR unmatched | **0** (V2.1: boundary overlap for ALL scenes) | +| 已匹配率 | **100%** | -| 層級 | 產生方式 | 目的 | -|------|----------|------| -| pre_chunk | 直接從 Processor JSON 匯入 | 保留原始資料,供 Rule 加工 | -| chunk | Rule 組合 pre_chunks | 成為可搜尋單位 | -| child_chunk | chunk 的一種 | 基礎搜尋目標 | -| parent_chunk | Summary Agent 產出 | 補足單一 
child_chunk 資訊量不足 | +## 輸入/輸出範例 ---- +### Big Parent(多子女) -## 範例 +**輸入原始數據**: +``` +CUT scene [2783s-2847s, 65s] +27 ASR sentences, all spoken by Audrey Hepburn + Cary Grant + SPEAKER_2 +``` -### Sentence Chunk (child_chunk) +**輸出 Parent Summary**: +``` +[2783s-2847s, 65s] Cast: Audrey Hepburn, Cary Grant, SPEAKER_2. +Total dialogue: 27 lines, 143 words. +``` + +**輸出 Child Summaries**(embedding target): +``` +[2784s-2786s] Audrey Hepburn: "they stole it" +[2786s-2788s] Audrey Hepburn: "by burying it" +[2788s-2790s] Audrey Hepburn: "then reporting the Germans had captured it" +... (27 total) +``` + +**Metadata 信度**(隨 parent/child 傳遞): ```json +// Parent metadata { - "chunk_id": "384b0ff44aaaa1f14cb2cd63b3fea966_42", - "chunk_index": 42, - "chunk_type": "sentence", - "rule": "rule_1", - "start_frame": 1260, - "end_frame": 1350, - "fps": 29.97, - "content": { - "text": "今天天氣很好,我們決定去公園走走。", - "speaker": "SPEAKER_00" - }, - "text_content": "今天天氣很好,我們決定去公園走走。", - "metadata": { - "speaker": "SPEAKER_00", - "face_ids": ["face_42", "face_43"], - "5w1h": "講者 SPEAKER_00 在室內提到今天天氣很好。他建議大家一起到公園散步。同伴們同意這個提議。大家開始準備出發。整個對話顯示團隊氣氛融洽。" - }, - "pre_chunk_ids": [101, 102, 103], - "parent_chunk_id": "384b0ff44aaaa1f14cb2cd63b3fea966_scene_3" + "speaker_confidence": { "Audrey Hepburn": 0.85, "Cary Grant": 0.64 }, + "face_confidence": { "Audrey Hepburn": 0.60, "Cary Grant": 0.64 }, + "yolo_objects": { "car": 0.72, "bottle": 0.55, "chair": 0.68 } +} + +// Child metadata +{ + "speaker_name": "Audrey Hepburn", + "speaker_confidence": 0.85, // MAR lip: 57% events during SPEAKER_1 + "face_confidence": 0.60, // clustering composite + "asr_confidence": 0.92 // Whisper confidence } ``` -### Scene Chunk (parent_chunk) +### 1:1 Parent(單子女) -```json -{ - "chunk_id": "384b0ff44aaaa1f14cb2cd63b3fea966_scene_3", - "chunk_index": 3, - "chunk_type": "cut", - "rule": "rule_3", - "start_frame": 1200, - "end_frame": 1800, - "fps": 29.97, - "content": { - "scene_number": 3, - "scene_type": "dialogue" - }, - 
"text_content": "今天天氣很好,我們決定去公園走走。之後我們在公園裡散步,看到很多花。", - "metadata": { - "summary": "講者和同伴在室內討論天氣狀況,提到今天陽光明媚。他們決定到附近的公園散步享受好天氣。抵達公園後,他們沿著步道行走,觀察到許多盛開的花朵。其中一人用手機拍攝了花朵的照片。整個對話氣氛輕鬆愉快。" - }, - "pre_chunk_ids": [98, 99, 100], - "child_chunk_ids": [ - "384b0ff44aaaa1f14cb2cd63b3fea966_42", - "384b0ff44aaaa1f14cb2cd63b3fea966_43", - "384b0ff44aaaa1f14cb2cd63b3fea966_44" - ] -} +**輸入原始數據**: +``` +CUT scene [304s-318s, 14s] +1 ASR sentence, spoken by Cary Grant alone ``` ---- +**輸出 Parent Summary**: +``` +[304s-318s, 14s] Cast: Cary Grant. +Total dialogue: 1 lines, 13 words. +``` + +**輸出 Child Summary**(embedding target): +``` +[309s-317s] Cary Grant: "Sylvia I'm getting a divorce what from Charles he's the only husband I" +``` + +## 與 LLM Pipeline 的關係 + +``` +Pipeline 1 (Story): template summary → DB + embedding +Pipeline 2 (LLM): LLM summary → DB + embedding (future) + +chunk_type: + story_parent / story_child ← Pipeline 1 + llm_parent / llm_child ← Pipeline 2 (future) +``` ## 版本歷史 -| 版本 | 日期 | 目的 | 操作人 | 工具/模型 | -|------|------|------|--------|-----------| -| V1.0 | 2026-05-01 | 初始版本 | OpenCode | deepseek-chat | +| 版本 | 日期 | 變更 | +|------|------|------| +| V1.0 | 2026-05-05 | 初始規則:fully-contained + boundary overlap | +| V2.1 | 2026-05-05 | 移除 duration filter,boundary overlap 對所有場景(含空場景)。100% ASR coverage。Speaker mapping 從 DB 動態讀取。 | + +## Charade 1963 統計分析記錄 + +### 影片資料 + +| 指標 | 值 | +|------|-----| +| 片長 | 113 分鐘 | +| 總幀數 | 412,343 | +| FPS | 59.94 | +| 解析度 | 1920×1080 | + +### 處理器產出 + +| Processor | 輸出行數 | 說明 | +|-----------|---------|------| +| CUT | 1,331 scenes | 平均 5.2s/scene,min 0.2s,max 64.5s | +| ASR | 1,629 segments | Whisper small,113 min total | +| ASRX | 10 speakers | SPEAKER_0/1 為主要角色 | +| Face | 4,008 frames, 6,182 faces | sample=60, Vision+CoreML ANE | +| Face Trace | 6,182 detections, 2,347 traces | IoU+embedding tracking | +| Identity | 677 traces → 7 identities | 99.4% coverage, MAR lip speaker binding | +| YOLO | 328,800 frames, 57 object classes | 
CoreML ANE | + +### Matching 迭代記錄 + +#### Iteration 1: Fully-contained only, >= 1s scene filter + +``` +Rule: seg.start >= scene.start AND seg.end <= scene.end +Scene filter: duration >= 1s (131 scenes filtered out) + +Result: 990/1629 (61%) matched + 454 unmatched, 74 in filtered scenes + Only scenes with children got boundary overlaps +``` + +#### Iteration 2: Add boundary overlap for scenes with >= 3 children + +``` +Rule: For scenes with >= 3 children, add partial overlaps + +Result: 1,210 children (+220 partial) + Still 454 unmatched (boundary overlap only for rich scenes) +``` + +#### Iteration 3: Remove duration filter + +``` +Rule: Remove >=1s scene filter + +Result: 1,496 unique children (92% coverage) + 133 unmatched + Root cause: boundary overlap still gated by "if children:" +``` + +#### Iteration 4: Boundary overlap for ALL scenes (regardless of children) + +``` +Rule: Move boundary overlap code outside "if children:" guard + All 1,331 scenes participate + +Result: 1,629 unique children (100% coverage) + 1,313 parents (all scenes) + 2,927 total children (1,629 unique + 1,298 overlaps) +``` + +### 關鍵決策 + +| 決策 | 理由 | 影響 | +|------|------|------| +| 移除 duration filter | 131 scenes <1s 會漏掉句子 | +24 parents, +321 children | +| 移除 children guard | 空場景也要加 boundary children | +133 children (100%) | +| 用 overlap 而非 fully-contained | ASR/CUT 時間邊界不對齊 | 避免 565 sentences orphan | +| Partial overlaps 存兩次 | 邊界句可歸屬兩個 parent | 1,298 duplicates via ON CONFLICT | +| Speaker map 從 DB 讀 | 不再 hardcode 演員名 | 通用化任何影片 | + +### 效能指標 + +| 指標 | 值 | +|------|-----| +| Story 生成時間 | < 1s (template, instant) | +| Embedding 時間 (Ollama) | ~2 min for 1,629 chunks | +| Qdrant sync 時間 | ~3 min for rule1, ~1 min for story | +| BM25 search 時間 | < 10ms per query | + +### 教學要點 + +1. **時間邊界不對齊是常態**:ASR(語音邊界)與 CUT(視覺邊界)用不同演算法,永遠不會完美對齊。overlap matching 是必要設計。 +2. **Boundary overlap 需對所有場景生效**:不能只限有 children 的場景,否則會產生 orphan sentences。 +3. 
**ON CONFLICT merge**:同一 sentence 出現在兩個 parent 時,DB 層面用最後一個 parent。如需多對多關係,需 junction table。 +4. **Hardcoded 到 Dynamic**:speaker map 從 hardcode → DB-driven 是通用化的關鍵一步。 diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/CLASS_SYSTEM_DESIGN_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/CLASS_SYSTEM_DESIGN_V1.0.0.md new file mode 100644 index 0000000..7806380 --- /dev/null +++ b/docs_v1.0/API_V1.0.0/INTERNAL/CLASS_SYSTEM_DESIGN_V1.0.0.md @@ -0,0 +1,192 @@ +--- +document_type: "design" +service: "MOMENTRY_CORE" +title: "Class 分類系統設計 V1.0" +date: "2026-05-05" +version: "V1.0" +status: "design" +owner: "Warren" +created_by: "OpenCode" +tags: + - "momentry" + - "core" + - "class" + - "taxonomy" + - "design" + - "v1.0" +ai_query_hints: + - "Class 分層分類系統設計" + - "參照 IPC (國際專利分類) 及 HS (海關稅則)" + - "編碼格式: {section}-{NNNN}" + - "用於 identity 多層分類、快速定位" +related_documents: + - "../DATA_SCHEMA_FILE_IDENTITY_V1.0.0.md" + - "../UUID_ENCODING_RULES_V1.0.0.md" +--- + +# Class 分類系統設計 V1.0 + +> 狀態:設計階段,尚未實施 + +## 設計參考 + +IPC(國際專利分類)與 HS(海關稅則)。 + +共通原則:**層級碼**、**數字越長越精細**、**全球通用**、**可無限擴展**。 + +## 設計目標 + +- IPC/HS 式的 hierarchical code → **快速定位** +- Tag 式的 multi-label 使用 → **靈活分類** +- 同一 entity 可擁有多條 class path +- 新增分類只需 INSERT,無 migration + +``` +Cary Grant + → P-0201 (演員/主角) + → T-0102 (1960s) + → S-0200 (場景/戶外 — 他在片中出現的場景) + +Ferrari 250 GT + → O-0101 (汽車) + → B-0300 (汽車品牌/Ferrari) + → T-0102 (1960s) +``` +## 編碼格式 + +``` +{section}-{NNNN} + │ └── 4 digits,每 2 digits 一層 + └───────── 1 char section prefix +``` + +| 層級 | 範例 | 意義 | +|------|------|------| +| `P-0000` | top section | 人物 | +| `P-0200` | subclass | 人物 → 演員 | +| `P-0201` | group | 人物 → 演員 → 主角 | +| `P-0202` | group | 人物 → 演員 → 配角 | + +層級判斷:`code.length`。`P-` = section,`P-02` = subclass,`P-0201` = group。 + +### Section 定義 + +| Section | 名稱 | 範疇 | 預留 | +|---------|------|------|------| +| `P` | 人物 | 演員、導演、公眾人物、虛構角色、運動員... | 01-99 | +| `O` | 物件 | 交通工具、家具、武器、工具、電子產品... | 01-99 | +| `B` | 品牌/組織 | 時尚、科技、汽車品牌、政府機構、NGO... 
| 01-99 | +| `C` | 概念/抽象 | 情感、思想、事件、主題、風格... | 01-99 | +| `A` | 生物 | 動物、植物、真菌... | 01-99 | +| `S` | 場景/地點 | 室內、戶外、城市、自然地標、建築內部... | 01-99 | +| `E` | 環境/自然 | 天氣、地形、天象、自然災害... | 01-99 | +| `M` | 音樂/聲音 | 樂器、音樂類型、自然聲音、人工聲音... | 01-99 | +| `L` | 語言/文字 | 語言、方言、書寫系統、符號... | 01-99 | +| `T` | 時間/時期 | 年代、季節、節日、歷史時期... | 01-99 | +| `F` | 檔案類型 | 影片格式、文件類型、圖片格式... | 01-99 | +| `D` | 領域/學科 | 科學、藝術、體育、政治、經濟... | 01-99 | + +12 個 Section,各 99 subclass × 99 group = ~117K 分類槽位。可隨時新增 Section。 + +## 初始 Class Tree + +``` +P-0000 人物 +├── P-0100 公眾人物 +├── P-0200 演員 +│ ├── P-0201 主角 +│ └── P-0202 配角 +├── P-0300 導演 +├── P-0400 虛構角色 +└── P-9900 其他人物 + +O-0000 物件 +├── O-0100 交通工具 +│ ├── O-0101 汽車 +│ ├── O-0102 船 +│ └── O-0103 飛機 +├── O-0200 建築 +├── O-0300 家具 +└── O-9900 其他物件 + +B-0000 品牌 +├── B-0100 時尚 +├── B-0200 科技 +└── B-9900 其他品牌 + +C-0000 概念 +├── C-0100 情感 +├── C-0200 思想 +└── C-9900 其他概念 +``` + +## Table + +```sql +CREATE TABLE classes ( + code VARCHAR(8) PRIMARY KEY, -- P-0201 + name TEXT NOT NULL, -- 主角 + description TEXT, + created_at TIMESTAMPTZ DEFAULT now() +); + +-- 多對多:同一 identity 可有多個 class code(如 tag 使用) +CREATE TABLE identity_classes ( + identity_id INTEGER REFERENCES identities(id), + class_code VARCHAR(8) REFERENCES classes(code), + confidence REAL DEFAULT 1.0, + source VARCHAR(20), -- which agent classified + PRIMARY KEY (identity_id, class_code) +); +``` + +## Query 範例 + +```sql +-- 查某 identity 的所有 class +SELECT c.code, c.name +FROM identity_classes ic +JOIN classes c ON ic.class_code = c.code +WHERE ic.identity_id = 8; + +-- 查所有屬於 "演員" (P-0200) 的 identity +SELECT i.name +FROM identity_classes ic +JOIN identities i ON ic.identity_id = i.id +WHERE ic.class_code LIKE 'P-02%'; + +-- 查某 section 下的所有 identity +SELECT DISTINCT i.name +FROM identity_classes ic +JOIN identities i ON ic.identity_id = i.id +WHERE ic.class_code LIKE 'P-%'; +``` + +## 擴展方式 + +1. 新增 leaf class:`INSERT INTO classes VALUES ('P-0203', '配音員')` — P-02 底下的新 group +2. 
新增 subclass:`INSERT INTO classes VALUES ('P-0500', '製作團隊')` — P 底下的新 subclass +3. 新增 section:`INSERT INTO classes VALUES ('X-0000', '新分類')` — 全新 top-level + +無需 migration,insert 即可。 + +## 版本歷史 + +| 版本 | 日期 | 狀態 | +|------|------|------| +| V1.0 | 2026-05-05 | 設計階段 | + +## Future: Class-Based Search + +實施 class 系統後,search API 可加入 class filter 提升命中率: + +``` +GET /api/v1/search?q=car&class=O-0101 + → 只搜被分類為「汽車」的內容,過濾 "care", "car accident", "car wash" + +GET /api/v1/search/hybrid?q=divorce&class=P-0200 + → 只搜演員說出的 "divorce",排除旁白、字幕 + +GET /api/v1/search/universal?class=T-0102 + → 搜所有 1960s 相關內容 +``` diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/DATA_SCHEMA_FILE_IDENTITY_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/DATA_SCHEMA_FILE_IDENTITY_V1.0.0.md new file mode 100644 index 0000000..6bf5fdd --- /dev/null +++ b/docs_v1.0/API_V1.0.0/INTERNAL/DATA_SCHEMA_FILE_IDENTITY_V1.0.0.md @@ -0,0 +1,328 @@ +--- +document_type: "spec" +service: "MOMENTRY_CORE" +title: "Data Schema: File & Identity V1.0" +date: "2026-05-05" +version: "V1.0" +status: "active" +owner: "Warren" +created_by: "OpenCode" +tags: + - "momentry" + - "core" + - "schema" + - "file" + - "identity" + - "v1.0" +ai_query_hints: + - "File & Identity DB schema" + - "face_detections.identity_id direct FK" + - "identity multi-modal: face + voice + TMDb + manual" +related_documents: + - "../DUAL_EMBEDDING_PIPELINE_V1.0.0.md" + - "../UUID_ENCODING_RULES_V1.0.0.md" +--- + +# Data Schema: File & Identity V1.0 + +## 1. 
File Schema + +### videos / files + +| Column | Type | 說明 | +|--------|------|------| +| `id` | SERIAL PK | | +| `file_uuid` | VARCHAR(32) | Birth UUID | +| `file_path` | VARCHAR(512) | 檔案完整路徑 | +| `file_name` | VARCHAR(256) | | +| `probe_json` | JSONB | ffprobe raw output | +| `status` | VARCHAR(20) | ready / processing / completed | +| `processing_status` | JSONB | per-processor progress | +| `total_frames` | INTEGER | | +| `fps` | DOUBLE | | +| `duration` | DOUBLE | 影片長度(秒) | +| `width` / `height` | INTEGER | 解析度 | +| `registration_time` | TIMESTAMP | 註冊時間 | + +### face_detections (per-file face data) + +| Column | Type | 說明 | +|--------|------|------| +| `id` | SERIAL PK | | +| `file_uuid` | VARCHAR(32) | → videos.file_uuid | +| `frame_number` | BIGINT | 幀號 | +| `face_id` | VARCHAR(64) | per-file face identifier | +| `trace_id` | INTEGER | 跨幀追蹤 ID | +| `x, y, width, height` | INTEGER | bbox | +| `confidence` | REAL | 偵測信度 | +| `embedding` | REAL[] | 512D CoreML FaceNet | +| `identity_id` | INTEGER | → identities.id (V4.0 direct FK) | + +### chunks (per-file parent/child chunks) + +| Column | Type | 說明 | +|--------|------|------| +| `id` | SERIAL PK | | +| `chunk_id` / `old_chunk_id` | VARCHAR | chunk identifier | +| `file_uuid` | VARCHAR(32) | → videos.file_uuid | +| `chunk_type` | VARCHAR(32) | story_parent / story_child / rule1_sentence | +| `chunk_index` | INTEGER | per-file ordering | +| `start_time` / `end_time` | DOUBLE | time range | +| `content` | JSONB | metadata | +| `text_content` | TEXT | summary text → embedding target | +| `embedding` | VECTOR | pgvector 768D | +| `search_vector` | TSVECTOR | BM25 full-text | +| `parent_chunk_id` | VARCHAR | → chunks.chunk_id | + +## 2. 
Identity Schema + +### 概念 + +Identity 是可命名的任何識別標的,不限於人。 + +| identity_type | 範例 | 識別模型 | +|--------------|------|---------| +| `people` | Cary Grant, Audrey Hepburn | face, voice, name | +| `animal` | 電影中的狗、馬 | face, body, sound | +| `object` | 特定道具、車輛 | yolo, image embedding | +| `plant` | 場景中的特定植物 | image embedding | +| `building` | 艾菲爾鐵塔、特定建築 | image embedding, OCR | +| `place` | Paris, 咖啡廳 | scene classification | +| `concept` | "離婚", "復仇" | text embedding | +| `brand` | Coca-Cola | OCR, logo detection | + +每種 identity_type 可以使用不同的識別模型組合。 + +### 識別模型 + +| model | dimension | source | 適用 identity_type | +|-------|-----------|--------|-------------------| +| `face` | 512D | CoreML FaceNet | people, animal | +| `voice` | 192D | SpeechBrain ECAPA-TDNN | people | +| `text` | 768D | Ollama nomic-embed | concept, place | +| `image` | 768D | — (future) | object, building, plant | +| `yolo_class` | — | YOLO label | object | + +### Table + +```sql +CREATE TABLE identities ( + id SERIAL PRIMARY KEY, + uuid UUID, -- 32-char UUIDv5 (source:external_id) + name TEXT NOT NULL UNIQUE, + identity_type VARCHAR(30) DEFAULT 'people', -- people/animal/object/building/place/concept + source VARCHAR(20) DEFAULT 'manual', -- tmdb/manual/face_cluster/yolo + status VARCHAR(20) DEFAULT 'pending', + + -- Reference vectors per model (in JSONB for extensibility) + reference_vectors JSONB DEFAULT '{}', + -- { + -- "face": [{"vec":[...], "pose":"frontal", "source":"video_trace"}], + -- "voice": [{"vec":[...], "speaker_id":"SPEAKER_0"}], + -- "image": [{"vec":[...], "source":"manual"}] + -- } + + -- Legacy columns (migrating to reference_vectors) + face_embedding VECTOR(512), + voice_embedding VECTOR(192), + identity_embedding VECTOR(768), + + reference_data JSONB DEFAULT '{}', + metadata JSONB DEFAULT '{}', + tmdb_id INTEGER, + tmdb_profile TEXT, + created_at TIMESTAMP DEFAULT now() +); +``` + +### 彈性設計 + +現有 `face_embedding` / `voice_embedding` column 維持向下相容。 +未來全部移入 `reference_vectors` 
JSONB,支援任意 model × 多個 reference vectors: + +```json +{ + "reference_vectors": { + "face": [ + {"vec": [0.1, 0.2, ...], "pose": "frontal", "source": "video_trace_0", "confidence": 0.95}, + {"vec": [0.3, 0.4, ...], "pose": "profile", "source": "video_trace_0", "confidence": 0.88} + ], + "voice": [ + {"vec": [0.5, 0.6, ...], "speaker_id": "SPEAKER_0", "source": "asrx"} + ], + "image": [] + } +} +``` + +### 識別 Agent 架構 + +每個識別模型由對應的 Agent 負責。Identity 本身只存 reference vectors,不綁定特定 model。 + +``` + ┌─────────────────────────┐ + │ identities │ + │ name, type, source │ + │ reference_vectors (JSONB)│ + └──────────┬──────────────┘ + │ + ┌────────────────────┼────────────────────┐ + │ │ │ + ┌────▼────┐ ┌────▼────┐ ┌────▼────┐ + │FaceAgent│ │VoiceAgent│ │ImageAgent│ + │ │ │ │ │ (future) │ + │ input: │ │ input: │ │ input: │ + │ face_ │ │ asrx │ │ image │ + │ detect │ │ segments│ │ features│ + │ ions │ │ │ │ │ + │ │ │ │ │ │ + │ output: │ │ output: │ │ output: │ + │ face → │ │ voice → │ │ img → │ + │ identity│ │ identity│ │ identity│ + └─────────┘ └─────────┘ └─────────┘ +``` + +### Agent 定義 + +| Agent | 輸入 | 模型 | 輸出 | 狀態 | +|-------|------|------|------|------| +| **FaceAgent** | `face_detections` | CoreML FaceNet 512D | `identity_id` on face_detections | ✅ | +| **VoiceAgent** | ASRX segments | ECAPA-TDNN 192D + MAR lip | `metadata.speaker_id` | ✅ | +| **ImageAgent** | — | — | — | ⬜ future | +| **YoloAgent** | YOLO detections | — | object → identity | ⬜ future | +| **TextAgent** | chunk text | nomic-embed 768D | concept → identity | ⬜ future | + +### Agent 運作模式 + +``` +1. Agent 讀取 raw detections(face / voice / yolo) +2. 對比 identities.reference_vectors[model] +3. 相似度達標 → bind to existing identity +4. 不達標 → create new identity +5. 
更新 identities.reference_vectors(enrich reference set) +``` + +同一個 identity 可以被多個 Agent 同時更新。例如: +- FaceAgent 寫入 `reference_vectors.face` +- VoiceAgent 寫入 `reference_vectors.voice` +- 兩者指向同一個 identity (Cary Grant) + +### Face → Identity 綁定(V4.0) + +``` +face_detections.identity_id ──── FK ────→ identities.id +``` + +Direct FK。不需要 intermediate table。操作 API: + +``` +POST /api/v1/identities/bind + { "file_uuid": "...", "face_id": "face_1", "identity_uuid": "..." } + → UPDATE face_detections SET identity_id = X + +POST /api/v1/identities/unbind + { "file_uuid": "...", "face_id": "face_1" } + → UPDATE face_detections SET identity_id = NULL +``` + +### Voice/Speaker → Identity 綁定 + +透過 `identities.metadata.speaker_id`: + +``` +identities.metadata = {"speaker_id": "SPEAKER_0", "speaker_confidence": 0.85} +``` + +Voice embedding 直接寫入 `identities.voice_embedding`。 + +## 3. File-Identity 關聯 + +``` +file (1a04db97...) identity (Cary Grant) +│ │ +├── face_detections │ +│ ├── face_id="face_1" │ +│ │ identity_id ──────────────────┤ +│ ├── face_id="face_2" │ +│ │ identity_id ──────────────────┤ +│ └── face_id="face_3" │ +│ identity_id = NULL │ ← unbound +│ │ +├── chunks │ +│ ├── story_parent │ +│ │ content.metadata.characters │ +│ │ = ["Cary Grant", ...] │ +│ └── story_child │ +│ content.metadata.speaker │ +│ = "Cary Grant" │ +│ │ +└── asrx.json │ + └── segments[].speaker_id │ + = "SPEAKER_0" ────────────────┘ + +file_identities (N:N junction, if needed) + file_uuid → identity_uuid +``` + +## 4. 
Class 分層分類(參照 IPC + HS) + +### 設計參考 + +IPC(國際專利分類)與 HS(海關稅則)的分層編碼體系。 + +| 標準 | 結構 | +|------|------| +| **IPC** | Section(A-H) → Class(2digits) → Subclass → Group/NNN | +| **HS** | Section → Chapter(2digits) → Heading(4digits) → Subheading(6digits) | + +共通原則:**層級碼**、**數字越長越精細**、**全球通用**。 + +### 編碼格式 + +``` +{SECTION}-{NNN}-{NNN}-{NNN} + │ │ │ └─ subgroup + │ │ └──────── main_group + │ └─────────────── subclass + └─────────────────────── section +``` + +| Section | 涵蓋 | +|---------|------| +| `P` | People | +| `O` | Object | +| `B` | Brand | +| `C` | Concept | +| `A` | Animal | +| `S` | Scene | +| `E` | Environment | +| `M` | Music/Sound | + +### Table + +```sql +CREATE TABLE classes ( + code VARCHAR(20) PRIMARY KEY, -- P-001-010-010 + name TEXT NOT NULL, + parent_code VARCHAR(20) REFERENCES classes(code), + section CHAR(1), + level INTEGER DEFAULT 0, + description TEXT, + created_at TIMESTAMPTZ DEFAULT now() +); + +CREATE TABLE identity_classes ( + identity_id INTEGER REFERENCES identities(id), + class_code VARCHAR(20) REFERENCES classes(code), + confidence REAL DEFAULT 1.0, + source VARCHAR(20), + PRIMARY KEY (identity_id, class_code) +); +``` + +## 版本歷史 + +| 版本 | 日期 | 變更 | +|------|------|------| +| V1.0 | 2026-05-05 | File & Identity schema,V4.0 direct FK binding | +| V1.1 | 2026-05-05 | Class 分層分類(IPC/HS),Agent 識別架構 | diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/DUAL_EMBEDDING_PIPELINE_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/DUAL_EMBEDDING_PIPELINE_V1.0.0.md new file mode 100644 index 0000000..b2309f4 --- /dev/null +++ b/docs_v1.0/API_V1.0.0/INTERNAL/DUAL_EMBEDDING_PIPELINE_V1.0.0.md @@ -0,0 +1,1144 @@ +--- +document_type: "spec" +service: "MOMENTRY_CORE" +title: "Pipeline & Rule Architecture: Processor Lifecycle, Embedding, Search V2.0" +date: "2026-05-05" +version: "V2.0" +status: "active" +owner: "Warren" +created_by: "OpenCode" +tags: + - "momentry" + - "core" + - "chunk" + - "embedding" + - "qdrant" + - "bm25" + - "lifecycle" + - "versioning" + - "v1.0" 
+ai_query_hints: + - "Parent-Child 雙層 summarization 架構" + - "Story (template) vs LLM summarization 兩條獨立 pipeline" + - "Qdrant 3-collection 架構: rule1 / story / llm_summary" + - "Metadata 信度: speaker_confidence, face_confidence, object_confidence" + - "處理器版本追蹤與 stale detection" + - "Processor/Agent 生命週期管理與下游傳播" +related_documents: + - "../PROCESSORS/FACE_V1.0.0.md" + - "../PROCESSORS/FACE_EMBEDDING_FLOW_V1.0.0.md" + - "../VECTOR_SPEC_V1.0.0.md" + - "../CHUNK_DEFINITION_V1.0.0.md" + - "../PROCESSOR_SELECTION_V1.0.0.md" +--- + +# Pipeline & Rule Architecture: Processor Lifecycle, Embedding, Search V2.0 + +## 架構概述 + +兩個獨立 pipeline,共用同一底層(Qdrant + BM25),chunk_type 區隔: + +``` + ASR + ASRX + CUT + Face + YOLO + │ + ┌───────────────┴───────────────┐ + ▼ ▼ + Pipeline 1: Story Pipeline 2: LLM + (template, instant) (LLM, on-demand) + │ │ + ┌──────────┼──────────┐ ┌──────────┼──────────┐ + ▼ ▼ ▼ ▼ ▼ ▼ + Parent Child Embedding Parent Child Embedding + Summary Summary (nomic) Summary Summary (nomic) + │ │ │ │ │ │ + └──────────┴──────────┘ └──────────┴──────────┘ + │ │ + ▼ ▼ + ┌───────────────────────┐ ┌───────────────────────┐ + │ Qdrant (vector) │ │ Qdrant (vector) │ + │ PG tsvector (BM25) │ │ PG tsvector (BM25) │ + │ chunk_type: story_* │ │ chunk_type: llm_* │ + └───────────────────────┘ └───────────────────────┘ +``` + +## Pipeline 1: Story (Template-Based) + +### 輸入 +- `{uuid}.asr.json` — 1629 segments (text + timestamps) +- `{uuid}.asrx.json` — 10 speakers (speaker_id + segments) +- `{uuid}.cut.json` — 1331 scenes (start/end times) +- Identity mapping (SPEAKER_0→Cary Grant, SPEAKER_1→Audrey Hepburn, etc.) + +### 處理流程 + +``` +1. build_child_chunks() + ├── CUT scenes 分組 ASR segments + ├── ASRX 時間對齊 → speaker_id → identity name + └── 產出: scenes[children: [{start, end, text, speaker_name}]] + +2. generate_story_parent_summary(scene) + └── Template: "[{start}-{end}, {dur}] Cast: X. Total: N lines, W words. Speakers: ..." + +3. 
generate_story_child_summary(child, parent) + └── Template: "[{start}-{end}] {name}: \"{text}\"" + +4. embed_text(summary) → Ollama nomic-embed-text-v2-moe → 768D vector + +5. Store: + ├── Qdrant: upsert (point_id=chunk_id, vector=768D, payload={chunk_type, file_uuid, text}) + └── PostgreSQL: chunks table (text_content, tsvector, parent_chunk_id) +``` + +### Chunk Types + +| chunk_type | 說明 | 數量 (Charade) | +|------------|------|----------------| +| `story_parent` | Template parent scene summary | ~300 | +| `story_child` | Per-sentence summary | ~1600 | + +### Embedding Target + +``` +chunk_summary text → nomic-embed-text-v2-moe (768D) → Qdrant collection "momentry_dev" + → PostgreSQL chunks.embedding (VECTOR(768)) +``` + +## Pipeline 2: LLM (On-Demand) + +### 輸入 + +同 Pipeline 1 + LLM server (Gemma4/Qwen) + +### 處理流程 + +``` +1. build_child_chunks() (同 Pipeline 1) + +2. generate_llm_parent_summary(scene) + └── LLM Prompt: "Summarize this scene: {dialogue}" → paragraph (60-100 words) + +3. 
generate_llm_child_summary(child, parent_summary) + └── LLM Prompt: "{parent_summary} → Summarize this line: {text}" → one sentence +``` + +### Chunk Types + +| chunk_type | 說明 | +|------------|------| +| `llm_parent` | LLM parent scene summary | +| `llm_child` | LLM child sentence summary | + +## Qdrant Storage + +### Collection Architecture (3 Collections) + +| Collection | Source Data | Content | Embedding Target | Point Count (Charade) | Dev Name | +|------------|------------|---------|-----------------|----------------------|----------| +| **rule1** | ASR raw + ASRX speaker (1:1) | 原始對白句 + speaker_id + timestamp | Raw ASR text | 1,629 | `momentry_dev_rule1` | +| **story** | Story Pipeline 1 (template) | Parent scene + Child sentence with identity | Child summary: `[{t}s] {name}: "{text}"` | 1,629 child + 1,313 parent | `momentry_dev_story` | +| **llm_summary** | Story Pipeline 2 (LLM, future) | Parent LLM narrative + Child LLM summary | LLM-generated child summary | TBD | `momentry_dev_llm_summary` | + +### 各 Collection 資料結構對比 + +| | rule1 | story | llm_summary | +|---|---|---|---| +| **輸入** | ASR 原始段 | ASR + CUT + identity + YOLO + TKG | ASR + CUT + identity + YOLO + TKG + LLM | +| **Parent** | ❌ 無 | ✅ scene summary (template) | ✅ LLM narrative paragraph | +| **Child** | 原始句子 | `[時間] 角色: "對白"` | LLM-generated 1-sentence summary | +| **Speaker** | speaker_id | resolved identity name | resolved identity name | +| **搜尋特色** | 精確字詞匹配 | 角色+上下文語意 | 高層次語意理解 | +| **產生成本** | Zero | Zero (template) | High (LLM inference) | +| **狀態** | ⬜ pending | ✅ implemented | ⬜ future | + +## Metadata 信度 (Confidence Scores) + +所有已識別的內容都附帶信度,提供給 Story/LLM processor 做加權參考: + +### 信度來源 + +| 來源 | 欄位 | 值域 | 說明 | +|------|------|------|------| +| **Speaker identity** | `speaker_confidence` | 0-1 | MAR lip analysis: mouth open/close events matched to speaker | +| **Face identity** | `face_confidence` | 0-1 | Identity clustering composite score (similarity + speaker weight) | +| **YOLO 
object** | `object_confidence` | 0-1 | YOLO detection confidence per object class | +| **ASR text** | `asr_confidence` | 0-1 | Whisper transcription confidence (per segment) | +| **Scene boundary** | `scene_confidence` | — | CUT scene detection is deterministic (binary) | + +### Parent Chunk Metadata + +```json +{ + "start_time": 304, + "end_time": 318, + "characters": ["Cary Grant"], + "speaker_confidence": { + "Cary Grant": 0.85 // MAR lip: 4/4 events during SPEAKER_0 + }, + "face_identities": { + "Cary Grant": 0.64 // clustering composite score + }, + "yolo_objects": { + "car": 0.72, + "bottle": 0.55 + }, + "child_count": 1, + "total_words": 13 +} +``` + +### Child Chunk Metadata + +```json +{ + "start": 308.51, + "end": 317.35, + "text": "Sylvia I'm getting a divorce...", + "speaker_id": "SPEAKER_0", + "speaker_name": "Cary Grant", + "speaker_confidence": 0.85, + "face_confidence": 0.64, + "asr_confidence": 0.92, + "language": "en", + "yolo_objects": ["bottle", "car", "chair"], + "scene_id": 48 +} +``` + +### 信度使用方式 + +Story processor (template) 用信度做: +- 過濾低信度 objects(<0.5 排除,避免 "cell phone" 誤報) +- 標註高信度 speaker identity(>0.8 → 確定的角色名) + +LLM processor (future) 用信度做: +- Prompt context: "Cary Grant (confidence 0.85) says: ..." 
+- 低信度內容放後段或不放入 prompt + +### Collection 1: rule1 + +Source: ASR sentence + ASRX speaker (1:1, no parent grouping) + +```json +{ + "point_id": "rul1_{uuid}_{start_sec}_{end_sec}", + "vector": "", + "payload": { + "file_uuid": "1a04db97...", + "chunk_type": "rule1_sentence", + "text": "Hello and welcome to the old-time movie show...", + "speaker_id": "SPEAKER_4", + "speaker_name": "Walter Matthau", + "start_time": 1.7, + "end_time": 18.9, + "language": "en" + } +} +``` + +### Collection 2: story + +Source: Story parent-child chunks (Pipeline 1) + +```json +{ + "point_id": "story_{uuid}_{start_sec}_{end_sec}", + "vector": "", + "payload": { + "file_uuid": "1a04db97...", + "chunk_type": "story_child", + "text": "[309s-317s] Cary Grant: \"Sylvia I'm getting a divorce...\"", + "speaker": "Cary Grant", + "parent_chunk_id": "story_parent_xxx_304_318", + "parent_summary": "[304s-318s, 14s] Cast: Cary Grant. Total dialogue: 1 lines, 13 words.", + "start_time": 308.5, + "end_time": 317.4 + } +} +``` + +### Collection 3: llm_summary + +Source: LLM parent-child summaries (Pipeline 2, future) + +```json +{ + "point_id": "llm_{uuid}_{start_sec}_{end_sec}", + "vector": "", + "payload": { + "file_uuid": "1a04db97...", + "chunk_type": "llm_child", + "text": "Cary Grant discusses his divorce from Sylvia with Charles.", + "speaker": "Cary Grant", + "parent_chunk_id": "llm_parent_xxx_304_318", + "parent_llm_summary": "In this scene, Cary Grant reveals...", + "start_time": 308.5, + "end_time": 317.4 + } +} +``` + +### Collection Initialization (Rust) + +```rust +// In QdrantDb or config +pub fn rule1_collection() -> &'static str { "momentry_dev_rule1" } +pub fn story_collection() -> &'static str { "momentry_dev_story" } +pub fn llm_summary_collection() -> &'static str { "momentry_dev_llm_summary" } +``` + +### Search Strategy + +``` +Hybrid Search: "divorce Charles" + ├── Qdrant rule1: vector search raw sentences + ├── Qdrant story: vector search enriched child chunks + └── PG tsvector: BM25 across all three 
chunk_types + ↓ + Merge scores → ranked results +``` + +### 對應 chunk_type (PostgreSQL) + +| Collection | chunk_type | 說明 | +|------------|-----------|------| +| rule1 | `rule1_sentence` | 原始 1:1 ASR sentence | +| story | `story_child` | Story child chunk (with parent context) | +| story | `story_parent` | Story parent chunk | +| llm_summary | `llm_child` | LLM child summary (future) | +| llm_summary | `llm_parent` | LLM parent summary (future) | + +## BM25 (PostgreSQL Full-Text Search) + +### Strategy + +利用 PostgreSQL 內建 `tsvector` + `tsquery`: + +```sql +-- 已有 search_vector 欄位 (tsvector type) +-- 需 trigger 自動更新 +ALTER TABLE dev.chunks ADD COLUMN IF NOT EXISTS search_vector tsvector; + +-- Trigger: 自動從 text_content 產生 tsvector +CREATE OR REPLACE FUNCTION update_chunk_search_vector() RETURNS trigger AS $$ +BEGIN + NEW.search_vector := to_tsvector('english', COALESCE(NEW.text_content, '')); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- BM25 查詢 +SELECT *, ts_rank(search_vector, query) as bm25_score +FROM dev.chunks +WHERE file_uuid = $1 + AND chunk_type IN ('story_parent', 'story_child', 'llm_parent', 'llm_child') + AND search_vector @@ to_tsquery('english', $2) +ORDER BY bm25_score DESC +LIMIT 20; +``` + +## Search API + +### Hybrid Search (Vector + BM25) + +``` +GET /api/v1/search/hybrid?q=query&file_uuid=xxx + +1. Qdrant: vector search → top N candidates (cosine_score) +2. PostgreSQL: BM25 search → top N candidates (bm25_score) +3. Merge: weighted_rank = 0.7 * cosine_score + 0.3 * bm25_score +4. 
Return: ranked results with parent context +``` + +### Implementation Status + +| Component | Status | File | +|-----------|--------|------| +| Story processor (template) | ✅ | `scripts/parent_chunk_5w1h.py` | +| LLM processor | ⬜ | `scripts/parent_chunk_5w1h.py` (--mode llm) | +| ProcessorType::Story | ✅ | `src/core/db/postgres_db.rs` | +| Rust processor call | ✅ | `src/worker/processor.rs:ProcessorType::Story` | +| Qdrant collection: rule1 | ⬜ | `momentry_dev_rule1` | +| Qdrant collection: story | ⬜ | `momentry_dev_story` | +| Qdrant collection: llm_summary | ⬜ | `momentry_dev_llm_summary` | +| pgvector embedding | ✅ | `dev.chunks.embedding` (VECTOR) | +| BM25 trigger | ✅ | migration 031, `dev.chunks.search_vector` | +| Hybrid search API | ⬜ | `src/api/server.rs` | + +## Migration Plan + +### 031_add_chunk_search_trigger.sql + +```sql +-- Add search_vector if not exists +ALTER TABLE dev.chunks ADD COLUMN IF NOT EXISTS search_vector tsvector; + +-- Drop old trigger if exists +DROP TRIGGER IF EXISTS trg_chunk_search_vector ON dev.chunks; + +-- Create trigger function +CREATE OR REPLACE FUNCTION update_chunk_search_vector() RETURNS trigger AS $$ +BEGIN + NEW.search_vector := to_tsvector('english', COALESCE(NEW.text_content, '')); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- Create trigger +CREATE TRIGGER trg_chunk_search_vector + BEFORE INSERT OR UPDATE ON dev.chunks + FOR EACH ROW EXECUTE FUNCTION update_chunk_search_vector(); +``` + +## Processor Order (Pipeline DAG) + +``` +Cut ──→ Scene +Asr ──→ Asrx ──→ Story ──→ Embedding Pipeline + ↑ + requires: Asr, Asrx, Cut, Yolo, Face + ↓ + Qdrant + BM25 +``` + +Story processor depends on: Asr, Asrx, Cut, Yolo, Face +Position in `ProcessorType::all()`: **last** (after VisualChunk) + +## Test Plan + +1. `python parent_chunk_5w1h.py --file-uuid xxx --mode story` → produces story_story.json +2. Apply migration 031 → adds search_vector trigger +3. Implement `QdrantDb::upsert_chunk_embedding()` +4. 
Wire Story processor into Rust pipeline +5. Run `--mode story --embed` → verify Qdrant + BM25 +6. Run `--mode llm --embed` → verify LLM pipeline (when resources allow) + +## Metadata 版本與更新機制 + +### 問題 + +Metadata 信度(speaker_confidence, face_confidence, asr_confidence 等)依賴處理器的模型選型。當模型升級(如 Whisper small → medium,InsightFace → Vision+FaceNet),原有的信度值不再準確,需要重新計算。 + +### 設計:處理器版本追蹤 + +每個處理器記錄其模型版本,metadata 附帶 `source_version` 以便判斷是否需要更新。 + +```json +// chunk metadata 中的 source_version 欄位 +{ + "source_versions": { + "asr": "faster-whisper/small/v1", // ← 處理器 + 模型 + 版本 + "asrx": "speechbrain/ecapa-tdnn/v1", + "face": "apple-vision+coreml-facenet/v2", + "cut": "pyscenedetect/default", + "yolo": "yolov5-coreml/v2", + "speaker_binding": "mar-lip/v1", + "identity_clustering": "cosine-threshold/v1", + "story_processor": "template/v2.0" + }, + "generated_at": "2026-05-05T02:30:00Z" +} +``` + +### 處理器版本表 + +| Processor | 當前版本 | 上一版本 | 升級影響 | +|-----------|---------|---------|---------| +| ASR | `faster-whisper/small/v1` | — | 換 medium/large → asr_confidence 變更 | +| ASRX | `speechbrain/ecapa-tdnn/v1` | — | 換模型 → speaker_id 可能變更 | +| Face detection | `apple-vision/v2` | `insightface/buffalo-l/v1` | bbox, pose 變化 | +| Face embedding | `coreml-facenet/v2` | `insightface-arcface/v1` | embedding 空間完全變更 | +| YOLO | `yolov5-coreml/v2` | `yolov8/v1` | object_confidence 分布變更 | +| Speaker binding | `mar-lip/v1` | — | 換方案 → speaker_confidence 變更 | +| Identity clustering | `cosine-threshold/v1` | — | 換 DBSCAN → face_confidence 變更 | +| Story processor | `template/v2.0` | — | 換 LLM → summary 品質變化 | + +### 更新策略 + +#### Tier 1: 軟更新(metadata only) + +當處理器升級但輸出不變更時(如 YOLO v8 → v5),只需重新計算信度: + +``` +1. 更新 version table +2. 標記受影響的 chunk 為 stale +3. 重新計算 confidence,更新 metadata +4. 不重新生成 embedding(文本不變) +``` + +#### Tier 2: 硬更新(重新產生) + +當處理器輸出結構變更時(如 InsightFace → Vision),需完整重跑: + +``` +1. 標記所有 downstream 為 stale +2. 重跑 Face → Trace → Identity → Story → Embedding +3. 
更新所有 source_version +``` + +#### 觸發時機 + +| 事件 | 更新範圍 | +|------|---------| +| ASR 模型升級 | ASR chunks → Rule1 → Story → 重新 embed | +| Face 模型切換 | Face traces → Identity → Story → 重新 embed | +| Speaker binding 改進 | Speaker metadata → Story → 重新 embed | +| Story template 修改 | Story chunks → 重新 embed | + +### 實作 + +```rust +// processor_results table 加入 model_version 欄位 +pub struct ProcessorResult { + ... + pub model_version: Option<String>, // "faster-whisper/small/v1" +} + +// chunks metadata 記錄所有上游版本 +// 比對 current_versions vs source_versions → 決定是否需要更新 +pub fn needs_refresh(chunk_versions: &HashMap<String, String>, + current_versions: &HashMap<String, String>) -> bool { + chunk_versions.iter().any(|(proc, ver)| { + current_versions.get(proc) != Some(ver) + }) +} +``` + +### 處理器版本註冊 + +```sql +CREATE TABLE dev.processor_versions ( + processor VARCHAR(32) PRIMARY KEY, + model_version VARCHAR(128) NOT NULL, + updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP +); +``` + +--- + +## Pipeline & Rule 架構定義 + +### 名詞定義 + +| 名詞 | 定義 | 責任 | +|------|------|------| +| **Pipeline** | 排程管理的工作流。包含入庫與出庫。 | Processor 執行順序、依賴、產出、寫入 DB、指定資料來源路徑與欄位 | +| **Processor** | 單一處理單元 | 產出原始數據(JSON, DB rows, embeddings) | +| **Agent** | 智能處理單元 | 推理、匹配、總結(TMDb, Identity, Story) | +| **Rule** | 搜尋 API 的內涵定義 | 定義如何將 Pipeline 產出組織成可搜尋結構 | +| **Search API** | 查詢介面 | 查詢 Rule 產生的 chunk(vector search, BM25, hybrid) | + +### Pipeline → Rule 關係 + +``` +Pipeline (排程管理) Rule (搜尋內涵) +───────────────────── ──────────────── +CUT ──→ Scene +ASR ──→ ASRX ──┐ Rule 1: Sentence Chunks + │ ├── 輸入: ASR + ASRX +YOLO ──────────┤ ├── 產出: 1:1 sentence chunks + │ └── 搜尋: 原始對白 text search +OCR ───────────┤ + │ Rule 2: Visual Chunks +Face ──→ Trace ─┐ ├── 輸入: YOLO + OCR + │ ├── 產出: frame-level visual chunks + ├──→ Identity Agent └── 搜尋: 物件/文字位置查詢 +Pose ──────────┘ + Rule 3: Scene Chunks + ├── 輸入: CUT + ASR + Rule1 + ├── 產出: scene-level parent chunks + └── 搜尋: 場景摘要 + + Rule 4: Story Chunks (5W1H) + ├── 輸入: Rule3 + Identity + YOLO + ├── 產出: parent + child summaries 
+ └── 搜尋: 情節/角色語意搜尋 +``` + +### Pipeline 做什麼 + +1. 管理 Processor 生命週期(Select → Schedule → Execute → Produce → Complete) +2. 處理依賴 DAG(ASRX 需等 ASR 完成) +3. 排程資源(max_concurrent, slot allocation) +4. 處理錯誤(retry, skip non-essential) +5. **入庫** — 產出數據並寫入儲存層 +6. **出庫** — 指定數據來源路徑與欄位,供 Rule/API 查詢 + +### Pipeline 入庫/出庫 明細 + +#### 內部產出(from Path) + +| Processor | 入庫 Path | 入庫 Table.Column | 出庫 | +|-----------|----------|-------------------|------| +| CUT | `{uuid}.cut.json` | — | Rule3: scenes | +| ASR | `{uuid}.asr.json` | — | Rule1: segments | +| ASRX | `{uuid}.asrx.json` | — | Rule1: speaker_id | +| YOLO | `{uuid}.yolo.json` | — | Rule2: detections | +| OCR | `{uuid}.ocr.json` | — | Rule2: texts | +| Face Detect | `{uuid}_detect.json` | — | Face Embed: bbox | +| Face Embed | `{uuid}.face.json` | — | Face Trace: embedding | +| Face Trace | `{uuid}.face_traced.json` | `face_detections` | Identity: trace_id | +| Identity Agent | — | `face_detections.identity_id` | Story: names | +| Story Agent | `{uuid}.story_story.json` | `chunks` | Search API | +| Embedding Agent | — | `chunks.embedding` | Qdrant sync | +| *All Processors* | — | `processor_results` (job_id, processor, status, started_at, completed_at, error_message) | 生命週期追蹤 | +| *All Processors* | — | `monitor_jobs` (uuid, status, processors, completed_processors) | Pipeline 排程 | + +#### 外部取得(from URL) + +每個唯一的 (URL + Method + 參數組合) 獨立列管。參數或模型不同 → 視為不同資料源。 + +| ID | 來源 | URL | Method | 參數/模型 | Auth | 入庫 | 出庫 | +|----|------|-----|--------|----------|------|------|------| +| **U01** | TMDb | `api.themoviedb.org/3/search/movie` | GET | `query={name}`, `language=en-US` | `TMDB_API_KEY` | `identities.tmdb_id` | Identity Agent | +| **U02** | TMDb | `api.themoviedb.org/3/movie/{id}/credits` | GET | `movie_id` | `TMDB_API_KEY` | `identities.metadata` | Identity Agent | +| **U03** | TMDb | `image.tmdb.org/t/p/w185/{path}` | GET | `profile_path` | — | `identities.tmdb_profile` | Identity Agent | +| **U04** | TMDb | 
`tmdb_embed_extractor.py` (local) | Py | `model=coreml-facenet/v2` | — | `identities.face_embedding (512D)` | Identity Agent | +| **U05** | Ollama | `localhost:11434/api/embeddings` | POST | `model=nomic-embed-text-v2-moe`, `dim=768` | — | `chunks.embedding (768D)` | Qdrant search | +| **U06** | Ollama | `localhost:11434/api/embeddings` | POST | `model=nomic-embed-text:latest`, `dim=768` | — | `chunks.embedding (768D)` | Qdrant search | +| **U07** | Ollama | `localhost:11434/api/chat` | POST | `model=qwen3:8b` (future) | — | `chunks.text_content` | Story LLM | +| **U08** | Ollama | `localhost:11434/api/chat` | POST | `model=gemma4` (future) | — | `chunks.text_content` | Story LLM | +| **U09** | Qdrant | `localhost:6333/collections/{name}/points` | PUT | `collection=momentry_dev_rule1` | `QDRANT_API_KEY` | rule1 vectors | Search | +| **U10** | Qdrant | `localhost:6333/collections/{name}/points` | PUT | `collection=momentry_dev_story` | `QDRANT_API_KEY` | story vectors | Search | +| **U11** | Qdrant | `localhost:6333/collections/{name}/points` | PUT | `collection=momentry_dev_llm_summary` | `QDRANT_API_KEY` | llm vectors | Search | + +#### Ollama 模型變體影響 + +| Model | Dim | 用途 | 影響範圍 | +|-------|-----|------|---------| +| `nomic-embed-text-v2-moe` | 768 | 多語言 embedding | 所有 chunk embedding 需重算 | +| `nomic-embed-text:latest` | 768 | 同上,不同版本 | 同上 | +| `mxbai-embed-large` | 1024 | 英文為主 | 改 dim → Qdrant collection 重建 | +| `qwen3:8b` | — | LLM summarization | Story parent/child summary 文本變更 | +| `qwen3:14b` | — | 同上,品質較好 | 同上 | +| `gemma4:4b` | — | 同上,較輕量 | 同上 | + +#### 參數變更觸發規則 + +| 變更類型 | 觸發 | 範例 | +|---------|------|------| +| 換 model | 所有 downstream stale | `nomic-embed-text-v2-moe` → `mxbai-embed-large` → dim 變更 → Qdrant 重建 | +| 同 model 參數變更 | 只影響該層 | Qdrant collection rename | +| API endpoint 變更 | 重試策略 + 通知 | TMDb API v3 → v4 | + +### Rule 做什麼 + +1. 讀取 Pipeline 產出的原始數據 +2. 組織成父子 chunk 結構 +3. 生成 summary text +4. 呼叫 Embedding (Ollama nomic-embed) +5. 
存入 Qdrant + PostgreSQL (vector + BM25) +6. 提供 Search API 查詢 + +### 五階段運作模型 + +每個 Processor/Agent 遵循標準五階段: + +``` +1. 選擇 ──→ 2. 排程 ──→ 3. 執行 ──→ 4. 產出 ──→ 5. 完成 +(Select) (Schedule) (Execute) (Produce) (Complete) +``` + +| 階段 | 英文 | 責任 | 實例(Face V2.0) | +|------|------|------|-------------------| +| **1. 選擇** | Select | 決定模型/演算法/版本 | `face_detection=apple-vision/v2`, `face_embedding=coreml-facenet/v2` | +| **2. 排程** | Schedule | 依賴檢查、slot allocation、max_concurrent | Face depends on [] → slot available → queued | +| **3. 執行** | Execute | 執行 scripts / agent logic、監控資源、timeout | `python face_processor.py video out.json --sample 60` | +| **4. 產出** | Produce | 寫入輸出 JSON、DB table、觸發 post-processing | face.json → face_detections DB → store_traced_faces.py | +| **5. 完成** | Complete | 標記 DONE、記錄 source_versions、觸發 downstream | `processor_results.status='completed'`, 通知下游 StoryAgent | + +### 每階段記錄 + +```json +{ + "processor": "face", + "run_id": "run_20260505_001", + "select": { + "model_version": "apple-vision+coreml-facenet/v2", + "sample_interval": 60, + "config": {"detector": "Vision", "embedder": "FaceNet"} + }, + "schedule": { + "queued_at": "2026-05-05T00:00:00Z", + "started_at": "2026-05-05T00:00:05Z", + "wait_reason": null + }, + "execute": { + "process_pid": 16490, + "cpu_percent": 30, + "memory_mb": 50, + "duration_sec": 433, + "error": null + }, + "produce": { + "output_file": "face.json", + "output_size_bytes": 115343360, + "frames_processed": 4008, + "embeddings_generated": 6182, + "db_rows_inserted": 6182 + }, + "complete": { + "completed_at": "2026-05-05T00:07:18Z", + "status": "DONE", + "source_versions": { + "face_detection": "apple-vision/v2", + "face_embedding": "coreml-facenet/v2" + } + } +} +``` + +### 以 Face V2.0 為例 + +``` +1. SELECT: 選擇 apple-vision (detection) + coreml-facenet (embedding) + sample_interval=60, max_concurrent=1 + ↓ +2. SCHEDULE: Face 無依賴,slot available → 排入 processor pool + ↓ +3. 
EXECUTE: swift_face (Vision ANE detection, 26% CPU, ~3min) + face_processor.py (CoreML embedding, 0% CPU, ~4min) + total: 433s, CPU 30%, memory 50MB + ↓ +4. PRODUCE: face.json (110MB, 4008 frames, 6182 embeddings) + face_detections DB (6182 rows with trace_id) + face_traced.json → store_traced_faces.py + ↓ +5. COMPLETE: status=DONE, source_versions written + ↓ 觸發下游 + TMDbAgent → IdentityAgent → StoryAgent → EmbeddingAgent +``` + +### State Machine + +``` + ┌─────────┐ + │ PENDING │ ← queued, waiting for dependencies + └────┬────┘ + │ dependencies met, slot available + ┌────▼────┐ + ┌─────│ RUNNING │─────┐ + │ └────┬────┘ │ + │ │ │ + crash/timeout success error + │ │ │ + ┌────▼────┐ ┌───▼───┐ ┌───▼───┐ + │ STALE │ │DONE │ │FAILED │ + └────┬────┘ └───┬───┘ └───┬───┘ + │ │ │ + │ version change │ + │ │ │ + └──────────┼──────────┘ + │ + ┌────▼────┐ + │ Refresh │ → mark downstream stale → re-run + └─────────┘ +``` + +### Processor 實作類別 + +三種實作語言,不同執行方式: + +| 類別 | 執行方式 | 特點 | 範例 | +|------|---------|------|------| +| **Python** | `PythonExecutor.run(script, args)` | 彈性最大,模型多 | ASR, YOLO, Face embedding, Story, IdentityAgent | +| **Swift** | compiled binary, `subprocess.call(binary)` | ANE 原生,極低 CPU | Face detection, OCR, Pose | +| **Rust** | native tokio task | DB 操作,pipeline 排程 | Rule1/3 ingest, TMDbAgent | + +### 各 Processor 類別 + +| Processor | 類別 | 執行內容 | +|-----------|------|---------| +| CUT | **Python** | `cut_processor.py` (PySceneDetect) | +| ASR | **Python** | `asr_processor.py` (faster-whisper) | +| ASRX | **Python** | `asrx_processor_custom.py` (SpeechBrain) | +| OCR | **Swift** | `swift_ocr` → Python wrapper | +| YOLO | **Python** | `yolo_processor.py` (CoreML YOLOv5) | +| Face detection | **Swift** | `swift_face` (Vision ANE) | +| Face embedding | **Python** | `face_processor.py` (CoreML FaceNet) | +| Face trace | **Python** | `store_traced_faces.py` → `face_tracker.py` | +| Pose | **Swift** | `swift_pose` (Vision ANE) | +| Scene | **Python** | 
`scene_classification.py` | +| VisualChunk | **Rust** | `visual_chunk.rs` (native) | +| Story | **Python** | `parent_chunk_5w1h.py` (template/LLM) | +| Rule1 Ingest | **Rust** | `rule1_ingest.rs` (DB + embedding) | +| Rule3 Ingest | **Rust** | `rule3_ingest.rs` + LLM client | +| TMDbAgent | **Rust** | `tmdb/probe.rs` + `tmdb/face_agent.rs` | +| IdentityAgent | **Python** | `experiments/identity_clustering/runner_v2.py` | +| EmbeddingAgent | **Python** | Ollama API call → pgvector write | + +--- + +## Processor/Agent 登錄冊 + +### CUT — Scene Detection + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-04 / OpenCode | +| **類別** | Python | +| **簡要說明** | PySceneDetect 偵測視覺場景切換。輸出 scene boundaries。1331 scenes for Charade 113min。 | +| **依賴** | 無 | +| **選型測試** | `scripts/swift_processors/swift_cut_test.swift` (Swift AVFoundation — slower, not adopted) | +| **相關文件** | `docs_v1.0/.../PROCESSORS/CUT_V1.0.0.md` | +| **輸入** | `video_path` (MP4/MOV file) | +| **產出** | `{uuid}.cut.json`: `{scenes: [{scene_number, start_frame, end_frame, start_time, end_time}]}` | + +### ASR — Speech Recognition + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-04 / OpenCode | +| **類別** | Python | +| **簡要說明** | faster-whisper small 語音轉文字。1,629 segments, 113 min for Charade。支援 language detection。 | +| **依賴** | 無 | +| **選型測試** | `scripts/swift_processors/asr_swift.swift` (Apple Speech Framework — quality insufficient) | +| **相關文件** | `docs_v1.0/.../PROCESSORS/ASR_V1.0.0.md` | +| **輸入** | `video_path`, `{uuid}.cut.json`, `output_path` | +| **產出** | `{uuid}.asr.json`: `{language, segments: [{start, end, text, language}]}` | + +### ASRX — Speaker Diarization + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-04 / OpenCode | +| **類別** | Python | +| **簡要說明** | SpeechBrain ECAPA-TDNN 192D speaker embedding + clustering。10 speakers in Charade。 | +| **依賴** | ASR | +| **選型測試** | `scripts/swift_processors/asrx_swift.swift`, `speaker_test.swift` (Apple Speech Framework — no speaker 
embedding API) | +| **相關文件** | `docs_v1.0/.../PROCESSORS/ASRX_V1.0.0.md` | +| **輸入** | `video_path`, `{uuid}.asr.json`, `output_path` | +| **產出** | `{uuid}.asrx.json`: `{segments: [{start_time, end_time, speaker_id}], embeddings: [192D], speaker_stats}` | + +### OCR — Text Recognition + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 PaddleOCR → V2.0 Swift Vision / 2026-05 | +| **類別** | Swift + Python wrapper | +| **簡要說明** | VNRecognizeTextRequest, 30+ languages, ANE accelerated。 | +| **依賴** | 無 | +| **選型測試** | `scripts/swift_processors/swift_ocr.swift`, `vision_ocr_test.swift` | +| **相關文件** | `docs_v1.0/.../PROCESSORS/OCR_V1.0.0.md` | +| **輸入** | `video_path`, `output_path`, `--sample-interval` | +| **產出** | `{uuid}.ocr.json`: `{frames: [{frame, timestamp, texts: [{text, confidence, bbox}]}]}` | + +### YOLO — Object Detection + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 YOLOv8 → V2.0 CoreML YOLOv5 / 2026-05 | +| **類別** | Python | +| **簡要說明** | CoreML YOLOv5 80-class, ANE accelerated。328K frames for Charade。 | +| **依賴** | 無 | +| **選型測試** | `scripts/swift_processors/vision_object_test.swift` (Vision — no general object API) | +| **相關文件** | `docs_v1.0/.../PROCESSORS/YOLO_V1.0.0.md` | +| **輸入** | `video_path`, `output_path`, `--uuid` | +| **產出** | `{uuid}.yolo.json`: `{frames: {frame_num: {detections: [{class_name, confidence, x1,y1,x2,y2}]}}}` | + +### Face Detection — 人臉偵測 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 InsightFace → V2.0 Apple Vision / 2026-05 | +| **類別** | Swift | +| **簡要說明** | VNDetectFaceRectanglesRequest + VNDetectFaceLandmarksRequest。ANE, 0% CPU。Bbox + pose + lip landmarks。 | +| **依賴** | 無 | +| **選型測試** | `scripts/swift_processors/face_vision_test.swift`, `face_compare_test.swift` | +| **相關文件** | `docs_v1.0/.../PROCESSORS/FACE_V1.0.0.md` | +| **輸入** | `video_path`, `output_detect.json`, `--sample-interval` | +| **產出** | `{uuid}_detect.json`: `{frames: [{frame, timestamp, faces: [{bbox, confidence, pose, lips}]}]}` | + +### Face 
Embedding — 人臉向量 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 InsightFace ArcFace → V2.0 CoreML FaceNet / 2026-05 | +| **類別** | Python | +| **簡要說明** | CoreML FaceNet InceptionResnetV1 512D embedding, MIT license, ANE。 | +| **依賴** | Face Detection | +| **選型測試** | `scripts/swift_processors/face_vision_test.swift` (VNFaceprint — private API, unusable) | +| **相關文件** | `docs_v1.0/.../PROCESSORS/FACE_EMBEDDING_FLOW_V1.0.0.md` | +| **輸入** | `video_path`, `{uuid}_detect.json`, `output_path` | +| **產出** | `{uuid}.face.json`: `FaceResult {frame_count, fps, frames: [{frame, timestamp, faces: [{x,y,w,h, embedding(512D), pose_angle, lips}]}]}` | + +### Pose — 姿態估計 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 MediaPipe → V2.0 Apple Vision / 2026-05 | +| **類別** | Swift + Python wrapper | +| **簡要說明** | VNDetectHumanBodyPoseRequest, 19 joints, ANE。8,190 poses for Charade。 | +| **依賴** | 無 | +| **選型測試** | `scripts/swift_processors/swift_pose.swift`, `pose_benchmark.swift` | +| **相關文件** | `docs_v1.0/.../PROCESSORS/POSE_V1.0.0.md` | +| **輸入** | `video_path`, `output_path`, `--sample-interval` | +| **產出** | `{uuid}.pose.json`: `{frame_count, fps, frames: [{frame, timestamp, persons: [{keypoints, bbox}]}]}` | + +### Face Trace — 人臉追蹤 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-05 / OpenCode | +| **類別** | Python | +| **簡要說明** | IoU + embedding cosine cross-frame tracking → trace_id。存入 DB + pgvector。 | +| **依賴** | Face Detection, Face Embedding | +| **選型測試** | `scripts/utils/face_tracker.py` (553 frames → 2 traces) | +| **相關文件** | `docs_v1.0/.../FACE_TRACKER_GUIDE.md` | +| **輸入** | `{uuid}.face.json`, `--file-uuid` | +| **產出** | `{uuid}.face_traced.json`, DB: `face_detections` (trace_id + bbox + embedding) | + +### Speaker Binding Agent — 語者綁定 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-05 / OpenCode | +| **類別** | Python | +| **簡要說明** | MAR (Mouth Aspect Ratio) lip movement → correlate with speaker segments → identify who is speaking。 | +| **依賴** | 
ASRX, Face Detection | +| **選型測試** | Inline: solo-frame overlap + MAR open/close event counting | +| **相關文件** | `docs_v1.0/.../DUAL_EMBEDDING_PIPELINE_V1.0.0.md` (Metadata 信度) | +| **輸入** | `{uuid}.face.json` (lip landmarks), `{uuid}.asrx.json` (speaker segments) | +| **產出** | Speaker→identity mapping + confidence scores。Stored in `identities.metadata`。 | + +### TMDb Agent — 電影資料庫 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-04 / OpenCode | +| **類別** | Rust | +| **簡要說明** | TMDb API search → cast download → identity creation with face embeddings。15 cast members, 9 with embeddings for Charade。 | +| **依賴** | 無 (external API) | +| **選型測試** | N/A (API integration) | +| **相關文件** | `src/core/tmdb/probe.rs`, `src/core/tmdb/face_agent.rs` | +| **輸入** | `TMDB_API_KEY` env var, video filename | +| **產出** | `identities` table (name, tmdb_id, tmdb_profile, face_embedding) | + +### Identity Agent — 身份聚類 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V2.0 / 2026-05 / OpenCode | +| **類別** | Python | +| **簡要說明** | Multi-stage face trace clustering: TMDb bind → video reference enrichment → iterative cosine matching → 99.4% coverage。 | +| **依賴** | Face Trace, TMDb Agent, Speaker Binding | +| **選型測試** | `experiments/identity_clustering/configs/exp_001-008.json` | +| **相關文件** | `experiments/identity_clustering/README.md`, `docs_v1.0/.../FACE_TO_IDENTITY_FLOW.md` | +| **輸入** | `face_detections` DB, `tkg_edges` SPEAKS_AS data | +| **產出** | `labels.json` (trace→identity mapping), DB: `face_detections.identity_id` | + +### Story Agent — 故事摘要 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V2.0 / 2026-05 / OpenCode | +| **類別** | Python | +| **簡要說明** | Template parent-child chunk generation + embedding。300 parents, 1,175 children for Charade。支援 --mode llm (future)。 | +| **依賴** | ASR, ASRX, CUT, YOLO, Identity Agent | +| **選型測試** | `experiments/identity_clustering/runner_v2.py` (identity binding for speaker resolution) | +| **相關文件** | 
`docs_v1.0/.../DUAL_EMBEDDING_PIPELINE_V1.0.0.md`, `docs_v1.0/.../CHUNK_DEFINITION_V1.0.0.md` | +| **輸入** | `{uuid}.asr.json`, `{uuid}.asrx.json`, `{uuid}.cut.json`, `{uuid}.yolo.json` | +| **產出** | `{uuid}.story_story.json`, DB: `chunks` table (parent + child), pgvector (768D) | + +### Embedding Agent — 向量化 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-05 / OpenCode | +| **類別** | Python | +| **簡要說明** | Ollama nomic-embed-text-v2-moe → 768D vector → pgvector。1,175 chunks for Charade。 | +| **依賴** | Story Agent | +| **選型測試** | N/A (API integration) | +| **相關文件** | `docs_v1.0/.../VECTOR_SPEC_V1.0.0.md` | +| **輸入** | `chunks` table (text_content column) | +| **產出** | `chunks.embedding` (VECTOR 768D) | + +### Scene — 場景分類 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-04 / OpenCode | +| **類別** | Python | +| **簡要說明** | Places365 場景分類模型,為每個 CUT scene 分配場景標籤(室內/室外、街道/辦公室等)。 | +| **依賴** | CUT | +| **選型測試** | — | +| **相關文件** | `docs_v1.0/.../PROCESSORS/SCENE_V1.0.0.md` | +| **輸入** | `video_path`, `{uuid}.cut.json`, `output_path` | +| **產出** | `{uuid}.scene.json`: `{scenes: [{scene_id, labels, confidence}]}` | + +### VisualChunk — 視覺分塊 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-04 / OpenCode | +| **類別** | Rust | +| **簡要說明** | YOLO 物件偵測結果 → 視覺 chunk(相似物件組合)。目前 fixed-frame + similarity-based 兩種策略。 | +| **依賴** | YOLO | +| **選型測試** | `src/core/processor/visual_chunk.rs` (Jaccard similarity) | +| **相關文件** | — | +| **輸入** | `{uuid}.yolo.json`, DB | +| **產出** | `pre_chunks` table (chunk_type='visual') | + +### Rule1 Ingest — 句塊入庫 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-04 / OpenCode | +| **類別** | Rust | +| **簡要說明** | ASR sentence + ASRX speaker → sentence chunks → DB + Qdrant。未實作(Sentence chunks = 0 for Charade)。 | +| **依賴** | ASR, ASRX | +| **選型測試** | `src/core/chunk/rule1_ingest.rs` (time overlap matching) | +| **相關文件** | `docs_v1.0/.../CHUNK_DEFINITION_V1.0.0.md` | +| **輸入** | `{uuid}.asr.json`, `{uuid}.asrx.json`, DB | +| 
**產出** | `chunks` table (chunk_type='sentence', rule='rule_1'), Qdrant `momentry_dev_rule1` | + +### Rule3 Ingest — 場景塊入庫 + +| 項目 | 內容 | +|------|------| +| **出生登記** | V1.0 / 2026-04 / OpenCode | +| **類別** | Rust | +| **簡要說明** | CUT scene + Rule1 children → parent scene chunks + LLM 5W1H summary。 | +| **依賴** | CUT, ASR, Rule1 | +| **選型測試** | `src/core/chunk/rule3_ingest.rs` (LLM generate_5w1h_summary) | +| **相關文件** | `docs_v1.0/.../CHUNK_DEFINITION_V1.0.0.md` | +| **輸入** | `{uuid}.cut.json`, `chunks` table (rule1) | +| **產出** | `chunks` table (chunk_type='cut', rule='rule_3'), Qdrant | + +### Processor 生命週期 + +| State | 觸發 | 動作 | +|-------|------|------| +| **PENDING** | job created, dependencies not met | 等待 processor pool slot | +| **RUNNING** | dependencies met, slot acquired | Python script / agent running | +| **DONE** | script exit 0, output valid | → trigger downstream processors | +| **FAILED** | script exit ≠0, timeout, OOM | → retry or skip (non-essential) | +| **STALE** | upstream processor version changed | → needs refresh | + +### Agent 生命週期 + +Agents(TMDb Agent, Identity Agent, Story Agent)與 Processor 共用生命週期,但有額外特性: + +| Agent | 觸發條件 | 生命週期特點 | +|-------|---------|-------------| +| **TMDbAgent** | Face DONE | 外部 API call,網路 unstable → retry 次數限制 | +| **IdentityAgent** | Face DONE + ASRX DONE | 多階段(Stage 1→2→3),每階段獨立狀態 | +| **StoryAgent** | All processor DONE | 可選 LLM mode(資源允許時),template mode always available | +| **EmbeddingAgent** | Story DONE | 純 CPU task,批次處理,可 resume | + +### 依賴圖 (DAG) + +``` +CUT ──────→ Scene + │ +ASR ──→ ASRX ─┤ + │ +YOLO ─────────┤ + │ +OCR ──────────┤ + │ +Face ──→ Trace ──→ IdentityAgent ─┐ + │ │ + ├───────────────────┤ +Pose ─────────┤ │ + │ │ +VisualChunk ──┘ │ + │ │ + └─── StoryAgent ────┤ + │ │ + EmbeddingAgent │ + │ │ + Qdrant + BM25 │ + │ + └─ TMDbAgent +``` + +### 下游傳播規則 + +當上游版本變更時: + +``` +ASR version change (v1 → v2) + → ASRX: STALE (depends on ASR) + → Rule1 chunks: STALE + → StoryAgent: STALE (depends on ASR + 
Rule1) + → Embedding: STALE + +Face version change (InsightFace → Vision) + → Trace: STALE + → IdentityAgent: STALE (embeddings changed) + → TKG: STALE (trace_id changed) + → StoryAgent: STALE (identity names may change) + → Embedding: STALE + +Template change (Story template v1 → v2) + → Story chunks: STALE + → Embedding: STALE (text changed) +``` + +### Stale Detection Logic + +```python +def check_stale(file_uuid, current_versions): + """Check which processors/agents need refresh""" + chunks = db.query("SELECT source_versions, chunk_type FROM chunks WHERE file_uuid=?", file_uuid) + + stale_agents = set() + for chunk in chunks: + for proc, ver in current_versions.items(): + if chunk.source_versions.get(proc) != ver: + stale_agents.add(proc) + # Propagate downstream + stale_agents.update(get_downstream(proc)) + + return stale_agents +``` + +### Refresh Pipeline + +``` +1. check_stale() → identify affected processors/agents +2. Mark affected chunks as STALE +3. Re-run stale processors/agents in dependency order +4. Re-embed updated chunks +5. 
Update source_versions in chunk metadata +``` + +### Current Version Registry (Charade) + +| Processor | Model Version | Status | +|-----------|--------------|--------| +| CUT | `pyscenedetect/default` | ✅ | +| ASR | `faster-whisper/small/v1` | ✅ | +| ASRX | `speechbrain/ecapa-tdnn/v1` | ✅ | +| OCR | `apple-vision/v1` | ✅ | +| YOLO | `yolov5-coreml/v2` | ✅ | +| Face detection | `apple-vision/v2` | ✅ | +| Face embedding | `coreml-facenet/v2` | ✅ | +| Pose | `apple-vision/v1` | ✅ | +| Trace | `iou+embedding/v1` | ✅ | +| TMDbAgent | `tmdb-api/v1` | ✅ | +| IdentityAgent | `cosine-threshold/v1` | ✅ | +| StoryAgent | `template/v2.0` | ✅ | +| EmbeddingAgent | `nomic-embed-768d/v1` | ✅ | + +## Schema 隔離原則 + +`dev` 與 `public` 完全獨立,禁止交叉污染。 + +| 規則 | 說明 | +|------|------| +| 資料隔離 | dev 不讀取 public table,反之亦然 | +| 擴展獨立 | pgvector 兩邊各自安裝 | +| Migration | 標明 target schema(`dev.table` vs `public.table`) | +| Sequence | 各自獨立,不共用 | +| Index | 各自維護 | +| Qdrant | `momentry_dev_*` vs `momentry_*` | +| Ollama | embedding 共用(nomic-embed 不分 dev/prod) | + +## Version History + +| Version | Date | Purpose | Author | +|---------|------|---------|--------| +| V1.0 | 2026-05-05 | Initial design | OpenCode | +| V1.1 | 2026-05-05 | 3-collection Qdrant + metadata confidence + version tracking | OpenCode | diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/ASR_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/ASR_V1.0.0.md index 098f0f0..34d88e6 100644 --- a/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/ASR_V1.0.0.md +++ b/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/ASR_V1.0.0.md @@ -115,3 +115,129 @@ related_documents: | 記憶體 | 2048 MB(長片因分段處理,實際低於此值) | | GPU | 不使用(INT8 CPU 量化) | | 依賴 | 無 | + +--- + +## Swift ASR (Apple Speech Framework) 實驗記錄 + +### 選型結論 + +使用現有做法(faster-whisper small),Swift ASR 不取代 Whisper。 + +> **注意**:Apple Speech Framework 會隨著 macOS / Siri 版本更新而改善。每次主要 macOS 版本更新時(如 macOS 15→16),應重新執行 `scripts/compare_segmentation.py` 對比 Swift vs Whisper 的品質差異,以評估是否可切換。 + +### POC 狀態 + +Swift 
processor 位於 `scripts/swift_processors/`,已編譯。Apple Speech Framework 在記憶體(11MB vs 1.1GB)和速度(4.19s vs 17.46s)有優勢,但準確度不足。 + +### 效能對比(Charade 60s 片段) + +| 指標 | Swift (Speech Framework) | Python (faster-whisper small) | +|------|------------------------|-------------------------------| +| **RTF** | 0.07 (14x) | 0.29 (3.4x) | +| **記憶體** | 11MB | 1.1GB | +| **Segments** | 18(句子級) | 23(句子級) | +| **品質** | 漏字較多("Let's see"→"And see") | 準確 | +| **語音分離改善** | Demucs +35s,僅小幅改善 | 不需要 | + +### 已知問題 + +1. 語言自動偵測順序錯誤(先試 zh-TW),需指定 `--language en-US` +2. RunLoop timeout 已修復(改為 semaphore 等待 callback) +3. 逐字輸出已合併(94 → 18 segments) + +### 相關檔案 + +``` +scripts/swift_processors/ +├── Package.swift +├── asr_swift.swift +├── asrx_swift.swift +├── entitlements.plist +└── .build/debug/asr_swift +``` + +--- + +## Speaker Diarization (ASRX) 選型記錄 + +### 現有方案:Python ASRX (ECAPA-TDNN + Spectral Clustering) + +使用 SpeechBrain ECAPA-TDNN 提取 192-D speaker embedding,搭配 spectral clustering 進行語者分離。 + +| 指標 | 值 | +|------|-----| +| Embedding 維度 | 192-D | +| Charade 偵測 speaker 數 | 10(正確區分 narrator、主角、配角) | +| 總 ASRX pre_chunks | 5,848 | +| Qdrant collection | `{prefix}_voice` | +| 依賴 | 需 ASR 完成後執行(時間對齊) | +| 輸出 | segments 含 `speaker_id`, `start_time`, `end_time` | + +### Swift SFSpeechAnalyzer 評估 + +**目標**:使用 Apple 內建 Speech Framework(ANE 加速)取代 Python ASRX。 + +| API | macOS 14 可用性 | 說明 | +|-----|----------------|------| +| `SFSpeechRecognizer` | ✅ | 語音辨識 | +| `SFSpeechAnalyzer` | ✅ 存在 | 語音分析,但無暴露 speaker embedding | +| `SFSpeechRecognitionMetadata` | ✅ 存在 | 辨識中繼資料,但 speaker 資訊為空 | +| `SFSpeakerEmbedding` | ❌ | Speaker embedding API 不存在 | +| `SFSpeakerIdentification` | ❌ | Speaker 識別 API 不存在 | +| KVC 取 speaker metadata | ❌ | 透過 KVC 也無法取得 speaker 資訊 | + +**結論:目前不可行。** Apple 尚未在 macOS 14 上開放 Speaker Recognition API 給開發者使用。 + +### 選型結論 + +維持 Python ASRX (ECAPA-TDNN) 方案。待未來 macOS 版本開放 Speaker Recognition API 後重新評估。 + +--- + +## 版本歷史 + +| 版本 | 日期 | 目的 | 操作人 | 工具/模型 | 
+|------|------|------|--------|-----------| +| V1.0 | 2026-05-02 | 初始版本 | OpenCode | deepseek-chat | +| V1.1 | 2026-05-04 | 新增 Swift ASR 實驗記錄與 Speaker Diarization 選型記錄 | OpenCode | deepseek-chat | +| V1.2 | 2026-05-04 | 新增 Text Embedding ANE 加速可行性研究 | OpenCode | deepseek-chat | + +--- + +## Text Embedding ANE 加速研究 + +### 背景 + +ASR 產出的 sentence chunk 需要 embedding(用於 semantic search / RAG)。 +目前使用 Ollama `nomic-embed-text-v2-moe`(768-D, 多語言,MIT license,CPU/GPU)。 + +### 研究目標 + +評估是否可用 Apple ANE 方案取代 Ollama embedding,降低 CPU 負載。 + +### 選項評估 + +| 方案 | 模型 | Dimension | 多語言 | ANE | 狀態 | +|------|------|-----------|--------|-----|------| +| **Apple NLEmbedding (sentence)** | 系統內建 | 未知 | ✅ 宣稱支援 | ✅ 原生 ANE | ❌ macOS 26.4.1 無模型檔 | +| **Apple NLEmbedding (word)** | GloVe | 300D | ❌ 僅英文 | ✅ | ❌ dim 不足,無多語言 | +| **Apple NLContextualEmbedding** | Transformer | 未知 | 未知 | ✅ | ❌ API 不可用 | +| **CoreML custom (MiniLM)** | BERT-based | 384D | ✅ 50+ languages | ✅ | ❌ torch.jit.trace 失敗 | +| **Ollama nomic-embed-text** | nomic-ai | 768D | ✅ 多語言 | ❌ | ✅ 現行方案 | + +### 測試結論 (2026-05-04) + +1. **NLEmbedding default**: dim=0, 所有 vector 回傳 nil。macOS 26.4.1 未預裝 sentence embedding 模型。 +2. **NLEmbedding word (GloVe)**: dim=300, 僅英文。法文/中文 dim=0(不支援)。 +3. **NLContextualEmbedding**: API compile error,方法不存在於公開 header。 +4. **CoreML 自轉 MiniLM**: `torch.jit.trace` 對 BERT 架構拋出 `Placeholder storage not allocated on MPS` 及 `dictconstruct` op 未支援。 +5. 
**Ollama nomic-embed**: 效能 ~6M embeddings/sec,768D 多語言,已整合穩定。 + +### 建議 + +維持 Ollama `nomic-embed-text-v2-moe`。 +ANE text embedding 待以下條件成熟後重新評估: +- Apple 開放 NLEmbedding 多語言 sentence 模型下載 +- 或 coremltools 支援 BERT `dictconstruct` op +- 或 Apple 發布預訓練 CoreML 多語言 embedding 模型 diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/CUT_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/CUT_V1.0.0.md index c17e3af..5e1cc2e 100644 --- a/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/CUT_V1.0.0.md +++ b/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/CUT_V1.0.0.md @@ -132,4 +132,48 @@ CUT 在 **register 階段同步執行**(`register_single_file`),不做 wor | CPU | 0.5 | | 記憶體 | 512 MB | | GPU | 不使用 | + +--- + +## Swift AVFoundation 替代評估 + +### POC 目標 + +使用 AVFoundation 逐幀 histogram 分析取代 Python PySceneDetect(ContentDetector),目標利用 ANE 加速。 + +### 測試結果(Charade 60s clip, 3597 frames, 59.9fps) + +| 指標 | Python PySceneDetect | Swift AVFoundation (luminance histogram) | +|------|---------------------|------------------------------------------| +| **Scenes 偵測** | **3** ✅ 合理 | **63** ❌ 過度敏感 | +| **處理時間** | **7.93s** | 15.42s | +| **RTF** | **0.132** (7.6x) | 0.257 (3.9x) | +| **記憶體** | ~512MB | 極低(系統框架) | +| **演算法** | ContentDetector(adaptive threshold + frame normalization) | 單純 histogram diff(64 bins luminance) | + +### 問題分析 + +1. **準確度** — 63 vs 3 scenes。簡單的 luminance histogram diff 對 camera movement、lighting change 過度敏感。PySceneDetect 的 ContentDetector 使用 adaptive threshold + 幀正規化,穩定性高很多。 +2. 
**速度** — 15.42s vs 7.93s。AVAssetReader 必須 sequential decode 所有 frames,無法像 ffmpeg 那樣 efficient frame skipping。 + +### 選型結論 + +| 項目 | 方案 | +|------|------| +| **Scene Cut Detection** | Python PySceneDetect **維持現狀** | + +### 相關檔案 + +``` +scripts/swift_processors/swift_cut_test.swift +``` + +--- + +## 版本歷史 + +| 版本 | 日期 | 目的 | 操作人 | 工具/模型 | +|------|------|------|--------|-----------| +| V1.0 | 2026-05-03 | 初始版本 | OpenCode | deepseek-chat | +| V1.1 | 2026-05-04 | 新增 Swift AVFoundation 替代評估記錄 | OpenCode | deepseek-chat | | 依賴 | 無 | diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/FACE_EMBEDDING_FLOW_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/FACE_EMBEDDING_FLOW_V1.0.0.md index a4e1b25..dfe1374 100644 --- a/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/FACE_EMBEDDING_FLOW_V1.0.0.md +++ b/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/FACE_EMBEDDING_FLOW_V1.0.0.md @@ -1,9 +1,9 @@ --- document_type: "spec" service: "MOMENTRY_CORE" -title: "Face Embedding 產出流程 V1.0.0" -date: "2026-05-02" -version: "V1.0" +title: "Face Embedding 產出流程 V2.0.0" +date: "2026-05-04" +version: "V2.0" status: "active" owner: "Warren" created_by: "OpenCode" @@ -12,16 +12,17 @@ tags: - "core" - "face" - "embedding" + - "pgvector" - "qdrant" - - "v1.0.0" + - "v2.0.0" ai_query_hints: - - "Face Embedding 的完整處理流程(Frame → InsightFace → Qdrant)" + - "Face Embedding 的完整處理流程(Vision detection → CoreML FaceNet → pgvector + Qdrant)" + - "V2.0 使用 Apple Vision Framework 取代 InsightFace detection" + - "V2.0 使用 CoreML FaceNet (MIT) 產出 512-D embedding" - "Face processor 的輸出結構與 embedding 欄位說明" - - "Worker store_face_chunks 與 store_face_embeddings_to_qdrant 的步驟" - "Qdrant face collection 的 payload 結構與點位 ID 規則" - - "Face embedding 的 512-D ArcFace w600k_r50 向量規格" - "Face embedding 使用 Cosine 距離計算" - - "InsightFace buffalo_l 的資源預估與 GPU 加速資訊" + - "Face detection 使用 ANE(Apple Vision Framework),embedding 使用 ANE(CoreML FaceNet)" - "face_detections 表與 Qdrant 的資料同步方式" related_documents: - "../VECTOR_SPEC_V1.0.0.md" @@ -31,103 
+32,128 @@ related_documents: - "../MOMENTRY_CORE_API_V1.0.0.md" --- -# Face Embedding 產出流程 V1.0.0 +# Face Embedding 產出流程 V2.0.0 | 項目 | 內容 | |------|------| | 建立者 | OpenCode | -| 建立時間 | 2026-05-02 | -| 文件版本 | V1.0 | +| 建立時間 | 2026-05-04 | +| 文件版本 | V2.0 | -## 關鍵術語定義 +## V2.0 變更摘要 -| 術語 | 定義 | -|------|------| -| Face Embedding | 人臉向量嵌入,由 InsightFace ArcFace 產出 512-D 向量 | -| SCRFD-10G | InsightFace 的人臉檢測模型 | -| ArcFace w600k_r50 | InsightFace 的人臉辨識模型,產出 512-D embedding | -| point_id | Qdrant 中向量的唯一 ID,使用幀編號 (frame number) | -| Cosine distance | 餘弦距離,用於向量相似度計算 | -| payload | Qdrant 向量的附帶 metadata 欄位 | +| 項目 | V1.x | V2.0 | +|------|------|------| +| **Detection** | InsightFace SCRFD-10G (CPU, 450%) | **Apple Vision VNDetectFaceRectangles** (ANE, ~0%) | +| **Pose** | InsightFace 2D landmarks → angle | **Apple Vision VNDetectFaceLandmarks** (roll/yaw/pitch) | +| **Embedding** | CoreML FaceNet 512-D (ANE) | 同左,MIT license | +| **CPU usage** | 450%+ | **~0%** | +| **Script** | `face_processor.py` | **`face_processor_vision.py` + `swift_face`** | ## 處理流程 ``` -1. Video Frame (取樣) - │ - ▼ -2. Face Processor (face_processor.py) - ├── InsightFace buffalo_l - │ ├── SCRFD-10G 人臉檢測 - │ ├── ArcFace w600k_r50 512-D embedding - │ ├── 年齡/性別預測 - │ └── 2D106 landmarks - │ - ├── 輸出: job_{id}_face_{ts}.json → {file_uuid}.face.json - │ └── FaceResult { frame_count, fps, frames: [FaceFrame] } - │ - ▼ -3. Worker store_face_chunks() - ├── 解析 FaceResult - ├── 寫入 pre_chunks 表 (file_uuid, processor_type='face', data) - └── 寫入 face_detections 表 - │ - ▼ -4. Worker store_face_embeddings_to_qdrant() - ├── 對每個 face frame 的每個 face - │ └── 若有 embedding (512-D): - │ ├── point_id = frame number (u64) - │ ├── vector = 512-D float array - │ └── payload (見下方) - └── 寫入 Qdrant collection `momentry_dev_face` +1. 
swift_face (Vision/ANE) + ├── AVAssetReader 逐幀讀取 + ├── VNDetectFaceRectanglesRequest → bbox (x, y, w, h) + confidence + ├── VNDetectFaceLandmarksRequest → roll, yaw, pitch + └── 輸出: {uuid}_detect.json + +2. face_processor_vision.py + ├── 讀取 detect.json + ├── cv2 逐幀 crop face by bbox + ├── CoreML FaceNet → 512-D embedding (ANE) + ├── classify_pose(roll, yaw) → frontal/three_quarter/profile + └── 輸出: {uuid}.face.json (FaceResult format) + +3. Rust pipeline (job_worker.rs) + ├── 讀取 face.json → FaceResult struct + ├── store_face_chunks() → pre_chunks table + └── store_face_embeddings_to_qdrant() → Qdrant + +4. Post-Face (job_worker.rs) + ├── store_traced_faces.py + │ ├── face_tracker.py (IoU + embedding) → trace_id + │ └── INSERT face_detections (trace_id + bbox + embedding pgvector) + ├── sync_face_embeddings() → Qdrant face points + └── cluster_face_embeddings() / search_similar_faces() → pgvector query ``` -## Qdrant Payload 結構 +## 輸出結構 + +### face.json (FaceResult) ```json { - "file_uuid": "dd61fda85fee441fdd00ab5528213ff7", - "face_id": null, - "frame": 15, - "timestamp": 0.68, - "x": 328, - "y": 88, - "width": 63, - "height": 75, - "confidence": 0.83 + "frame_count": 6872, + "fps": 59.94, + "frames": [ + { + "frame": 30, + "timestamp": 0.5, + "faces": [ + { + "x": 917, "y": 125, "width": 181, "height": 250, + "confidence": 0.88, + "embedding": [0.01, -0.04, 0.12, ...], // 512-D + "pose_angle": {"angle": "frontal", "roll": 2.5, "yaw": -5.0, "pitch": 1.2}, + "landmarks": null, + "attributes": null + } + ] + } + ] } ``` +### face_detections (PostgreSQL + pgvector) + | 欄位 | 型別 | 說明 | |------|------|------| -| `file_uuid` | string | 來源影片識別碼 | -| `face_id` | string|null | 臉部追蹤 ID(尚未分配時為 null) | -| `frame` | integer | 幀編號 | -| `timestamp` | float | 時間戳(秒) | -| `x, y, width, height` | integer | 人臉邊界框 | -| `confidence` | float | 檢測信心度 (0~1) | +| `file_uuid` | VARCHAR | 來源影片 | +| `frame_number` | BIGINT | 幀編號 | +| `trace_id` | INTEGER | 跨幀追蹤 ID(face_tracker 分配) | +| `bbox` 
| JSONB | `{"x", "y", "width", "height"}` | +| `confidence` | DOUBLE | 檢測信心度 | +| `embedding` | VECTOR(512) | pgvector index (ivfflat, cosine) | +| `identity_id` | BIGINT | 綁定的 identity(可為 NULL) | + +### Qdrant Payload (momentry_dev/dev collection) + +```json +{ + "file_uuid": "1a04db97...", + "trace_id": 0, + "frame_number": 825, + "type": "face_embedding" +} +``` ## Vector 規格 | 屬性 | 值 | |------|-----| -| 模型 | InsightFace ArcFace w600k_r50 | +| 模型 | CoreML FaceNet (InceptionResnetV1, VGGFace2) | +| License | MIT | | 維度 | 512 | -| 距離計算 | Cosine | -| 歸一化 | 否 (raw output) | +| 距離 | Cosine | +| Index | pgvector ivfflat (lists=100) | +| Qdrant | Cosine distance, shared collection | ## 來源 Processor 資源預估 -| 資源 | 值 | -|------|-----| -| 模型 | InsightFace buffalo_l (~150MB) | -| CPU | 0.6 | -| 記憶體 | 1536 MB | -| GPU | 支援(CoreML 50-80 FPS, CUDA 80-120 FPS) | -| 處理速度 | 130.5x real-time (M4 Mac Mini) | +| 資源 | V1.x (InsightFace) | V2.0 (Vision + FaceNet) | +|------|--------------------|-------------------------| +| Detection 模型 | InsightFace SCRFD-10G (~150MB) | Apple Vision (系統內建) | +| Embedding 模型 | CoreML FaceNet (90MB) | 同左 | +| CPU | 450%+ | **~0%** | +| 記憶體 | ~1.5GB | **<50MB** | +| ANE | 僅 embedding | **detection + embedding** | +| Total time (2hr film, interval=30) | ~1.3hr | **~40min** | ## 版本歷史 | 版本 | 日期 | 目的 | 操作人 | 工具/模型 | |------|------|------|--------|-----------| -| V1.0 | 2026-05-02 | 初始版本 | OpenCode | deepseek-chat | +| V1.0 | 2026-05-02 | 初始版本 (InsightFace) | OpenCode | deepseek-chat | +| V2.0 | 2026-05-04 | Apple Vision detection + CoreML FaceNet embedding | OpenCode | deepseek-chat | diff --git a/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/FACE_V1.0.0.md b/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/FACE_V1.0.0.md index 8dc02e7..66e1581 100644 --- a/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/FACE_V1.0.0.md +++ b/docs_v1.0/API_V1.0.0/INTERNAL/PROCESSORS/FACE_V1.0.0.md @@ -102,3 +102,272 @@ related_documents: | 依賴 | 無 | --- + +## Apple Vision Framework 實驗記錄 + +### 
POC 目標 + +評估 Apple Vision Framework 是否可取代 InsightFace(buffalo_l)進行臉部處理,目標是利用 ANE 加速降低記憶體使用。 + +### 測試結果 + +測試環境:macOS 14, Apple Silicon M4, 使用 `VNDetectFaceRectanglesRequest` + `VNDetectFaceLandmarksRequest` + `VNDetectFaceCaptureQualityRequest`。 + +| 功能 | Vision Framework | InsightFace (buffalo_l) | +|------|----------------|------------------------| +| **Face Detection** | ✅ 通過(1 face, conf=0.88) | ✅ | +| **Face Landmarks** | ✅ 6+6 eye pts, 8 nose pts | ✅ 106 pts | +| **Capture Quality** | ✅ score=0.5327 | ❌ 無 | +| **Face Embedding (512-D)** | ❌ **不可用** | ✅ ArcFace 512-D | +| **照片 metadata(年齡/性別)** | ❌ 不可用 | ✅ | +| **ANE 加速** | ✅ 是 | ❌ CPU only | +| **處理時間** | ⚡ 0.31s | ~0.5-1s | +| **記憶體** | ✅ 低(系統框架) | ~1.5GB | + +### 關鍵發現 + +`VNFaceprint` class 存在但無法透過公開 API 或 KVC 取得 face embedding 資料。Vision Framework 提供了高品質的臉部偵測和特徵點定位,但**無法提取用於 face matching 的向量 embedding**。 + +### 選型結論 + +| 用途 | 方案 | +|------|------| +| **Face Detection** | Vision Framework **可取代** InsightFace(更輕量、更快) | +| **Face Landmarks** | Vision Framework **可取代** | +| **Face Embedding** | InsightFace **維持現狀**(Vision Framework 無法取代) | +| **Face Recognition** | InsightFace **維持現狀** | + +若未來 Apple 開放 `VNFaceprint` 的 embedding 資料,可重新評估全面切換。 + +### 相關檔案 + +``` +scripts/swift_processors/face_vision_test.swift +``` + +--- + +## MediaPipe Face 評估 + +### 測試狀態 + +MediaPipe 0.10.33 已安裝,提供 Face Detection (BlazeFace) + Face Landmarker (468 mesh)。 + +| 功能 | API | 狀態 | +|------|-----|------| +| Face Detection | `mediapipe.tasks.python.vision.face_detector` | ✅ 可用 | +| Face Mesh | `mediapipe.tasks.python.vision.face_landmarker` | ✅ 468 3D landmarks | +| Face Embedding | 無 | ❌ 不支援 | + +### 三方案比較 + +| 功能 | MediaPipe | Vision Framework | InsightFace | +|------|-----------|-----------------|-------------| +| **Face Detection** | ✅ BlazeFace (~2MB) | ✅ VNDetectFaceRectangles | ✅ RetinaFace | +| **Bounding Box** | ✅ | ✅ | ✅ | +| **Keypoints** | ✅ **6 點** (eyes+nose+mouth) | ❌ | ✅ 106 點 | +| **Face Mesh** | ✅ **468 點** (獨立模型) 
| ❌ | ❌ | +| **512-D Embedding** | ❌ | ❌ | ✅ **ArcFace** | +| **Age/Gender** | ❌ | ❌ | ✅ | +| **Capture Quality** | ❌ | ✅ score 0.06~0.25 | ❌ | +| **速度** | ⚡ 極快 (mobile optimized) | ⚡ ANE 加速 | 🐢 CPU bound | +| **模型大小** | ~2MB | 系統內建 | ~150MB | +| **跨平台** | ✅ Linux/Windows/macOS | ❌ Apple only | ✅ | + +### 選型結論 + +| 用途 | 建議方案 | +|------|---------| +| **Face Detection** | MediaPipe 或 Vision Framework(速度快、輕量) | +| **Face Mesh / 468 landmarks** | MediaPipe(唯一方案) | +| **Face Embedding (512-D)** | InsightFace **維持現狀** | +| **Age/Gender** | InsightFace **維持現狀** | + +MediaPipe 和 Vision Framework 在 detection 層級相當,兩者都遠快於 InsightFace。但最終 embedding extraction 仍需 InsightFace。 + +### 分段實施建議 + +若要以 Swift/Vision 加速 face pipeline: + +``` +Swift face_detector (ANE, fast) + └── 輸出 {file_uuid}.bbox.json (face_id, bbox, timestamp) + +Python embed_extractor (InsightFace, only on detected crops) + └── 讀取 .bbox.json → crop face region + → InsightFace 提取 512-D embedding + → 產出完整 {file_uuid}.face.json +``` + +--- + +## FaceNet-PyTorch CoreML Embedding 實驗 + +### 動機 + +InsightFace 的 buffalo_l pre-trained weights 使用 CC BY-NC-SA 4.0 license,商用有爭議。需要一個 MIT/Apache 2.0 licensed 的 face embedding 方案。 + +### 測試結果 + +使用 Facenet-PyTorch (`facenet-pytorch`, MIT license) 的 InceptionResnetV1 (pretrained on VGGFace2),匯出 ONNX 並轉換為 CoreML。 + +| 步驟 | 時間 | 產出 | +|------|------|------| +| 模型載入 | 10.5s | InceptionResnetV1, 512-D output | +| ONNX 匯出 | 1.2s | `/tmp/facenet512.onnx` (90MB) | +| CoreML 轉換 | 6s | `/tmp/facenet512.mlpackage` (90MB) | + +### 效能對比 + +| 指標 | PyTorch (CPU) | CoreML (CPU/GPU/ANE) | +|------|--------------|---------------------| +| **推論時間 (avg)** | 30.9ms | **4.8ms** ⚡ | +| **加速比** | 1x | **6.4x** | +| **Embedding 維度** | 512-D | 512-D | +| **Normalized** | ✅ norm=1.0 | ✅ norm=1.0 | +| **精度比對 (cosine)** | 1.0 | **0.999532** ✅ | + +### License 確認 + +| 元件 | License | 商用 | +|------|---------|------| +| Facenet-PyTorch 原始碼 | **MIT** | ✅ | +| VGGFace2 weights | 研究用,但可重新訓練 | ✅ (自有資料訓練後) | +| ONNX 
Runtime | MIT | ✅ | +| CoreML | macOS 內建 | ✅ | +| InsightFace buffalo_l (現行) | CC BY-NC-SA 4.0 | ❌ **有爭議** | + +### 結論 + +Facenet-PyTorch CoreML 模型可完全取代 InsightFace 的 embedding extraction,且 CoreML 推論快 6.4 倍。原始碼為 MIT license,無商用障礙;但如上表所示,VGGFace2 預訓練權重僅供研究使用,商用前需以自有資料重新訓練權重。 + +### 整合入 Face Processor + +`scripts/face_processor.py` 已整合 CoreML FaceNet 作為 embedding extractor: + +| 項目 | 實作 | +|------|------| +| **Detection** | InsightFace buffalo_l(維持不變) | +| **Embedding** | CoreML FaceNet(`models/facenet512.mlpackage`)✅ 已取代 | +| **Fallback** | CoreML 失敗時自動回退到 InsightFace embedding | +| **啟動載入** | script 初始化時一次載入 CoreML model(~2s) | +| **推論流程** | 對每個 detected face crop → resize 160x160 → normalize → CoreML infer → 512-D embedding | +| **Metadata** | 輸出記錄 `embedding_method: coreml_facenet` | + +Model 檔案路徑:`models/facenet512.mlpackage`(專案根目錄) + +### 相關檔案 + +``` +models/facenet512.mlpackage # CoreML model (90MB, MIT license) +/tmp/facenet512.onnx # ONNX format (90MB, for reference) +scripts/face_processor.py # Face processor with CoreML integration +``` + +--- + +## 版本歷史 + +| 版本 | 日期 | 目的 | 操作人 | 工具/模型 | +|------|------|------|--------|-----------| +| V1.0 | 2026-05-02 | 初始版本 | OpenCode | deepseek-chat | +| V1.1 | 2026-05-04 | 新增 Apple Vision Framework + MediaPipe + FaceNet CoreML 整合記錄 | OpenCode | deepseek-chat | +| V2.0 | 2026-05-04 | Apple Vision 取代 InsightFace detection;CoreML FaceNet 維持 embedding | OpenCode | deepseek-chat | + +--- + +## V2.0 Architecture: Vision Detection + CoreML FaceNet Embedding + +### 架構變更 + +V1.0 使用 InsightFace 同時做 detection + embedding;V1.1 起 embedding 改用 CoreML FaceNet,但 detection 仍由 InsightFace 負責(CPU bound, 450%+ CPU)。 +V2.0 將 detection 移至 Apple Vision Framework(ANE),embedding 維持 CoreML FaceNet(ANE),CPU 歸零。 + +``` +V1.x: + face_processor.py + ├── InsightFace buffalo_l (CPU, 450%) → detection + bbox + landmarks + └── CoreML FaceNet (ANE) → 512-D embedding + +V2.0: + face_processor_vision.py + ├── swift_face (Vision/ANE) → VNDetectFaceRectanglesRequest → bbox + │ → VNDetectFaceLandmarksRequest → pose (roll, yaw, pitch) + └── CoreML FaceNet (ANE) → 512-D 
embedding on cropped face +``` + +### 處理流程 + +``` +1. swift_face