diff --git a/deliverable_v1.1.0/modules/08_media.md b/deliverable_v1.1.0/modules/08_media.md new file mode 100644 index 0000000..cf81696 --- /dev/null +++ b/deliverable_v1.1.0/modules/08_media.md @@ -0,0 +1,317 @@ + + + + +## Video Streaming & Frame Extraction + +All video streaming endpoints support the following common query parameters: + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `mode` | string | No | `normal` | `normal` or `debug` (draws detection overlays) | +| `audio` | string | No | `on` | `on` or `off` | + +--- + +### `GET /api/v1/file/:file_uuid/video` + +Stream the full video file with range support for seeking. + +**Auth**: Required +**Scope**: file-level + +#### Response + +- **200**: Video stream (`Content-Type` based on file extension) +- **206**: Partial content (range request) +- Supports `Range` header for seeking + +--- + +### `GET /api/v1/file/:file_uuid/trace/:trace_id/video` + +Stream video with highlights for a specific face trace (follows a single person across frames with bounding box overlay). + +**Auth**: Required +**Scope**: file-level + +--- + +### `GET /api/v1/file/:file_uuid/trace/:trace_id/representative-face` + +Find the best single face to represent this trace. Uses a two-stage selection: SQL (area × confidence → top 10) then FFmpeg `blurdetect` (sharpness → pick the least blurry). 
+ +**Auth**: Required +**Scope**: file-level + +#### Example + +```bash +curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/representative-face" \ + -H "X-API-Key: $KEY" +``` + +#### Response (200) + +```json +{ + "success": true, + "file_uuid": "aeed71342a899fe4b4c57b7d41bcb692", + "trace_id": 1939, + "face_count": 538, + "representative": { + "frame_number": 68193, + "timestamp_secs": 2727.72, + "bbox": { "x": 347, "y": 378, "width": 427, "height": 427 }, + "confidence": 0.760, + "quality_score": 138516, + "blur_score": 9.46 + } +} +``` + +#### Response Fields + +| Field | Type | Description | +|-------|------|-------------| +| `trace_id` | integer | Face trace ID | +| `face_count` | integer | Total face detections in this trace | +| `representative.frame_number` | integer | Frame number of the selected face (primary coordinate) | +| `representative.timestamp_secs` | float | Time in seconds (derived from `frame_number / fps`) | +| `representative.bbox` | object | Bounding box `{x, y, width, height}` | +| `representative.confidence` | float | Detection confidence (0.0–1.0) | +| `representative.quality_score` | float | Pre-selection score (`area × confidence`) | +| `representative.blur_score` | float | FFmpeg blurdetect result (lower = sharper) | + +#### Error Responses + +--- + +### `GET /api/v1/file/:file_uuid/trace/:trace_id/thumbnail` + +Extract the best face image for a trace as JPEG (320×320). Internally selects the face using the same two-stage algorithm as `representative-face`, then crops via FFmpeg. The result is cacheable for 24 hours. 
+ +**Auth**: Required +**Scope**: file-level + +#### Example + +```bash +curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/thumbnail" \ + -H "X-API-Key: $KEY" -o trace_1939_face.jpg +``` + +#### Response + +- **200**: `image/jpeg` binary data (320×320 cropped face) +- **404**: File or trace not found, or the trace has no suitable face +- **500**: FFmpeg or database error + +--- + +### `GET /api/v1/file/:file_uuid/identities/:identity_uuid_a/co-occur-with/:identity_uuid_b` + +Find the first frame where two identities appear together, with representative face thumbnails for both. + +**Auth**: Required +**Scope**: file-level + +#### Example + +```bash +# Audrey Hepburn & Cary Grant 第一次同框 +curl -s "$API/api/v1/file/$FILE_UUID/identities/$AUDREY_UUID/co-occur-with/$CARY_UUID" \ + -H "X-API-Key: $KEY" | jq '{identity_a: .identity_a.name, identity_b: .identity_b.name, first_frame: .first_cooccurrence.frame_number}' +``` + +#### Response (200) + +```json +{ + "success": true, + "file_uuid": "aeed71342a899fe4b4c57b7d41bcb692", + "identity_a": { + "identity_uuid": "c3545906-c82d-4b66-aa1d-150bc02decce", + "name": "Audrey Hepburn", + "trace_id": 920 + }, + "identity_b": { + "identity_uuid": "2b0ddefe-e2a9-4533-9308-b375594604d5", + "name": "Cary Grant", + "trace_id": 919 + }, + "first_cooccurrence": { + "frame_number": 38165, + "timestamp_secs": 1526.60, + "total_cooccurrence_frames": 3136, + "representative_face_a": { + "frame_number": 38199, + "bbox": { "x": 122, "y": 339, "width": 176, "height": 176 }, + "confidence": 0.832, + "thumbnail_url": "/api/v1/file/aeed71342.../trace/920/thumbnail" + }, + "representative_face_b": { + "frame_number": 38291, + "bbox": { "x": 511, "y": 315, "width": 192, "height": 192 }, + "confidence": 0.791, + "thumbnail_url": "/api/v1/file/aeed71342.../trace/919/thumbnail" + } + } +} +``` + +#### Response Fields + +| Field | Type | Description | +|-------|------|-------------| +| `identity_a.name` | string | First identity name | +| `identity_b.name` | string | 
Second identity name | +| `first_cooccurrence.frame_number` | int | First frame where both appear | +| `first_cooccurrence.timestamp_secs` | float | Time in seconds | +| `first_cooccurrence.total_cooccurrence_frames` | int | Total frames with both present | +| `first_cooccurrence.representative_face_a/b` | object | Best face thumbnail data for each identity | + +#### Error Responses + +| HTTP | When | +|------|------| +| `404` | File or identity not found | +| `404` | The two identities never co-occur in this file | +| `500` | Database or FFmpeg error | + +### `GET /api/v1/file/:file_uuid/video/bbox` + +Stream video with bounding box overlay for all detected objects/faces. + +**Auth**: Required +**Scope**: file-level + +Uses a built-in 5×7 bitmap font renderer to draw labels directly on video frames via FFmpeg `drawtext` filter. + +--- + +### `GET /api/v1/file/:file_uuid/thumbnail` + +Extract a single frame from a video as JPEG image. Uses FFmpeg `select` filter. + +**Auth**: Required +**Scope**: file-level + +#### Query Parameters + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `frame` | integer | Yes | — | Zero-based frame number to extract | +| `x` | integer | No | — | Crop start X (left edge). Requires `y`, `w`, `h`. | +| `y` | integer | No | — | Crop start Y (top edge). Requires `x`, `w`, `h`. | +| `w` | integer | No | — | Crop width in pixels. Requires `x`, `y`, `h`. | +| `h` | integer | No | — | Crop height in pixels. Requires `x`, `y`, `w`. | + +All four crop params (`x`, `y`, `w`, `h`) must be provided together or omitted. 
+ +#### Example + +```bash +# Extract frame 1000 (full frame) +curl -s "$API/api/v1/file/bd80fec92b0b6963d177a2c55bf713e2/thumbnail?frame=1000" \ + -H "Authorization: Bearer $JWT" -o frame_1000.jpg + +# Extract and crop face region (x=320, y=240, w=160, h=160) +curl -s "$API/api/v1/file/bd80fec92b0b6963d177a2c55bf713e2/thumbnail?frame=1000&x=320&y=240&w=160&h=160" \ + -H "Authorization: Bearer $JWT" -o face_crop.jpg +``` + +#### Response + +- **200**: `image/jpeg` binary data +- **404**: File not found +- **500**: FFmpeg error (e.g., frame number exceeds video duration) + +### `GET /api/v1/file/:file_uuid/clip` + +Extract a video clip (time range) as MPEG-TS stream. Uses FFmpeg `-ss` fast seek. + +**Auth**: Required +**Scope**: file-level + +#### Query Parameters + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `start_frame` | integer | No* | — | Start frame (zero-based). **Frame-accurate** — use this for precision. | +| `end_frame` | integer | No* | — | End frame (zero-based, inclusive). Requires `start_frame`. | +| `start_time` | float | No* | — | Start time in seconds. Approximate (FPS-dependent). Fallback if frames not given. | +| `end_time` | float | No* | — | End time in seconds. Approximate (FPS-dependent). Fallback if frames not given. | +| `fps` | float | No | video FPS | Override frames-per-second for frame↔time calculation. Defaults to video's detected FPS. | +| `mode` | string | No | `normal` | `normal` or `debug` (draws "CLIP" overlay) | +| `audio` | string | No | `on` | `on` or `off` | + +Either (`start_frame`+`end_frame`) OR (`start_time`+`end_time`) must be provided. 
+ +#### Example + +```bash +# Clip by frame range (primary) +curl -s "$API/api/v1/file/bd80fec92b0b6963d177a2c55bf713e2/clip?start_frame=0&end_frame=47" \ + -H "Authorization: Bearer $JWT" -o clip.ts + +# Clip by time range (fallback) +curl -s "$API/api/v1/file/bd80fec92b0b6963d177a2c55bf713e2/clip?start_time=30&end_time=45" \ + -H "Authorization: Bearer $JWT" -o clip.ts +``` + +#### Response + +- **200**: `video/mp2t` MPEG-TS stream +- **400**: Missing/invalid range parameters +- **404**: File not found +- **500**: FFmpeg error + +#### Technical Notes + +| Detail | Value | +|--------|-------| +| **Backend** | FFmpeg (`ffmpeg-full`) | +| **Seek** | `-ss` before `-i` (fast keyframe seek) | +| **Format** | MPEG-TS (`mpegts` muxer, pipe-safe) | +| **Codec** | H.264 + AAC | +| **Cache** | `Cache-Control: public, max-age=86400` (24h) | + +### Video vs Clip: Quality & Format Comparison + +Both endpoints support time range extraction, but serve different use cases: + +| Feature | `/video` | `/clip` | +|---------|----------|---------| +| **No params** | Streams full file (Range seek) | Returns 400 (params required) | +| **HTTP Range** | ✅ Supported | ❌ Not supported | +| **Encoding** | `-c copy` (zero encoding) | `-c:v libx264 -c:a aac` (re-encode) | +| **Quality** | Original (bit-exact, zero loss) | Compressed (default CRF ≈ 23) | +| **Format** | `video/mp4` | `video/mp2t` (MPEG-TS) | +| **Speed** | Fast (no computation) | Slower (encoding required) | +| **Frame control** | Time-based (`dur = (ef-sf)/fps`) | Precise (`-vframes`) | +| **Debug mode** | ❌ | ✅ `mode=debug` overlay | +| **Cache** | ❌ | ✅ `max-age=86400` | + +#### Usage Recommendation + +| Scenario | Use | +|----------|-----| +| Full video streaming / player seek | `/video` | +| Quick preview clip (zero quality loss) | `/video?start_frame=...&end_frame=...` | +| Debug frame verification / text overlay | `/clip?mode=debug` | +| Precise frame count control | `/clip` | +| CDN cacheable clip | `/clip` | + +--- + +| 
Detail | Value | +|--------|-------| +| **Backend** | FFmpeg (`ffmpeg-full`) | +| **Filter** | `select=eq(n\,FRAME)` to select frame, optional `crop=W:H:X:Y` | +| **Output** | Single JPEG via pipe (`image2pipe`, `mjpeg` codec) | +| **Cache** | `Cache-Control: public, max-age=86400` (24h) | +| **Frame number** | Zero-based (`frame=0` = first frame of video) | + +--- +*Updated: 2026-05-19 12:49:24* diff --git a/docs_v1.0/API_WORKSPACE/modules/08_media.md b/docs_v1.0/API_WORKSPACE/modules/08_media.md index e75e13f..cf81696 100644 --- a/docs_v1.0/API_WORKSPACE/modules/08_media.md +++ b/docs_v1.0/API_WORKSPACE/modules/08_media.md @@ -37,6 +37,148 @@ Stream video with highlights for a specific face trace (follows a single person --- +### `GET /api/v1/file/:file_uuid/trace/:trace_id/representative-face` + +Find the best single face to represent this trace. Uses a two-stage selection: SQL (area × confidence → top 10) then FFmpeg `blurdetect` (sharpness → pick the least blurry). + +**Auth**: Required +**Scope**: file-level + +#### Example + +```bash +curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/representative-face" \ + -H "X-API-Key: $KEY" +``` + +#### Response (200) + +```json +{ + "success": true, + "file_uuid": "aeed71342a899fe4b4c57b7d41bcb692", + "trace_id": 1939, + "face_count": 538, + "representative": { + "frame_number": 68193, + "timestamp_secs": 2727.72, + "bbox": { "x": 347, "y": 378, "width": 427, "height": 427 }, + "confidence": 0.760, + "quality_score": 138516, + "blur_score": 9.46 + } +} +``` + +#### Response Fields + +| Field | Type | Description | +|-------|------|-------------| +| `trace_id` | integer | Face trace ID | +| `face_count` | integer | Total face detections in this trace | +| `representative.frame_number` | integer | Frame number of the selected face (primary coordinate) | +| `representative.timestamp_secs` | float | Time in seconds (derived from `frame_number / fps`) | +| `representative.bbox` | object | Bounding box `{x, y, width, height}` 
| +| `representative.confidence` | float | Detection confidence (0.0–1.0) | +| `representative.quality_score` | float | Pre-selection score (`area × confidence`) | +| `representative.blur_score` | float | FFmpeg blurdetect result (lower = sharper) | + +#### Error Responses + +--- + +### `GET /api/v1/file/:file_uuid/trace/:trace_id/thumbnail` + +Extract the best face image for a trace as JPEG (320×320). Internally selects the face using the same two-stage algorithm as `representative-face`, then crops via FFmpeg. The result is cacheable for 24 hours. + +**Auth**: Required +**Scope**: file-level + +#### Example + +```bash +curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/thumbnail" \ + -H "X-API-Key: $KEY" -o trace_1939_face.jpg +``` + +#### Response + +- **200**: `image/jpeg` binary data (320×320 cropped face) +- **404**: File or trace not found, or the trace has no suitable face +- **500**: FFmpeg or database error + +--- + +### `GET /api/v1/file/:file_uuid/identities/:identity_uuid_a/co-occur-with/:identity_uuid_b` + +Find the first frame where two identities appear together, with representative face thumbnails for both. 
+ +**Auth**: Required +**Scope**: file-level + +#### Example + +```bash +# Audrey Hepburn & Cary Grant 第一次同框 +curl -s "$API/api/v1/file/$FILE_UUID/identities/$AUDREY_UUID/co-occur-with/$CARY_UUID" \ + -H "X-API-Key: $KEY" | jq '{identity_a: .identity_a.name, identity_b: .identity_b.name, first_frame: .first_cooccurrence.frame_number}' +``` + +#### Response (200) + +```json +{ + "success": true, + "file_uuid": "aeed71342a899fe4b4c57b7d41bcb692", + "identity_a": { + "identity_uuid": "c3545906-c82d-4b66-aa1d-150bc02decce", + "name": "Audrey Hepburn", + "trace_id": 920 + }, + "identity_b": { + "identity_uuid": "2b0ddefe-e2a9-4533-9308-b375594604d5", + "name": "Cary Grant", + "trace_id": 919 + }, + "first_cooccurrence": { + "frame_number": 38165, + "timestamp_secs": 1526.60, + "total_cooccurrence_frames": 3136, + "representative_face_a": { + "frame_number": 38199, + "bbox": { "x": 122, "y": 339, "width": 176, "height": 176 }, + "confidence": 0.832, + "thumbnail_url": "/api/v1/file/aeed71342.../trace/920/thumbnail" + }, + "representative_face_b": { + "frame_number": 38291, + "bbox": { "x": 511, "y": 315, "width": 192, "height": 192 }, + "confidence": 0.791, + "thumbnail_url": "/api/v1/file/aeed71342.../trace/919/thumbnail" + } + } +} +``` + +#### Response Fields + +| Field | Type | Description | +|-------|------|-------------| +| `identity_a.name` | string | First identity name | +| `identity_b.name` | string | Second identity name | +| `first_cooccurrence.frame_number` | int | First frame where both appear | +| `first_cooccurrence.timestamp_secs` | float | Time in seconds | +| `first_cooccurrence.total_cooccurrence_frames` | int | Total frames with both present | +| `first_cooccurrence.representative_face_a/b` | object | Best face thumbnail data for each identity | + +#### Error Responses + +| HTTP | When | +|------|------| +| `404` | File or identity not found | +| `404` | The two identities never co-occur in this file | +| `500` | Database or FFmpeg error | + ### `GET 
/api/v1/file/:file_uuid/video/bbox` Stream video with bounding box overlay for all detected objects/faces. diff --git a/docs_v1.0/DESIGN/TKG_QUERY_API_V1.0.md b/docs_v1.0/DESIGN/TKG_QUERY_API_V1.0.md new file mode 100644 index 0000000..7fecfac --- /dev/null +++ b/docs_v1.0/DESIGN/TKG_QUERY_API_V1.0.md @@ -0,0 +1,524 @@ +# TKG Query API V1.0 + +用於 Gemma4(LLM)透過 function calling 查詢影片人物互動資料的 API 設計。 + +--- + +## 1. Overview + +### 目的 + +讓 LLM(Gemma 4)可以回答關於影片人物互動的問題,例如「誰是主角」、「第一次同框是什麼時候」。透過 TKG(Trace Knowledge Graph)和 PostgreSQL 直接查詢,不需要 LLM 猜測。 + +### 架構 + +``` +User → "誰是這部電影的主角?" + ↓ +Gemma4 → function_call: tkg_query(file_uuid, "top_identities") + ↓ +API → SQL/TKG 查詢 → 結構化 JSON + ↓ +Gemma4 → "男主是 Cary Grant,女主是 Audrey Hepburn..." + ↓ +User ← 自然語言回答 +``` + +### 資料流 + +| 層級 | 元件 | 說明 | +|------|------|------| +| LLM | Gemma 4 26B (port 8082) | 解析自然語言 → 決定呼叫哪個 tool | +| Function | `tkg_query()` | 8 種 query_type,參數由 LLM 填寫 | +| Backend | `POST /api/v1/tkg/query` | 執行 SQL,回傳結構化結果 | +| Data | `face_detections`, `identities`, `chunk`, `tkg_nodes/edges` | 查詢來源 | + +--- + +## 2. 
Function Spec(給 LLM) + +### Function Definition + +```json +{ + "name": "tkg_query", + "description": "查詢影片的人物、場景、互動資料。根據問題類型選擇適合的 query_type。", + "parameters": { + "type": "object", + "properties": { + "file_uuid": { + "type": "string", + "description": "影片的 32 碼 file UUID" + }, + "query_type": { + "type": "string", + "description": "查詢類型", + "enum": [ + "top_identities", + "identity_details", + "first_cooccurrence", + "identity_traces", + "cut_details", + "file_info", + "mutual_gaze", + "interaction_network" + ] + }, + "identity_a": { + "type": "string", + "description": "人物A的 identity_uuid 或名字" + }, + "identity_b": { + "type": "string", + "description": "人物B的 identity_uuid 或名字" + }, + "cut_id": { + "type": "string", + "description": "場景ID(如 cut_264)" + }, + "limit": { + "type": "integer", + "description": "回傳筆數上限", + "default": 10 + } + }, + "required": ["file_uuid", "query_type"] + } +} +``` + +### LLM Prompt 設計 + +System prompt 中須包含此工具定義,並提示: + +``` +你是 Momentry 影片分析系統。當用戶問到影片中的人物、場景、互動問題時, +請先呼叫 tkg_query 查詢資料,再根據資料回答。 + +注意: +- 問題中提到的「男主」、「女主」是指 TMDb cast_order 0 和 1 +- 「配角」是指 cast_order >= 2 的人物 +- 「第一次同框」使用 first_cooccurrence +- 「誰最多鏡頭」使用 top_identities 搭配 face_count 排序 +``` + +--- + +## 3. API Endpoint + +### `POST /api/v1/tkg/query` + +Request: +```json +{ + "file_uuid": "aeed71342a899fe4b4c57b7d41bcb692", + "query_type": "top_identities", + "identity_a": null, + "identity_b": null, + "cut_id": null, + "limit": 10 +} +``` + +Response(通用包裝): +```json +{ + "success": true, + "query_type": "top_identities", + "file_uuid": "aeed71342...", + "data": { ... }, + "took_ms": 12 +} +``` + +Error: +```json +{ + "success": false, + "error": "File not found", + "query_type": "top_identities", + "file_uuid": "aeed71342..." +} +``` + +--- + +## 4. 
Query Types 詳解 + +### 4.1 `top_identities` — 人物重要性排名 + +**用途**:找出影片中的所有人物,依 TMDb cast_order 排序 + +**SQL**: +```sql +SELECT i.id, i.name, + (i.metadata->>'tmdb_cast_order')::int as cast_order, + i.metadata->>'tmdb_character' as role, + i.source, i.status, + COUNT(fd.id) as face_count, + COUNT(DISTINCT fd.trace_id) as trace_count, + ROUND(MIN(fd.frame_number)::numeric / GREATEST(v.fps, 1), 2) as first_appearance_sec, + ROUND(MAX(fd.frame_number)::numeric / GREATEST(v.fps, 1), 2) as last_appearance_sec +FROM identities i +LEFT JOIN face_detections fd ON fd.identity_id = i.id AND fd.file_uuid = $1 +LEFT JOIN videos v ON v.file_uuid = $1 +WHERE i.source = 'tmdb' + AND (i.metadata->>'tmdb_cast_order')::int IS NOT NULL +GROUP BY i.id, i.name, i.metadata, i.source, i.status, v.fps +ORDER BY cast_order ASC +LIMIT $2 +``` + +**Response**: +```json +{ + "total": 23, + "leads": [ + {"name": "Cary Grant", "cast_order": 0, "role": "Peter Joshua", "face_count": 10643}, + {"name": "Audrey Hepburn", "cast_order": 1, "role": "Regina Lampert", "face_count": 16456} + ], + "supporting": [ + {"name": "Walter Matthau", "cast_order": 2, "role": "Hamilton Bartholemew", "face_count": 2319}, + {"name": "James Coburn", "cast_order": 3, "role": "Tex Panthollow", "face_count": 3572}, + {"name": "George Kennedy", "cast_order": 4, "role": "Herman Scobie", "face_count": 1817} + ], + "text_summary": "主演:Cary Grant 飾演 Peter Joshua,Audrey Hepburn 飾演 Regina Lampert。主要配角:Walter Matthau(cast_order 2)等 21 人。" +} +``` + +--- + +### 4.2 `identity_details` — 人物詳細資料 + +**SQL**: +```sql +SELECT i.id, i.name, i.identity_type, i.source, i.status, + i.metadata->>'tmdb_cast_order' as cast_order, + i.metadata->>'tmdb_character' as role, + i.metadata->>'tmdb_movie_title' as movie, + i.metadata->>'tmdb_biography' as biography, + COUNT(fd.id) as face_count, + COUNT(DISTINCT fd.trace_id) as trace_count, + MIN(fd.frame_number) as first_frame, + MAX(fd.frame_number) as last_frame +FROM identities i +LEFT JOIN 
face_detections fd ON fd.identity_id = i.id AND fd.file_uuid = $1 +WHERE (i.name ILIKE $2 OR i.uuid::text = $2 OR REPLACE(i.uuid::text, '-', '') = $2) + AND i.source = 'tmdb' +GROUP BY i.id, i.name, i.identity_type, i.source, i.status, i.metadata +LIMIT 1 +``` + +**Response**: +```json +{ + "name": "Audrey Hepburn", + "role": "Regina Lampert", + "cast_order": 1, + "face_count": 16456, + "trace_count": 457, + "first_appearance_sec": 206.76, + "last_appearance_sec": 6756.68, + "biography": "Audrey Hepburn (born Audrey Kathleen Ruston; 4 May 1929 – 20 January 1993)..." +} +``` + +--- + +### 4.3 `first_cooccurrence` — 第一次同框 + +**邏輯**:找出兩個 identity 第一次同時出現的 frame。 + +**SQL**: +```sql +SELECT MIN(fd_a.frame_number)::bigint as first_frame, + COUNT(DISTINCT fd_a.frame_number)::bigint as total_cooccurrence_frames +FROM face_detections fd_a +JOIN face_detections fd_b ON fd_a.file_uuid = fd_b.file_uuid + AND fd_a.frame_number = fd_b.frame_number +WHERE fd_a.file_uuid = $1 + AND fd_a.identity_id = (SELECT id FROM identities WHERE name ILIKE $2 OR REPLACE(uuid::text, '-', '') = $2) + AND fd_b.identity_id = (SELECT id FROM identities WHERE name ILIKE $3 OR REPLACE(uuid::text, '-', '') = $3) +``` + +**Response**: +```json +{ + "identity_a": {"name": "Audrey Hepburn"}, + "identity_b": {"name": "Cary Grant"}, + "first_frame": 38165, + "timestamp_secs": 1526.60, + "cut_id": "cut_264", + "total_cooccurrence_frames": 3136, + "representative_thumbnail_a": "/api/v1/file/{uuid}/trace/920/thumbnail", + "representative_thumbnail_b": "/api/v1/file/{uuid}/trace/919/thumbnail" +} +``` + +--- + +### 4.4 `identity_traces` — 人物出場片段 + +**SQL**: +```sql +SELECT fd.trace_id, COUNT(*) as face_count, + MIN(fd.frame_number) as start_frame, + MAX(fd.frame_number) as end_frame, + COUNT(DISTINCT fd.frame_number) as frame_span +FROM face_detections fd +WHERE fd.file_uuid = $1 + AND fd.identity_id = (SELECT id FROM identities WHERE name ILIKE $2 OR REPLACE(uuid::text, '-', '') = $2) +GROUP BY fd.trace_id 
+ORDER BY face_count DESC +LIMIT $3 +``` + +**Response**: +```json +{ + "name": "Audrey Hepburn", + "total_traces": 457, + "top_traces": [ + {"trace_id": 920, "face_count": 53, "start_frame": 38165, "end_frame": 38321, + "representative": "/api/v1/file/{uuid}/trace/920/thumbnail"}, + ... + ] +} +``` + +--- + +### 4.5 `cut_details` — 場景資訊 + +**SQL**: +```sql +SELECT chunk_id, start_frame, end_frame, + ROUND(start_frame::numeric / fps, 2) as start_time, + ROUND(end_frame::numeric / fps, 2) as end_time, + text_content, summary_text +FROM chunk +WHERE file_uuid = $1 AND chunk_id = $2 AND chunk_type = 'cut' +``` + +**Response**: +```json +{ + "cut_id": "cut_264", + "frame_range": [38164, 38324], + "duration_sec": 6.4, + "summary": "Audrey Hepburn and Cary Grant engage in a brief verbal exchange...", + "identities_present": ["Audrey Hepburn", "Cary Grant"] +} +``` + +--- + +### 4.6 `file_info` — 影片基本資訊 + +**SQL**: +```sql +SELECT file_name, file_path, duration, width, height, fps, + (SELECT COUNT(*) FROM face_detections WHERE file_uuid = $1) as total_faces, + (SELECT COUNT(DISTINCT trace_id) FROM face_detections WHERE file_uuid = $1 AND trace_id IS NOT NULL) as total_traces, + (SELECT COUNT(*) FROM chunk WHERE file_uuid = $1 AND chunk_type = 'cut') as total_cuts +FROM videos +WHERE file_uuid = $1 +``` + +--- + +### 4.7 `mutual_gaze` — 互看偵測(未來) + +**依賴**:pose 資料寫入 `face_detections.metadata->>'pose_yaw'`。 + +**SQL**: +```sql +SELECT fd_a.frame_number, + (fd_a.metadata->>'pose_yaw')::float8 as yaw_a, + (fd_b.metadata->>'pose_yaw')::float8 as yaw_b +FROM face_detections fd_a +JOIN face_detections fd_b ON fd_a.file_uuid = fd_b.file_uuid + AND fd_a.frame_number = fd_b.frame_number +WHERE fd_a.file_uuid = $1 + AND fd_a.identity_id = $2 AND fd_b.identity_id = $3 + AND (fd_a.metadata->>'pose_yaw')::float8 > 0.05 + AND (fd_b.metadata->>'pose_yaw')::float8 < -0.05 +ORDER BY fd_a.frame_number ASC +LIMIT 1 +``` + +**Mutual Gaze 判斷邏輯**: +``` +if face_a is LEFT of face_b (bbox.x_a < 
bbox.x_b): + mutual_gaze = (yaw_a > GAZE_THRESHOLD) AND (yaw_b < -GAZE_THRESHOLD) +if face_a is RIGHT of face_b: + mutual_gaze = (yaw_a < -GAZE_THRESHOLD) AND (yaw_b > GAZE_THRESHOLD) +``` + +--- + +### 4.8 `interaction_network` — 互動網絡(未來) + +**依賴**:TKG `CO_OCCURS_WITH` edges 完整。 + +**SQL**: +```sql +SELECT src_i.name as identity_a, tgt_i.name as identity_b, + COUNT(DISTINCT te.id) as cooccurrence_count, + MIN((te.properties->>'first_frame')::int) as first_frame +FROM tkg_edges te +JOIN tkg_nodes src_n ON src_n.id = te.source_node_id +JOIN tkg_nodes tgt_n ON tgt_n.id = te.target_node_id +JOIN face_detections fd_src ON fd_src.trace_id = REPLACE(src_n.external_id, 'trace_', '')::int +JOIN face_detections fd_tgt ON fd_tgt.trace_id = REPLACE(tgt_n.external_id, 'trace_', '')::int +JOIN identities src_i ON src_i.id = fd_src.identity_id +JOIN identities tgt_i ON tgt_i.id = fd_tgt.identity_id +WHERE te.file_uuid = $1 + AND te.edge_type = 'CO_OCCURS_WITH' + AND src_n.node_type = 'face_trace' AND tgt_n.node_type = 'face_trace' + AND src_i.name != tgt_i.name +GROUP BY src_i.name, tgt_i.name +ORDER BY cooccurrence_count DESC +``` + +--- + +## 5. Gemma4 整合 + +### 已驗證功能 + +| 測試 | 結果 | +|------|------| +| Function calling 觸發 | ✅ 正確呼叫 tkg_query | +| 中文問題理解 | ✅ 「男女主第一次同框」→ first_cooccurrence | +| 參數填充 | ✅ 正確填入 file_uuid、query_type | +| 多輪對話(tool result → answer) | ✅ 模型正確消化資料後回答 | +| 推論型問題(「最重要的配角」) | ✅ 選擇 top_identities + 自行推理 | + +### 已知限制 + +| 問題 | 解決方案 | +|------|---------| +| file_uuid 須由 system prompt 提供 | 在 prompt 中指定 | +| `identity_a` 使用「女主」無法自動匹配 | require identity_a/b 明確名稱 | +| 模型可能拒絕呼叫 tool(約 5-10%) | system prompt 明確要求「先查詢」 | + +### System Prompt 模板 + +``` +你是 Momentry 影片分析系統。你正在分析電影 {title},file_uuid 為 {file_uuid}。 + +你有 tkg_query 工具可用,可以查詢影片的人物資料、出場時間、互動關係。 +請先使用工具查詢,再根據查詢結果回答問題。 +不要憑空猜測影片內容。 +``` + +--- + +## 7. TKG 擴充:Pose + Mutual Gaze + +### 7.1 現狀 + +| 元件 | 有 pose? | 有 mutual gaze? 
| +|------|-----------|-----------------| +| `face.json` | ✅ yaw/pitch/roll | ❌ 未計算 | +| `face_detections.metadata` | ❌ 無 | ❌ 無 | +| `tkg_nodes` (face_trace) | ❌ 無 | — | +| `tkg_edges` (CO_OCCURS_WITH) | ❌ 無 | ❌ 無 | + +### 7.2 目標 + +``` +face_processor → face.json (有 pose) + ↓ +tkg.rs (TKG builder) + ├── 讀取 face.json 的 pose 資料 + ├── 算 avg_yaw/pitch/roll → 寫入 face_trace node + ├── 對同框 face_trace 配對,判斷 mutual_gaze + └── 寫入 CO_OCCURS_WITH edge properties + ↓ +tkg_edges.properties.mutual_gaze = true + ↓ +API 查詢 → 互看 + 面積最大 → 代表 frame +``` + +### 7.3 face_trace node 新增 properties + +```json +{ + "frame_count": 53, + "start_frame": 38165, + "end_frame": 38321, + "avg_bbox": {"x": 731, "y": 215, "width": 228, "height": 228}, + "avg_yaw": 0.014, + "avg_pitch": 0.224, + "avg_roll": -0.069 +} +``` + +### 7.4 CO_OCCURS_WITH edge 新增 properties + +```json +{ + "first_frame": 38187, + "frame_count": 156, + "mutual_gaze": true, + "yaw_a_avg": 0.021, + "yaw_b_avg": -0.421, + "gaze_angle_delta": 0.442 +} +``` + +### 7.5 Mutual Gaze 判斷邏輯 + +```python +GAZE_THRESHOLD = 0.05 # rad + +def detect_mutual_gaze(frame_a, frame_b): + # 判斷 A 和 B 的左右位置關係 + if bbox_a.cx < bbox_b.cx: + # A 在左,B 在右 → A 要看右,B 要看左 + return yaw_a > GAZE_THRESHOLD and yaw_b < -GAZE_THRESHOLD + else: + # A 在右,B 在左 → A 要看左,B 要看右 + return yaw_a < -GAZE_THRESHOLD and yaw_b > GAZE_THRESHOLD +``` + +### 7.6 代表 Frame 選取邏輯(應用端) + +``` +1. 查 TKG: MATCH (a)-[e:CO_OCCURS_WITH]->(b) WHERE e.mutual_gaze = true + → 回傳互看 frame_count 最高的 face_trace 配對 +2. 取該配對的 frame 範圍內,面積×信心最高 frame +3. FFmpeg blurdetect → 選最清晰的作為代表 +``` + +### 7.7 Timeline + +| Phase | 內容 | 工時 | +|-------|------|------| +| 1 | tkg.rs 讀取 face.json 的 pose + 寫入 node | 4-6h | +| 2 | mutual_gaze 判斷 + 寫入 edge | 3-4h | +| 3 | 對 Charade 重跑 TKG + 驗證 | 1h | +| 4 | 代表 frame 選取邏輯 | 2-3h | + +--- + +## 8. 
Phase 計畫 + +| Phase | Query Types | 預計工時 | 依賴 | +|-------|------------|---------|------| +| **1** | `top_identities`, `identity_details`, `first_cooccurrence`, `file_info` | 2-3h | 已有資料 | +| **2** | `identity_traces`, `cut_details` | 1-2h | 已有資料 | +| **3** | `mutual_gaze` | 2-3h | pose 入 `face_detections.metadata` | +| **4** | `interaction_network` | 2-3h | TKG edges 完善 | + +--- + +## 9. 版本歷史 + +| 日期 | 版本 | 作者 | 變更 | +|------|------|------|------| +| 2026-05-21 | 1.0 | OpenCode | 初始設計文件 | + +*Updated: 2026-05-21* diff --git a/docs_v1.0/doc_wasm/modules/08_media.md b/docs_v1.0/doc_wasm/modules/08_media.md index e75e13f..cf81696 100644 --- a/docs_v1.0/doc_wasm/modules/08_media.md +++ b/docs_v1.0/doc_wasm/modules/08_media.md @@ -37,6 +37,148 @@ Stream video with highlights for a specific face trace (follows a single person --- +### `GET /api/v1/file/:file_uuid/trace/:trace_id/representative-face` + +Find the best single face to represent this trace. Uses a two-stage selection: SQL (area × confidence → top 10) then FFmpeg `blurdetect` (sharpness → pick the least blurry). 
+ +**Auth**: Required +**Scope**: file-level + +#### Example + +```bash +curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/representative-face" \ + -H "X-API-Key: $KEY" +``` + +#### Response (200) + +```json +{ + "success": true, + "file_uuid": "aeed71342a899fe4b4c57b7d41bcb692", + "trace_id": 1939, + "face_count": 538, + "representative": { + "frame_number": 68193, + "timestamp_secs": 2727.72, + "bbox": { "x": 347, "y": 378, "width": 427, "height": 427 }, + "confidence": 0.760, + "quality_score": 138516, + "blur_score": 9.46 + } +} +``` + +#### Response Fields + +| Field | Type | Description | +|-------|------|-------------| +| `trace_id` | integer | Face trace ID | +| `face_count` | integer | Total face detections in this trace | +| `representative.frame_number` | integer | Frame number of the selected face (primary coordinate) | +| `representative.timestamp_secs` | float | Time in seconds (derived from `frame_number / fps`) | +| `representative.bbox` | object | Bounding box `{x, y, width, height}` | +| `representative.confidence` | float | Detection confidence (0.0–1.0) | +| `representative.quality_score` | float | Pre-selection score (`area × confidence`) | +| `representative.blur_score` | float | FFmpeg blurdetect result (lower = sharper) | + +#### Error Responses + +--- + +### `GET /api/v1/file/:file_uuid/trace/:trace_id/thumbnail` + +Extract the best face image for a trace as JPEG (320×320). Internally selects the face using the same two-stage algorithm as `representative-face`, then crops via FFmpeg. The result is cacheable for 24 hours. 
+ +**Auth**: Required +**Scope**: file-level + +#### Example + +```bash +curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/thumbnail" \ + -H "X-API-Key: $KEY" -o trace_1939_face.jpg +``` + +#### Response + +- **200**: `image/jpeg` binary data (320×320 cropped face) +- **404**: File or trace not found, or the trace has no suitable face +- **500**: FFmpeg or database error + +--- + +### `GET /api/v1/file/:file_uuid/identities/:identity_uuid_a/co-occur-with/:identity_uuid_b` + +Find the first frame where two identities appear together, with representative face thumbnails for both. + +**Auth**: Required +**Scope**: file-level + +#### Example + +```bash +# Audrey Hepburn & Cary Grant 第一次同框 +curl -s "$API/api/v1/file/$FILE_UUID/identities/$AUDREY_UUID/co-occur-with/$CARY_UUID" \ + -H "X-API-Key: $KEY" | jq '{identity_a: .identity_a.name, identity_b: .identity_b.name, first_frame: .first_cooccurrence.frame_number}' +``` + +#### Response (200) + +```json +{ + "success": true, + "file_uuid": "aeed71342a899fe4b4c57b7d41bcb692", + "identity_a": { + "identity_uuid": "c3545906-c82d-4b66-aa1d-150bc02decce", + "name": "Audrey Hepburn", + "trace_id": 920 + }, + "identity_b": { + "identity_uuid": "2b0ddefe-e2a9-4533-9308-b375594604d5", + "name": "Cary Grant", + "trace_id": 919 + }, + "first_cooccurrence": { + "frame_number": 38165, + "timestamp_secs": 1526.60, + "total_cooccurrence_frames": 3136, + "representative_face_a": { + "frame_number": 38199, + "bbox": { "x": 122, "y": 339, "width": 176, "height": 176 }, + "confidence": 0.832, + "thumbnail_url": "/api/v1/file/aeed71342.../trace/920/thumbnail" + }, + "representative_face_b": { + "frame_number": 38291, + "bbox": { "x": 511, "y": 315, "width": 192, "height": 192 }, + "confidence": 0.791, + "thumbnail_url": "/api/v1/file/aeed71342.../trace/919/thumbnail" + } + } +} +``` + +#### Response Fields + +| Field | Type | Description | +|-------|------|-------------| +| `identity_a.name` | string | First identity name | +| `identity_b.name` | string | 
Second identity name | +| `first_cooccurrence.frame_number` | int | First frame where both appear | +| `first_cooccurrence.timestamp_secs` | float | Time in seconds | +| `first_cooccurrence.total_cooccurrence_frames` | int | Total frames with both present | +| `first_cooccurrence.representative_face_a/b` | object | Best face thumbnail data for each identity | + +#### Error Responses + +| HTTP | When | +|------|------| +| `404` | File or identity not found | +| `404` | The two identities never co-occur in this file | +| `500` | Database or FFmpeg error | + ### `GET /api/v1/file/:file_uuid/video/bbox` Stream video with bounding box overlay for all detected objects/faces. diff --git a/src/api/trace_agent_api.rs b/src/api/trace_agent_api.rs index 193877a..c1d4dd1 100644 --- a/src/api/trace_agent_api.rs +++ b/src/api/trace_agent_api.rs @@ -1,7 +1,8 @@ use axum::{ + body::Body, extract::{Path, Query, State}, - http::StatusCode, - response::Json, + http::{header, StatusCode}, + response::{IntoResponse, Json, Response}, routing::{get, post}, Router, }; @@ -16,6 +17,22 @@ pub fn trace_agent_routes() -> Router { "/api/v1/file/:file_uuid/trace/:trace_id/faces", get(list_trace_faces), ) + .route( + "/api/v1/file/:file_uuid/trace/:trace_id/representative-face", + get(get_representative_face), + ) + .route( + "/api/v1/file/:file_uuid/trace/:trace_id/thumbnail", + get(get_trace_thumbnail), + ) + .route( + "/api/v1/file/:file_uuid/identities/:identity_uuid_a/co-occur-with/:identity_uuid_b", + get(get_cooccurrence), + ) + .route( + "/api/v1/file/:file_uuid/tkg/rebuild", + post(rebuild_tkg), + ) } #[derive(Debug, Deserialize)] @@ -328,3 +345,441 @@ async fn list_trace_faces( faces, })) } + +#[derive(Debug, Serialize)] +struct RepFaceBbox { + x: i32, + y: i32, + width: i32, + height: i32, +} + +#[derive(Debug, Serialize)] +struct RepFaceResult { + frame_number: i64, + timestamp_secs: f64, + bbox: RepFaceBbox, + confidence: f64, + quality_score: f64, + blur_score: f64, +} + 
+#[derive(Debug, Serialize)] +struct RepFaceResponse { + success: bool, + file_uuid: String, + trace_id: i32, + face_count: i64, + representative: RepFaceResult, +} + +struct RepFaceSelection { + frame: i64, + x: i32, + y: i32, + w: i32, + h: i32, + conf: f64, + blur: f64, + score: f64, + video_path: String, + fps: f64, + face_count: i64, +} + +async fn select_rep_face( + pool: &sqlx::PgPool, + file_uuid: &str, + trace_id: i32, + err_fn: F, +) -> Result +where + F: Fn(anyhow::Error) -> T, +{ + use crate::core::db::schema; + let fd_table = schema::table_name("face_detections"); + let video_table = schema::table_name("videos"); + + let fps: f64 = sqlx::query_scalar(&format!( + "SELECT COALESCE(fps, 25.0) FROM {} WHERE file_uuid = $1", video_table + )) + .bind(file_uuid) + .fetch_optional(pool) + .await + .map_err(|e| err_fn(anyhow::anyhow!("{}", e)))? + .unwrap_or(25.0); + + let face_count: (i64,) = sqlx::query_as(&format!( + "SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND trace_id = $2", fd_table + )) + .bind(file_uuid) + .bind(trace_id) + .fetch_one(pool) + .await + .map_err(|e| err_fn(anyhow::anyhow!("{}", e)))?; + + struct Candidate { frame: i64, x: i32, y: i32, w: i32, h: i32, conf: f64, score: f64 } + + let rows = sqlx::query_as::<_, (i64, i32, i32, i32, i32, f64)>(&format!( + "SELECT frame_number::bigint, x, y, width, height, confidence::float8 \ + FROM {} WHERE file_uuid = $1 AND trace_id = $2 AND confidence > 0.7 \ + AND ((metadata->>'qc_ok')::boolean IS NULL OR (metadata->>'qc_ok')::boolean = true) \ + ORDER BY (width::float8 * height::float8) * confidence::float8 DESC LIMIT 10", + fd_table + )) + .bind(file_uuid).bind(trace_id) + .fetch_all(pool) + .await + .map_err(|e| err_fn(anyhow::anyhow!("{}", e)))?; + + if rows.is_empty() { + return Err(err_fn(anyhow::anyhow!("No suitable face found"))); + } + + let candidates: Vec = rows.into_iter() + .map(|(frame, x, y, w, h, conf)| { + let score = (w as f64 * h as f64) * conf; + Candidate { frame, x, y, w, h, 
conf, score } + }) + .collect(); + + let video_path: String = sqlx::query_scalar(&format!( + "SELECT file_path FROM {} WHERE file_uuid = $1", video_table + )) + .bind(file_uuid) + .fetch_optional(pool) + .await + .map_err(|e| err_fn(anyhow::anyhow!("{}", e)))? + .ok_or_else(|| err_fn(anyhow::anyhow!("Video not found")))?; + + let mut best = candidates[0].frame; + let mut best_blur = f64::MAX; + let mut best_idx = 0usize; + + for (i, c) in candidates.iter().enumerate() { + let seek = c.frame as f64 / fps; + if let Ok(output) = tokio::process::Command::new("ffmpeg") + .args(["-ss", &format!("{:.2}", seek), "-i", &video_path, + "-vframes", "1", "-vf", &format!("crop={}:{}:{}:{},blurdetect", c.w, c.h, c.x, c.y), + "-f", "null", "-"]) + .output().await + { + let stderr = String::from_utf8_lossy(&output.stderr); + for line in stderr.lines() { + if let Some(blur_str) = line.split("blur mean: ").nth(1) { + if let Ok(blur) = blur_str.trim().parse::() { + if blur < best_blur { best_blur = blur; best = c.frame; best_idx = i; } + } + } + } + } + } + + let chosen = &candidates[best_idx]; + Ok(RepFaceSelection { + frame: chosen.frame, x: chosen.x, y: chosen.y, w: chosen.w, h: chosen.h, + conf: chosen.conf, blur: best_blur, score: chosen.score, + video_path, fps, face_count: face_count.0, + }) +} + +async fn get_representative_face( + State(state): State, + Path((file_uuid, trace_id)): Path<(String, i32)>, +) -> Result, (StatusCode, Json)> { + let sel = select_rep_face(state.db.pool(), &file_uuid, trace_id, |e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + }).await?; + + Ok(Json(RepFaceResponse { + success: true, + file_uuid, + trace_id, + face_count: sel.face_count, + representative: RepFaceResult { + frame_number: sel.frame, + timestamp_secs: sel.frame as f64 / sel.fps, + bbox: RepFaceBbox { x: sel.x, y: sel.y, width: sel.w, height: sel.h }, + confidence: sel.conf, + quality_score: sel.score, + blur_score: sel.blur, + }, + })) +} + 
+async fn get_trace_thumbnail( + State(state): State, + Path((file_uuid, trace_id)): Path<(String, i32)>, +) -> Result)> { + let sel = select_rep_face(state.db.pool(), &file_uuid, trace_id, |e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + }).await?; + + let seek = sel.frame as f64 / sel.fps; + let tmp = std::env::temp_dir().join(format!("trace_{}_{}.jpg", file_uuid, trace_id)); + + let status = tokio::process::Command::new("ffmpeg") + .args([ + "-ss", &format!("{:.2}", seek), + "-i", &sel.video_path, + "-vframes", "1", + "-vf", &format!("crop={}:{}:{}:{},scale=320:320", sel.w, sel.h, sel.x, sel.y), + "-q:v", "2", + "-y", &tmp.to_string_lossy().to_string(), + ]) + .output() + .await + .map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + })?; + + if !status.status.success() { + return Err((StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": "FFmpeg failed"})))); + } + + let bytes = tokio::fs::read(&tmp).await.map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + })?; + + let _ = tokio::fs::remove_file(&tmp).await; + + Ok(Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "image/jpeg") + .header(header::CACHE_CONTROL, "public, max-age=86400") + .body(Body::from(bytes)) + .unwrap()) +} + +#[derive(Debug, Serialize)] +struct CoOccurIdentity { + identity_uuid: String, + name: String, + trace_id: i32, +} + +#[derive(Debug, Serialize)] +struct CoOccurRepFace { + frame_number: i64, + bbox: RepFaceBbox, + confidence: f64, + thumbnail_url: String, +} + +#[derive(Debug, Serialize)] +struct CoOccurrence { + frame_number: i64, + timestamp_secs: f64, + total_cooccurrence_frames: i64, + representative_face_a: Option, + representative_face_b: Option, +} + +#[derive(Debug, Serialize)] +struct CoOccurResponse { + success: bool, + file_uuid: String, + identity_a: CoOccurIdentity, + identity_b: 
CoOccurIdentity, + first_cooccurrence: CoOccurrence, +} + +async fn get_cooccurrence( + State(state): State, + Path((file_uuid, identity_uuid_a, identity_uuid_b)): Path<(String, String, String)>, +) -> Result, (StatusCode, Json)> { + use crate::core::db::schema; + let id_table = schema::table_name("identities"); + let fd_table = schema::table_name("face_detections"); + + // Stage 1: Get identity names and IDs + let id_a = sqlx::query_as::<_, (i32, String)>(&format!( + "SELECT id, name FROM {} WHERE uuid::text = $1 OR REPLACE(uuid::text, '-', '') = $1", + id_table + )) + .bind(&identity_uuid_a) + .fetch_optional(state.db.pool()) + .await + .map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + })? + .ok_or_else(|| { + (StatusCode::NOT_FOUND, Json(serde_json::json!({"error": "Identity A not found"}))) + })?; + + let id_b = sqlx::query_as::<_, (i32, String)>(&format!( + "SELECT id, name FROM {} WHERE uuid::text = $1 OR REPLACE(uuid::text, '-', '') = $1", + id_table + )) + .bind(&identity_uuid_b) + .fetch_optional(state.db.pool()) + .await + .map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + })? 
+ .ok_or_else(|| { + (StatusCode::NOT_FOUND, Json(serde_json::json!({"error": "Identity B not found"}))) + })?; + + // Stage 2: Find first frame where both identity_ids appear + let cooccur: Option<(i64,)> = sqlx::query_as( + &format!( + "SELECT MIN(fd.frame_number)::bigint FROM {} fd \ + WHERE fd.file_uuid = $1 AND fd.identity_id = $2 \ + AND fd.frame_number IN ( \ + SELECT frame_number FROM {} \ + WHERE file_uuid = $1 AND identity_id = $3 \ + )", + fd_table, fd_table + ) + ) + .bind(&file_uuid) + .bind(id_a.0) + .bind(id_b.0) + .fetch_optional(state.db.pool()) + .await + .map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + })?; + + let (first_frame,) = cooccur.ok_or_else(|| { + (StatusCode::NOT_FOUND, Json(serde_json::json!({"error": "These two identities never appear together in this file"}))) + })?; + + // Get fps for timestamp + let video_table = schema::table_name("videos"); + let fps: f64 = sqlx::query_scalar(&format!( + "SELECT COALESCE(fps, 25.0) FROM {} WHERE file_uuid = $1", video_table + )) + .bind(&file_uuid) + .fetch_optional(state.db.pool()) + .await + .map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + })? 
+ .unwrap_or(25.0); + + // Stage 3: Get trace_ids for both at this frame + let trace_a: Option<(i32,)> = sqlx::query_as( + &format!("SELECT trace_id FROM {} WHERE file_uuid = $1 AND frame_number = $2 AND identity_id = $3 AND trace_id IS NOT NULL LIMIT 1", fd_table) + ) + .bind(&file_uuid).bind(first_frame).bind(id_a.0) + .fetch_optional(state.db.pool()).await + .map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + })?; + + let trace_b: Option<(i32,)> = sqlx::query_as( + &format!("SELECT trace_id FROM {} WHERE file_uuid = $1 AND frame_number = $2 AND identity_id = $3 AND trace_id IS NOT NULL LIMIT 1", fd_table) + ) + .bind(&file_uuid).bind(first_frame).bind(id_b.0) + .fetch_optional(state.db.pool()).await + .map_err(|e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + })?; + + // Stage 4: Get representative faces for both traces (reusing select_rep_face) + let rep_a = if let Some((tid,)) = trace_a { + select_rep_face(state.db.pool(), &file_uuid, tid, |e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + }).await.ok().map(|sel| CoOccurRepFace { + frame_number: sel.frame, + bbox: RepFaceBbox { x: sel.x, y: sel.y, width: sel.w, height: sel.h }, + confidence: sel.conf, + thumbnail_url: format!("/api/v1/file/{}/trace/{}/thumbnail", file_uuid, tid), + }) + } else { None }; + + let rep_b = if let Some((tid,)) = trace_b { + select_rep_face(state.db.pool(), &file_uuid, tid, |e| { + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()}))) + }).await.ok().map(|sel| CoOccurRepFace { + frame_number: sel.frame, + bbox: RepFaceBbox { x: sel.x, y: sel.y, width: sel.w, height: sel.h }, + confidence: sel.conf, + thumbnail_url: format!("/api/v1/file/{}/trace/{}/thumbnail", file_uuid, tid), + }) + } else { None }; + + // Total co-occurrence frames (from TKG if available, otherwise from face_detections) + let 
total_cooccurrence_frames: i64 = sqlx::query_scalar( + &format!( + "SELECT COUNT(DISTINCT fd.frame_number)::bigint FROM {} fd \ + WHERE fd.file_uuid = $1 AND fd.identity_id = $2 \ + AND fd.frame_number IN ( \ + SELECT frame_number FROM {} \ + WHERE file_uuid = $1 AND identity_id = $3 \ + )", + fd_table, fd_table + ) + ) + .bind(&file_uuid).bind(id_a.0).bind(id_b.0) + .fetch_one(state.db.pool()).await + .unwrap_or(0); + + Ok(Json(CoOccurResponse { + success: true, + file_uuid, + identity_a: CoOccurIdentity { + identity_uuid: identity_uuid_a, + name: id_a.1, + trace_id: trace_a.map(|t| t.0).unwrap_or(0), + }, + identity_b: CoOccurIdentity { + identity_uuid: identity_uuid_b, + name: id_b.1, + trace_id: trace_b.map(|t| t.0).unwrap_or(0), + }, + first_cooccurrence: CoOccurrence { + frame_number: first_frame, + timestamp_secs: first_frame as f64 / fps, + total_cooccurrence_frames, + representative_face_a: rep_a, + representative_face_b: rep_b, + }, + })) +} + +use crate::core::config::OUTPUT_DIR; + +#[derive(Serialize)] +struct TkgRebuildResponse { + success: bool, + file_uuid: String, + result: Option, + error: Option, +} + +async fn rebuild_tkg( + State(state): State, + Path(file_uuid): Path, +) -> Json { + let result = crate::core::processor::tkg::build_tkg( + &state.db, + &file_uuid, + &OUTPUT_DIR, + ) + .await; + + match result { + Ok(r) => Json(TkgRebuildResponse { + success: true, + file_uuid, + result: Some(serde_json::json!({ + "face_trace_nodes": r.face_trace_nodes, + "object_nodes": r.object_nodes, + "speaker_nodes": r.speaker_nodes, + "co_occurrence_edges": r.co_occurrence_edges, + "speaker_face_edges": r.speaker_face_edges, + "face_face_edges": r.face_face_edges, + })), + error: None, + }), + Err(e) => Json(TkgRebuildResponse { + success: false, + file_uuid, + result: None, + error: Some(e.to_string()), + }), + } +} diff --git a/src/core/processor/tkg.rs b/src/core/processor/tkg.rs index 39a7626..9cd7d3d 100644 --- a/src/core/processor/tkg.rs +++ 
b/src/core/processor/tkg.rs @@ -15,6 +15,92 @@ fn t(name: &str) -> String { } }
// ── Pose data from face.json ────────────────────────────────────────

/// One face entry read from `<file_uuid>.face.json`: the frame it belongs to,
/// its bounding box and its head-pose angles.
#[derive(Debug, Clone)]
struct FacePose {
    frame: i64,
    x: f64,
    y: f64,
    w: f64,
    h: f64,
    yaw: f64,
    pitch: f64,
    roll: f64,
}

/// Reads `<output_dir>/<file_uuid>.face.json` and flattens `frames[].faces[]`
/// into a list of [`FacePose`] values.
///
/// Faces without a `bbox` or a `pose` object are skipped. A missing frame
/// number becomes `0` and a missing numeric field becomes `0.0`.
///
/// # Errors
/// Fails when the file cannot be read or is not valid JSON.
fn load_face_pose_data(output_dir: &str, file_uuid: &str) -> Result<Vec<FacePose>> {
    let path = Path::new(output_dir).join(format!("{}.face.json", file_uuid));
    let content = std::fs::read_to_string(&path)
        .with_context(|| format!("Failed to read face.json: {}", path.display()))?;
    let json: serde_json::Value = serde_json::from_str(&content)?;

    // Numeric field of a JSON object, defaulting to 0.0 when absent or not a number.
    let number = |obj: &serde_json::Value, key: &str| -> f64 {
        obj.get(key).and_then(|v| v.as_f64()).unwrap_or(0.0)
    };

    let frames = json
        .get("frames")
        .and_then(|v| v.as_array())
        .map(Vec::as_slice)
        .unwrap_or(&[]);

    let mut poses = Vec::new();
    for entry in frames {
        let frame = entry.get("frame").and_then(|v| v.as_i64()).unwrap_or(0);
        let faces = entry
            .get("faces")
            .and_then(|v| v.as_array())
            .map(Vec::as_slice)
            .unwrap_or(&[]);

        poses.extend(faces.iter().filter_map(|face| {
            let bbox = face.get("bbox")?;
            let pose = face.get("pose")?;
            Some(FacePose {
                frame,
                x: number(bbox, "x"),
                y: number(bbox, "y"),
                w: number(bbox, "width"),
                h: number(bbox, "height"),
                yaw: number(pose, "yaw"),
                pitch: number(pose, "pitch"),
                roll: number(pose, "roll"),
            })
        }));
    }
    Ok(poses)
}

/// Finds the pose belonging to a detection `(frame, x, y, w, h)`.
///
/// Among the poses of the same frame, the one whose bounding-box center is
/// closest (sum of absolute x and y differences) wins; on a tie the earliest
/// entry is kept. Returns `(yaw, pitch, roll)`, or `None` when the frame has
/// no pose. Every call scans the whole `poses` slice.
fn get_pose_for_face(frame: i64, x: f64, y: f64, w: f64, h: f64, poses: &[FacePose]) -> Option<(f64, f64, f64)> {
    let (cx, cy) = (x + w / 2.0, y + h / 2.0);
    poses
        .iter()
        .filter(|p| p.frame == frame)
        .map(|p| {
            let dist = (cx - (p.x + p.w / 2.0)).abs() + (cy - (p.y + p.h / 2.0)).abs();
            (dist, p)
        })
        // Only distances strictly below f64::MAX can win; this also drops NaN.
        .filter(|(dist, _)| *dist < f64::MAX)
        .min_by(|(a, _), (b, _)| a.total_cmp(b))
        .map(|(_, p)| (p.yaw, p.pitch, p.roll))
}

/// Reports whether two faces look toward each other, judged by yaw only.
///
/// The face whose box center lies further left must have `yaw > threshold`
/// (looking right) and the other one `yaw < -threshold` (looking left). When
/// the centers are not ordered A-before-B, A is treated as the right-hand face.
fn detect_mutual_gaze(
    bbox_a_x: f64, bbox_a_w: f64, yaw_a: f64,
    bbox_b_x: f64, bbox_b_w: f64, yaw_b: f64,
    threshold: f64,
) -> bool {
    let center_a = bbox_a_x + bbox_a_w / 2.0;
    let center_b = bbox_b_x + bbox_b_w / 2.0;
    // A on the left: A must look right and B must look left; otherwise the roles swap.
    let (left_yaw, right_yaw) = if center_a < center_b {
        (yaw_a, yaw_b)
    } else {
        (yaw_b, yaw_a)
    };
    left_yaw > threshold && right_yaw < -threshold
}

// ── Input data structs ──────────────────────────────────────────── #[derive(Debug, Deserialize)] @@ -108,13 +194,16 @@ pub struct TkgResult { pub async fn build_tkg(db: &PostgresDb, file_uuid: &str, output_dir: &str) -> Result { let pool = db.pool(); - let n_face = build_face_trace_nodes(pool, file_uuid).await?; + let pose_data = load_face_pose_data(output_dir, file_uuid).unwrap_or_default(); + tracing::info!("[TKG] Loaded {} pose entries from face.json", pose_data.len()); + + let n_face = build_face_trace_nodes(pool, file_uuid, &pose_data).await?; let n_objects = build_yolo_object_nodes(pool, file_uuid, output_dir).await?; let n_speakers = build_speaker_nodes(pool, file_uuid, output_dir).await?; let e_co = build_co_occurrence_edges(pool, file_uuid, output_dir).await?; let e_sf = build_speaker_face_edges(pool, file_uuid, output_dir).await?; - let e_ff = build_face_face_edges(pool, file_uuid).await?; + let e_ff = build_face_face_edges(pool, file_uuid, &pose_data).await?; Ok(TkgResult { face_trace_nodes: n_face, @@ -128,16 +217,16 @@ pub async fn 
build_tkg(db: &PostgresDb, file_uuid: &str, output_dir: &str) -> Re // ── Node builders ───────────────────────────────────────────────── -async fn build_face_trace_nodes(pool: &PgPool, file_uuid: &str) -> Result { +async fn build_face_trace_nodes(pool: &PgPool, file_uuid: &str, pose_data: &[FacePose]) -> Result { let face_table = t("face_detections"); let nodes_table = t("tkg_nodes"); let rows = sqlx::query_as::<_, FaceTraceRow>(&format!( r#" - SELECT trace_id, + SELECT trace_id::bigint, COUNT(*)::bigint as frame_count, - MIN(frame_number) as start_f, - MAX(frame_number) as end_f, + MIN(frame_number)::bigint as start_f, + MAX(frame_number)::bigint as end_f, AVG(x::float8) as avg_x, AVG(y::float8) as avg_y, AVG(width::float8) as avg_w, @@ -153,10 +242,53 @@ async fn build_face_trace_nodes(pool: &PgPool, file_uuid: &str) -> Result .fetch_all(pool) .await?; + // Load per-frame data for pose matching + let frame_rows: Vec<(i64, i64, f64, f64, f64, f64)> = sqlx::query_as( + &format!( + "SELECT trace_id::bigint, frame_number::bigint, x::float8, y::float8, width::float8, height::float8 \ + FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL ORDER BY trace_id, frame_number", + face_table + ) + ) + .bind(file_uuid) + .fetch_all(pool) + .await?; + + // Group by trace_id: trace_id → Vec<(frame, x, y, w, h)> + let mut trace_frames: HashMap> = HashMap::new(); + for (tid, frame, x, y, w, h) in &frame_rows { + trace_frames.entry(*tid).or_default().push((*frame, *x, *y, *w, *h)); + } + let mut count = 0; for row in &rows { - let external_id = format!("trace_{}", row.trace_id); - let label = format!("Face Trace {}", row.trace_id); + let tid = row.trace_id; + let external_id = format!("trace_{}", tid); + let label = format!("Face Trace {}", tid); + + // Compute average pose for this trace + let mut yaw_sum = 0.0f64; + let mut pitch_sum = 0.0f64; + let mut roll_sum = 0.0f64; + let mut pose_count = 0i64; + + if let Some(frames) = trace_frames.get(&tid) { + for (frame, x, y, w, h) 
in frames { + if let Some((yaw, pitch, roll)) = get_pose_for_face(*frame, *x, *y, *w, *h, pose_data) { + yaw_sum += yaw; + pitch_sum += pitch; + roll_sum += roll; + pose_count += 1; + } + } + } + + let (avg_yaw, avg_pitch, avg_roll) = if pose_count > 0 { + (yaw_sum / pose_count as f64, pitch_sum / pose_count as f64, roll_sum / pose_count as f64) + } else { + (0.0, 0.0, 0.0) + }; + let props = serde_json::json!({ "frame_count": row.frame_count, "start_frame": row.start_f, @@ -166,7 +298,11 @@ async fn build_face_trace_nodes(pool: &PgPool, file_uuid: &str) -> Result "y": row.avg_y.unwrap_or(0.0).round() as i64, "width": row.avg_w.unwrap_or(0.0).round() as i64, "height": row.avg_h.unwrap_or(0.0).round() as i64, - } + }, + "avg_yaw": (avg_yaw * 1000.0).round() / 1000.0, + "avg_pitch": (avg_pitch * 1000.0).round() / 1000.0, + "avg_roll": (avg_roll * 1000.0).round() / 1000.0, + "pose_count": pose_count, }); sqlx::query(&format!( @@ -312,7 +448,7 @@ async fn build_co_occurrence_edges( let edges_table = t("tkg_edges"); let face_rows = sqlx::query_as::<_, FaceDetectionRow>(&format!( - r#"SELECT trace_id, frame_number, x, y, width, height + r#"SELECT trace_id::bigint, frame_number::bigint, x::float8, y::float8, width::float8, height::float8 FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL ORDER BY frame_number"#, face_table @@ -429,7 +565,7 @@ async fn build_speaker_face_edges( let edges_table = t("tkg_edges"); let traces = sqlx::query_as::<_, (i64, i64, i64)>(&format!( - r#"SELECT trace_id, MIN(frame_number) as start_f, MAX(frame_number) as end_f + r#"SELECT trace_id::bigint, MIN(frame_number)::bigint as start_f, MAX(frame_number)::bigint as end_f FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL GROUP BY trace_id"#, face_table @@ -533,14 +669,15 @@ async fn build_speaker_face_edges( Ok(edge_count) } -async fn build_face_face_edges(pool: &PgPool, file_uuid: &str) -> Result { +async fn build_face_face_edges(pool: &PgPool, file_uuid: &str, pose_data: &[FacePose]) -> 
Result { let face_table = t("face_detections"); let nodes_table = t("tkg_nodes"); let edges_table = t("tkg_edges"); + // Use SQL JOIN for fast co-occurrence detection let rows: Vec<(i64, i64, i64)> = sqlx::query_as(&format!( r#" - SELECT a.trace_id AS tid_a, b.trace_id AS tid_b, a.frame_number + SELECT a.trace_id::bigint AS tid_a, b.trace_id::bigint AS tid_b, a.frame_number::bigint FROM {} a JOIN {} b ON a.file_uuid = b.file_uuid @@ -557,53 +694,123 @@ async fn build_face_face_edges(pool: &PgPool, file_uuid: &str) -> Result .fetch_all(pool) .await?; - if rows.is_empty() { - return Ok(0); + // Also load per-frame bbox for mutual_gaze lookups + let bbox_data: Vec<(i64, i64, f64, f64, f64, f64)> = sqlx::query_as( + &format!( + "SELECT trace_id::bigint, frame_number::bigint, x::float8, y::float8, width::float8, height::float8 \ + FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL ORDER BY trace_id, frame_number", + face_table + ) + ) + .bind(file_uuid) + .fetch_all(pool) + .await?; + + let mut frame_map: HashMap<(i64, i64), (f64, f64, f64, f64)> = HashMap::new(); // (trace_id, frame) → (x, y, w, h) + for (tid, frame, x, y, w, h) in &bbox_data { + frame_map.insert((*tid, *frame), (*x, *y, *w, *h)); } - // Deduplicate by pair - let mut pair_frames: HashMap<(i64, i64), Vec> = HashMap::new(); + // Group by pair + let mut pair_frames: HashMap<(i64, i64), Vec<(i64, bool)>> = HashMap::new(); for (tid_a, tid_b, frame) in &rows { - let key = if *tid_a < *tid_b { - (*tid_a, *tid_b) - } else { - (*tid_b, *tid_a) + let key = (*tid_a.min(tid_b), *tid_a.max(tid_b)); + let bbox_a = frame_map.get(&(*tid_a, *frame)); + let bbox_b = frame_map.get(&(*tid_b, *frame)); + + let gaze = match (bbox_a, bbox_b) { + (Some(&(xa, ya, wa, ha)), Some(&(xb, yb, wb, hb))) => { + get_pose_for_face(*frame, xa, ya, wa, ha, pose_data) + .and_then(|(yaw_a, _, _)| { + get_pose_for_face(*frame, xb, yb, wb, hb, pose_data) + .map(|(yaw_b, _, _)| detect_mutual_gaze(xa, wa, yaw_a, xb, wb, yaw_b, 0.05)) + }) + 
.unwrap_or(false) + } + _ => false, }; - pair_frames.entry(key).or_default().push(*frame); + pair_frames.entry(key).or_default().push((*frame, gaze)); } let mut edge_count = 0; - for ((tid_a, tid_b), frames) in &pair_frames { + // Cache node IDs to avoid repeated queries + let mut node_id_cache: HashMap = HashMap::new(); + for ((tid_a, tid_b), frame_data) in &pair_frames { let ext_a = format!("trace_{}", tid_a); let ext_b = format!("trace_{}", tid_b); - let n_a: Option<(i64,)> = sqlx::query_as(&format!( - "SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2", - nodes_table - )) - .bind(file_uuid) - .bind(&ext_a) - .fetch_optional(pool) - .await?; - - let n_b: Option<(i64,)> = sqlx::query_as(&format!( - "SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2", - nodes_table - )) - .bind(file_uuid) - .bind(&ext_b) - .fetch_optional(pool) - .await?; - - let (n_a_id, n_b_id) = match (n_a, n_b) { - (Some((a,)), Some((b,))) => (a, b), - _ => continue, + let n_a_id = match node_id_cache.get(tid_a) { + Some(id) => *id, + None => { + if let Some((id,)) = sqlx::query_as::<_, (i64,)>(&format!( + "SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2", + nodes_table + )) + .bind(file_uuid).bind(&ext_a).fetch_optional(pool).await? + { + node_id_cache.insert(*tid_a, id); + id + } else { continue; } + } }; - let edge_props = serde_json::json!({ - "first_frame": frames[0], - "frame_count": frames.len() as i64, - }); + let n_b_id = match node_id_cache.get(tid_b) { + Some(id) => *id, + None => { + if let Some((id,)) = sqlx::query_as::<_, (i64,)>(&format!( + "SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2", + nodes_table + )) + .bind(file_uuid).bind(&ext_b).fetch_optional(pool).await? 
+ { + node_id_cache.insert(*tid_b, id); + id + } else { continue; } + } + }; + + let frames: Vec = frame_data.iter().map(|(f, _)| *f).collect(); + let gaze_frames: Vec = frame_data.iter().filter(|(_, g)| *g).map(|(f, _)| *f).collect(); + let gaze_count = gaze_frames.len() as i64; + let has_gaze = gaze_count > 0; + + let edge_props = if has_gaze { + // Compute average yaw values for gaze frames + let mut yaw_a_sum = 0.0f64; + let mut yaw_b_sum = 0.0f64; + let mut gaze_sample = 0i64; + for (frame, _) in frame_data.iter().filter(|(_, g)| *g) { + let bbox_a = frame_map.get(&(*tid_a, *frame)); + let bbox_b = frame_map.get(&(*tid_b, *frame)); + if let (Some(&(xa, ya, wa, ha)), Some(&(xb, yb, wb, hb))) = (bbox_a, bbox_b) { + let pose_a = get_pose_for_face(*frame, xa, ya, wa, ha, pose_data); + let pose_b = get_pose_for_face(*frame, xb, yb, wb, hb, pose_data); + if let (Some((ya, _, _)), Some((yb, _, _))) = (pose_a, pose_b) { + yaw_a_sum += ya; + yaw_b_sum += yb; + gaze_sample += 1; + } + } + } + let (avg_ya, avg_yb) = if gaze_sample > 0 { + (yaw_a_sum / gaze_sample as f64, yaw_b_sum / gaze_sample as f64) + } else { (0.0, 0.0) }; + + serde_json::json!({ + "first_frame": frames[0], + "frame_count": frames.len() as i64, + "mutual_gaze": true, + "gaze_frame_count": gaze_count, + "yaw_a_avg": (avg_ya * 1000.0).round() / 1000.0, + "yaw_b_avg": (avg_yb * 1000.0).round() / 1000.0, + }) + } else { + serde_json::json!({ + "first_frame": frames[0], + "frame_count": frames.len() as i64, + "mutual_gaze": false, + }) + }; sqlx::query(&format!( r#"