Merge branch 'main' of http://192.168.110.200:3000/admin/momentry_core
This commit is contained in:
317
deliverable_v1.1.0/modules/08_media.md
Normal file
317
deliverable_v1.1.0/modules/08_media.md
Normal file
@@ -0,0 +1,317 @@
|
||||
<!-- module: media -->
|
||||
<!-- description: Video streaming & frame extraction -->
|
||||
<!-- depends: 01_auth -->
|
||||
|
||||
## Video Streaming & Frame Extraction
|
||||
|
||||
All video streaming endpoints support the following common query parameters:
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `mode` | string | No | `normal` | `normal` or `debug` (draws detection overlays) |
|
||||
| `audio` | string | No | `on` | `on` or `off` |
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/video`
|
||||
|
||||
Stream the full video file with range support for seeking.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Response
|
||||
|
||||
- **200**: Video stream (`Content-Type` based on file extension)
|
||||
- **206**: Partial content (range request)
|
||||
- Supports `Range` header for seeking
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/trace/:trace_id/video`
|
||||
|
||||
Stream video with highlights for a specific face trace (follows a single person across frames with bounding box overlay).
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/trace/:trace_id/representative-face`
|
||||
|
||||
Find the best single face to represent this trace. Uses a two-stage selection: SQL (area × confidence → top 10) then FFmpeg `blurdetect` (sharpness → pick the least blurry).
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/representative-face" \
|
||||
-H "X-API-Key: $KEY"
|
||||
```
|
||||
|
||||
#### Response (200)
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"file_uuid": "aeed71342a899fe4b4c57b7d41bcb692",
|
||||
"trace_id": 1939,
|
||||
"face_count": 538,
|
||||
"representative": {
|
||||
"frame_number": 68193,
|
||||
"timestamp_secs": 2727.72,
|
||||
"bbox": { "x": 347, "y": 378, "width": 427, "height": 427 },
|
||||
"confidence": 0.760,
|
||||
"quality_score": 138516,
|
||||
"blur_score": 9.46
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Response Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `trace_id` | integer | Face trace ID |
|
||||
| `face_count` | integer | Total face detections in this trace |
|
||||
| `representative.frame_number` | integer | Frame number of the selected face (primary coordinate) |
|
||||
| `representative.timestamp_secs` | float | Time in seconds (derived from `frame_number / fps`) |
|
||||
| `representative.bbox` | object | Bounding box `{x, y, width, height}` |
|
||||
| `representative.confidence` | float | Detection confidence (0.0–1.0) |
|
||||
| `representative.quality_score` | float | Pre-selection score (`area × confidence`) |
|
||||
| `representative.blur_score` | float | FFmpeg blurdetect result (lower = sharper) |
|
||||
|
||||
#### Error Responses
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/trace/:trace_id/thumbnail`
|
||||
|
||||
Extract the best face image for a trace as JPEG (320×320). Internally selects the face using the same two-stage algorithm as `representative-face`, then crops via FFmpeg. The result is cacheable for 24 hours.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/thumbnail" \
|
||||
-H "X-API-Key: $KEY" -o trace_1939_face.jpg
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
- **200**: `image/jpeg` binary data (320×320 cropped face)
|
||||
- **404**: File, trace not found, or no suitable face
|
||||
- **500**: FFmpeg or database error
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/identities/:identity_uuid_a/co-occur-with/:identity_uuid_b`
|
||||
|
||||
Find the first frame where two identities appear together, with representative face thumbnails for both.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
# First frame where Audrey Hepburn & Cary Grant appear together
|
||||
curl -s "$API/api/v1/file/$FILE_UUID/identities/$AUDREY_UUID/co-occur-with/$CARY_UUID" \
|
||||
-H "X-API-Key: $KEY" | jq '{identity_a: .identity_a.name, identity_b: .identity_b.name, first_frame: .first_cooccurrence.frame_number}'
|
||||
```
|
||||
|
||||
#### Response (200)
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"file_uuid": "aeed71342a899fe4b4c57b7d41bcb692",
|
||||
"identity_a": {
|
||||
"identity_uuid": "c3545906-c82d-4b66-aa1d-150bc02decce",
|
||||
"name": "Audrey Hepburn",
|
||||
"trace_id": 920
|
||||
},
|
||||
"identity_b": {
|
||||
"identity_uuid": "2b0ddefe-e2a9-4533-9308-b375594604d5",
|
||||
"name": "Cary Grant",
|
||||
"trace_id": 919
|
||||
},
|
||||
"first_cooccurrence": {
|
||||
"frame_number": 38165,
|
||||
"timestamp_secs": 1526.60,
|
||||
"total_cooccurrence_frames": 3136,
|
||||
"representative_face_a": {
|
||||
"frame_number": 38199,
|
||||
"bbox": { "x": 122, "y": 339, "width": 176, "height": 176 },
|
||||
"confidence": 0.832,
|
||||
"thumbnail_url": "/api/v1/file/aeed71342.../trace/920/thumbnail"
|
||||
},
|
||||
"representative_face_b": {
|
||||
"frame_number": 38291,
|
||||
"bbox": { "x": 511, "y": 315, "width": 192, "height": 192 },
|
||||
"confidence": 0.791,
|
||||
"thumbnail_url": "/api/v1/file/aeed71342.../trace/919/thumbnail"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Response Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `identity_a.name` | string | First identity name |
|
||||
| `identity_b.name` | string | Second identity name |
|
||||
| `first_cooccurrence.frame_number` | int | First frame where both appear |
|
||||
| `first_cooccurrence.timestamp_secs` | float | Time in seconds |
|
||||
| `first_cooccurrence.total_cooccurrence_frames` | int | Total frames with both present |
|
||||
| `first_cooccurrence.representative_face_a/b` | object | Best face thumbnail data for each identity |
|
||||
|
||||
#### Error Responses
|
||||
|
||||
| HTTP | When |
|
||||
|------|------|
|
||||
| `404` | File or identity not found |
|
||||
| `404` | The two identities never co-occur in this file |
|
||||
| `500` | Database or FFmpeg error |
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/video/bbox`
|
||||
|
||||
Stream video with bounding box overlay for all detected objects/faces.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
Uses a built-in 5×7 bitmap font renderer to draw labels directly on video frames via FFmpeg `drawtext` filter.
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/thumbnail`
|
||||
|
||||
Extract a single frame from a video as JPEG image. Uses FFmpeg `select` filter.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Query Parameters
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `frame` | integer | Yes | — | Zero-based frame number to extract |
|
||||
| `x` | integer | No | — | Crop start X (left edge). Requires `y`, `w`, `h`. |
|
||||
| `y` | integer | No | — | Crop start Y (top edge). Requires `x`, `w`, `h`. |
|
||||
| `w` | integer | No | — | Crop width in pixels. Requires `x`, `y`, `h`. |
|
||||
| `h` | integer | No | — | Crop height in pixels. Requires `x`, `y`, `w`. |
|
||||
|
||||
All four crop params (`x`, `y`, `w`, `h`) must be provided together or omitted.
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
# Extract frame 1000 (full frame)
|
||||
curl -s "$API/api/v1/file/bd80fec92b0b6963d177a2c55bf713e2/thumbnail?frame=1000" \
|
||||
-H "Authorization: Bearer $JWT" -o frame_1000.jpg
|
||||
|
||||
# Extract and crop face region (x=320, y=240, w=160, h=160)
|
||||
curl -s "$API/api/v1/file/bd80fec92b0b6963d177a2c55bf713e2/thumbnail?frame=1000&x=320&y=240&w=160&h=160" \
|
||||
-H "Authorization: Bearer $JWT" -o face_crop.jpg
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
- **200**: `image/jpeg` binary data
|
||||
- **404**: File not found
|
||||
- **500**: FFmpeg error (e.g., frame number exceeds video duration)
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/clip`
|
||||
|
||||
Extract a video clip (time range) as MPEG-TS stream. Uses FFmpeg `-ss` fast seek.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Query Parameters
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `start_frame` | integer | No* | — | Start frame (zero-based). **Frame-accurate** — use this for precision. |
|
||||
| `end_frame` | integer | No* | — | End frame (zero-based, inclusive). Requires `start_frame`. |
|
||||
| `start_time` | float | No* | — | Start time in seconds. Approximate (FPS-dependent). Fallback if frames not given. |
|
||||
| `end_time` | float | No* | — | End time in seconds. Approximate (FPS-dependent). Fallback if frames not given. |
|
||||
| `fps` | float | No | video FPS | Override frames-per-second for frame↔time calculation. Defaults to video's detected FPS. |
|
||||
| `mode` | string | No | `normal` | `normal` or `debug` (draws "CLIP" overlay) |
|
||||
| `audio` | string | No | `on` | `on` or `off` |
|
||||
|
||||
Either (`start_frame`+`end_frame`) OR (`start_time`+`end_time`) must be provided.
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
# Clip by frame range (primary)
|
||||
curl -s "$API/api/v1/file/bd80fec92b0b6963d177a2c55bf713e2/clip?start_frame=0&end_frame=47" \
|
||||
-H "Authorization: Bearer $JWT" -o clip.ts
|
||||
|
||||
# Clip by time range (fallback)
|
||||
curl -s "$API/api/v1/file/bd80fec92b0b6963d177a2c55bf713e2/clip?start_time=30&end_time=45" \
|
||||
-H "Authorization: Bearer $JWT" -o clip.ts
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
- **200**: `video/mp2t` MPEG-TS stream
|
||||
- **400**: Missing/invalid range parameters
|
||||
- **404**: File not found
|
||||
- **500**: FFmpeg error
|
||||
|
||||
#### Technical Notes
|
||||
|
||||
| Detail | Value |
|
||||
|--------|-------|
|
||||
| **Backend** | FFmpeg (`ffmpeg-full`) |
|
||||
| **Seek** | `-ss` before `-i` (fast keyframe seek) |
|
||||
| **Format** | MPEG-TS (`mpegts` muxer, pipe-safe) |
|
||||
| **Codec** | H.264 + AAC |
|
||||
| **Cache** | `Cache-Control: public, max-age=86400` (24h) |
|
||||
|
||||
### Video vs Clip: Quality & Format Comparison
|
||||
|
||||
Both endpoints support time range extraction, but serve different use cases:
|
||||
|
||||
| Feature | `/video` | `/clip` |
|
||||
|---------|----------|---------|
|
||||
| **No params** | Streams full file (Range seek) | Returns 400 (params required) |
|
||||
| **HTTP Range** | ✅ Supported | ❌ Not supported |
|
||||
| **Encoding** | `-c copy` (zero encoding) | `-c:v libx264 -c:a aac` (re-encode) |
|
||||
| **Quality** | Original (bit-exact, zero loss) | Compressed (default CRF ≈ 23) |
|
||||
| **Format** | `video/mp4` | `video/mp2t` (MPEG-TS) |
|
||||
| **Speed** | Fast (no computation) | Slower (encoding required) |
|
||||
| **Frame control** | Time-based (`dur = (ef-sf)/fps`) | Precise (`-vframes`) |
|
||||
| **Debug mode** | ❌ | ✅ `mode=debug` overlay |
|
||||
| **Cache** | ❌ | ✅ `max-age=86400` |
|
||||
|
||||
#### Usage Recommendation
|
||||
|
||||
| Scenario | Use |
|
||||
|----------|-----|
|
||||
| Full video streaming / player seek | `/video` |
|
||||
| Quick preview clip (zero quality loss) | `/video?start_frame=...&end_frame=...` |
|
||||
| Debug frame verification / text overlay | `/clip?mode=debug` |
|
||||
| Precise frame count control | `/clip` |
|
||||
| CDN cacheable clip | `/clip` |
|
||||
|
||||
---
|
||||
|
||||
#### Technical Notes (`GET /api/v1/file/:file_uuid/thumbnail`)

| Detail | Value |
|
||||
|--------|-------|
|
||||
| **Backend** | FFmpeg (`ffmpeg-full`) |
|
||||
| **Filter** | `select=eq(n\,FRAME)` to select frame, optional `crop=W:H:X:Y` |
|
||||
| **Output** | Single JPEG via pipe (`image2pipe`, `mjpeg` codec) |
|
||||
| **Cache** | `Cache-Control: public, max-age=86400` (24h) |
|
||||
| **Frame number** | Zero-based (`frame=0` = first frame of video) |
|
||||
|
||||
---
|
||||
*Updated: 2026-05-19 12:49:24*
|
||||
@@ -37,6 +37,148 @@ Stream video with highlights for a specific face trace (follows a single person
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/trace/:trace_id/representative-face`
|
||||
|
||||
Find the best single face to represent this trace. Uses a two-stage selection: SQL (area × confidence → top 10) then FFmpeg `blurdetect` (sharpness → pick the least blurry).
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/representative-face" \
|
||||
-H "X-API-Key: $KEY"
|
||||
```
|
||||
|
||||
#### Response (200)
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"file_uuid": "aeed71342a899fe4b4c57b7d41bcb692",
|
||||
"trace_id": 1939,
|
||||
"face_count": 538,
|
||||
"representative": {
|
||||
"frame_number": 68193,
|
||||
"timestamp_secs": 2727.72,
|
||||
"bbox": { "x": 347, "y": 378, "width": 427, "height": 427 },
|
||||
"confidence": 0.760,
|
||||
"quality_score": 138516,
|
||||
"blur_score": 9.46
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Response Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `trace_id` | integer | Face trace ID |
|
||||
| `face_count` | integer | Total face detections in this trace |
|
||||
| `representative.frame_number` | integer | Frame number of the selected face (primary coordinate) |
|
||||
| `representative.timestamp_secs` | float | Time in seconds (derived from `frame_number / fps`) |
|
||||
| `representative.bbox` | object | Bounding box `{x, y, width, height}` |
|
||||
| `representative.confidence` | float | Detection confidence (0.0–1.0) |
|
||||
| `representative.quality_score` | float | Pre-selection score (`area × confidence`) |
|
||||
| `representative.blur_score` | float | FFmpeg blurdetect result (lower = sharper) |
|
||||
|
||||
#### Error Responses
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/trace/:trace_id/thumbnail`
|
||||
|
||||
Extract the best face image for a trace as JPEG (320×320). Internally selects the face using the same two-stage algorithm as `representative-face`, then crops via FFmpeg. The result is cacheable for 24 hours.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/thumbnail" \
|
||||
-H "X-API-Key: $KEY" -o trace_1939_face.jpg
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
- **200**: `image/jpeg` binary data (320×320 cropped face)
|
||||
- **404**: File, trace not found, or no suitable face
|
||||
- **500**: FFmpeg or database error
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/identities/:identity_uuid_a/co-occur-with/:identity_uuid_b`
|
||||
|
||||
Find the first frame where two identities appear together, with representative face thumbnails for both.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
# First frame where Audrey Hepburn & Cary Grant appear together
|
||||
curl -s "$API/api/v1/file/$FILE_UUID/identities/$AUDREY_UUID/co-occur-with/$CARY_UUID" \
|
||||
-H "X-API-Key: $KEY" | jq '{identity_a: .identity_a.name, identity_b: .identity_b.name, first_frame: .first_cooccurrence.frame_number}'
|
||||
```
|
||||
|
||||
#### Response (200)
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"file_uuid": "aeed71342a899fe4b4c57b7d41bcb692",
|
||||
"identity_a": {
|
||||
"identity_uuid": "c3545906-c82d-4b66-aa1d-150bc02decce",
|
||||
"name": "Audrey Hepburn",
|
||||
"trace_id": 920
|
||||
},
|
||||
"identity_b": {
|
||||
"identity_uuid": "2b0ddefe-e2a9-4533-9308-b375594604d5",
|
||||
"name": "Cary Grant",
|
||||
"trace_id": 919
|
||||
},
|
||||
"first_cooccurrence": {
|
||||
"frame_number": 38165,
|
||||
"timestamp_secs": 1526.60,
|
||||
"total_cooccurrence_frames": 3136,
|
||||
"representative_face_a": {
|
||||
"frame_number": 38199,
|
||||
"bbox": { "x": 122, "y": 339, "width": 176, "height": 176 },
|
||||
"confidence": 0.832,
|
||||
"thumbnail_url": "/api/v1/file/aeed71342.../trace/920/thumbnail"
|
||||
},
|
||||
"representative_face_b": {
|
||||
"frame_number": 38291,
|
||||
"bbox": { "x": 511, "y": 315, "width": 192, "height": 192 },
|
||||
"confidence": 0.791,
|
||||
"thumbnail_url": "/api/v1/file/aeed71342.../trace/919/thumbnail"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Response Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `identity_a.name` | string | First identity name |
|
||||
| `identity_b.name` | string | Second identity name |
|
||||
| `first_cooccurrence.frame_number` | int | First frame where both appear |
|
||||
| `first_cooccurrence.timestamp_secs` | float | Time in seconds |
|
||||
| `first_cooccurrence.total_cooccurrence_frames` | int | Total frames with both present |
|
||||
| `first_cooccurrence.representative_face_a/b` | object | Best face thumbnail data for each identity |
|
||||
|
||||
#### Error Responses
|
||||
|
||||
| HTTP | When |
|
||||
|------|------|
|
||||
| `404` | File or identity not found |
|
||||
| `404` | The two identities never co-occur in this file |
|
||||
| `500` | Database or FFmpeg error |
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/video/bbox`
|
||||
|
||||
Stream video with bounding box overlay for all detected objects/faces.
|
||||
|
||||
524
docs_v1.0/DESIGN/TKG_QUERY_API_V1.0.md
Normal file
524
docs_v1.0/DESIGN/TKG_QUERY_API_V1.0.md
Normal file
@@ -0,0 +1,524 @@
|
||||
# TKG Query API V1.0
|
||||
|
||||
用於 Gemma4(LLM)透過 function calling 查詢影片人物互動資料的 API 設計。
|
||||
|
||||
---
|
||||
|
||||
## 1. Overview
|
||||
|
||||
### 目的
|
||||
|
||||
讓 LLM(Gemma 4)可以回答關於影片人物互動的問題,例如「誰是主角」、「第一次同框是什麼時候」。透過 TKG(Trace Knowledge Graph)和 PostgreSQL 直接查詢,不需要 LLM 猜測。
|
||||
|
||||
### 架構
|
||||
|
||||
```
|
||||
User → "誰是這部電影的主角?"
|
||||
↓
|
||||
Gemma4 → function_call: tkg_query(file_uuid, "top_identities")
|
||||
↓
|
||||
API → SQL/TKG 查詢 → 結構化 JSON
|
||||
↓
|
||||
Gemma4 → "男主是 Cary Grant,女主是 Audrey Hepburn..."
|
||||
↓
|
||||
User ← 自然語言回答
|
||||
```
|
||||
|
||||
### 資料流
|
||||
|
||||
| 層級 | 元件 | 說明 |
|
||||
|------|------|------|
|
||||
| LLM | Gemma 4 26B (port 8082) | 解析自然語言 → 決定呼叫哪個 tool |
|
||||
| Function | `tkg_query()` | 8 種 query_type,參數由 LLM 填寫 |
|
||||
| Backend | `POST /api/v1/tkg/query` | 執行 SQL,回傳結構化結果 |
|
||||
| Data | `face_detections`, `identities`, `chunk`, `tkg_nodes/edges` | 查詢來源 |
|
||||
|
||||
---
|
||||
|
||||
## 2. Function Spec(給 LLM)
|
||||
|
||||
### Function Definition
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "tkg_query",
|
||||
"description": "查詢影片的人物、場景、互動資料。根據問題類型選擇適合的 query_type。",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"file_uuid": {
|
||||
"type": "string",
|
||||
"description": "影片的 32 碼 file UUID"
|
||||
},
|
||||
"query_type": {
|
||||
"type": "string",
|
||||
"description": "查詢類型",
|
||||
"enum": [
|
||||
"top_identities",
|
||||
"identity_details",
|
||||
"first_cooccurrence",
|
||||
"identity_traces",
|
||||
"cut_details",
|
||||
"file_info",
|
||||
"mutual_gaze",
|
||||
"interaction_network"
|
||||
]
|
||||
},
|
||||
"identity_a": {
|
||||
"type": "string",
|
||||
"description": "人物A的 identity_uuid 或名字"
|
||||
},
|
||||
"identity_b": {
|
||||
"type": "string",
|
||||
"description": "人物B的 identity_uuid 或名字"
|
||||
},
|
||||
"cut_id": {
|
||||
"type": "string",
|
||||
"description": "場景ID(如 cut_264)"
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "回傳筆數上限",
|
||||
"default": 10
|
||||
}
|
||||
},
|
||||
"required": ["file_uuid", "query_type"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### LLM Prompt 設計
|
||||
|
||||
System prompt 中須包含此工具定義,並提示:
|
||||
|
||||
```
|
||||
你是 Momentry 影片分析系統。當用戶問到影片中的人物、場景、互動問題時,
|
||||
請先呼叫 tkg_query 查詢資料,再根據資料回答。
|
||||
|
||||
注意:
|
||||
- 問題中提到的「男主」、「女主」是指 TMDb cast_order 0 和 1
|
||||
- 「配角」是指 cast_order >= 2 的人物
|
||||
- 「第一次同框」使用 first_cooccurrence
|
||||
- 「誰最多鏡頭」使用 top_identities 搭配 face_count 排序
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. API Endpoint
|
||||
|
||||
### `POST /api/v1/tkg/query`
|
||||
|
||||
Request:
|
||||
```json
|
||||
{
|
||||
"file_uuid": "aeed71342a899fe4b4c57b7d41bcb692",
|
||||
"query_type": "top_identities",
|
||||
"identity_a": null,
|
||||
"identity_b": null,
|
||||
"cut_id": null,
|
||||
"limit": 10
|
||||
}
|
||||
```
|
||||
|
||||
Response(通用包裝):
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"query_type": "top_identities",
|
||||
"file_uuid": "aeed71342...",
|
||||
"data": { ... },
|
||||
"took_ms": 12
|
||||
}
|
||||
```
|
||||
|
||||
Error:
|
||||
```json
|
||||
{
|
||||
"success": false,
|
||||
"error": "File not found",
|
||||
"query_type": "top_identities",
|
||||
"file_uuid": "aeed71342..."
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Query Types 詳解
|
||||
|
||||
### 4.1 `top_identities` — 人物重要性排名
|
||||
|
||||
**用途**:找出影片中的所有人物,依 TMDb cast_order 排序
|
||||
|
||||
**SQL**:
|
||||
```sql
|
||||
SELECT i.id, i.name,
|
||||
(i.metadata->>'tmdb_cast_order')::int as cast_order,
|
||||
i.metadata->>'tmdb_character' as role,
|
||||
i.source, i.status,
|
||||
COUNT(fd.id) as face_count,
|
||||
COUNT(DISTINCT fd.trace_id) as trace_count,
|
||||
ROUND(MIN(fd.frame_number)::numeric / GREATEST(v.fps, 1), 2) as first_appearance_sec,
|
||||
ROUND(MAX(fd.frame_number)::numeric / GREATEST(v.fps, 1), 2) as last_appearance_sec
|
||||
FROM identities i
|
||||
LEFT JOIN face_detections fd ON fd.identity_id = i.id AND fd.file_uuid = $1
|
||||
LEFT JOIN videos v ON v.file_uuid = $1
|
||||
WHERE i.source = 'tmdb'
|
||||
AND (i.metadata->>'tmdb_cast_order')::int IS NOT NULL
|
||||
GROUP BY i.id, i.name, i.metadata, i.source, i.status, v.fps
|
||||
ORDER BY cast_order ASC
|
||||
LIMIT $2
|
||||
```
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"total": 23,
|
||||
"leads": [
|
||||
{"name": "Cary Grant", "cast_order": 0, "role": "Peter Joshua", "face_count": 10643},
|
||||
{"name": "Audrey Hepburn", "cast_order": 1, "role": "Regina Lampert", "face_count": 16456}
|
||||
],
|
||||
"supporting": [
|
||||
{"name": "Walter Matthau", "cast_order": 2, "role": "Hamilton Bartholemew", "face_count": 2319},
|
||||
{"name": "James Coburn", "cast_order": 3, "role": "Tex Panthollow", "face_count": 3572},
|
||||
{"name": "George Kennedy", "cast_order": 4, "role": "Herman Scobie", "face_count": 1817}
|
||||
],
|
||||
"text_summary": "主演:Cary Grant 飾演 Peter Joshua,Audrey Hepburn 飾演 Regina Lampert。主要配角:Walter Matthau(cast_order 2)等 21 人。"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4.2 `identity_details` — 人物詳細資料
|
||||
|
||||
**SQL**:
|
||||
```sql
|
||||
SELECT i.id, i.name, i.identity_type, i.source, i.status,
|
||||
(i.metadata->>'tmdb_cast_order')::int as cast_order,
|
||||
i.metadata->>'tmdb_character' as role,
|
||||
i.metadata->>'tmdb_movie_title' as movie,
|
||||
i.metadata->>'tmdb_biography' as biography,
|
||||
COUNT(fd.id) as face_count,
|
||||
COUNT(DISTINCT fd.trace_id) as trace_count,
|
||||
MIN(fd.frame_number) as first_frame,
|
||||
MAX(fd.frame_number) as last_frame
|
||||
FROM identities i
|
||||
LEFT JOIN face_detections fd ON fd.identity_id = i.id AND fd.file_uuid = $1
|
||||
WHERE (i.name ILIKE $2 OR i.uuid::text = $2 OR REPLACE(i.uuid::text, '-', '') = $2)
|
||||
AND i.source = 'tmdb'
|
||||
GROUP BY i.id, i.name, i.identity_type, i.source, i.status, i.metadata
|
||||
LIMIT 1
|
||||
```
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"name": "Audrey Hepburn",
|
||||
"role": "Regina Lampert",
|
||||
"cast_order": 1,
|
||||
"face_count": 16456,
|
||||
"trace_count": 457,
|
||||
"first_appearance_sec": 206.76,
|
||||
"last_appearance_sec": 6756.68,
|
||||
"biography": "Audrey Hepburn (born Audrey Kathleen Ruston; 4 May 1929 – 20 January 1993)..."
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4.3 `first_cooccurrence` — 第一次同框
|
||||
|
||||
**邏輯**:找出兩個 identity 第一次同時出現的 frame。
|
||||
|
||||
**SQL**:
|
||||
```sql
|
||||
SELECT MIN(fd_a.frame_number)::bigint as first_frame,
|
||||
COUNT(DISTINCT fd_a.frame_number)::bigint as total_cooccurrence_frames
|
||||
FROM face_detections fd_a
|
||||
JOIN face_detections fd_b ON fd_a.file_uuid = fd_b.file_uuid
|
||||
AND fd_a.frame_number = fd_b.frame_number
|
||||
WHERE fd_a.file_uuid = $1
|
||||
AND fd_a.identity_id = (SELECT id FROM identities WHERE name ILIKE $2 OR REPLACE(uuid::text, '-', '') = REPLACE($2, '-', '') LIMIT 1)
|
||||
AND fd_b.identity_id = (SELECT id FROM identities WHERE name ILIKE $3 OR REPLACE(uuid::text, '-', '') = REPLACE($3, '-', '') LIMIT 1)
|
||||
```
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"identity_a": {"name": "Audrey Hepburn"},
|
||||
"identity_b": {"name": "Cary Grant"},
|
||||
"first_frame": 38165,
|
||||
"timestamp_secs": 1526.60,
|
||||
"cut_id": "cut_264",
|
||||
"total_cooccurrence_frames": 3136,
|
||||
"representative_thumbnail_a": "/api/v1/file/{uuid}/trace/920/thumbnail",
|
||||
"representative_thumbnail_b": "/api/v1/file/{uuid}/trace/919/thumbnail"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4.4 `identity_traces` — 人物出場片段
|
||||
|
||||
**SQL**:
|
||||
```sql
|
||||
SELECT fd.trace_id, COUNT(*) as face_count,
|
||||
MIN(fd.frame_number) as start_frame,
|
||||
MAX(fd.frame_number) as end_frame,
|
||||
COUNT(DISTINCT fd.frame_number) as frame_span
|
||||
FROM face_detections fd
|
||||
WHERE fd.file_uuid = $1
|
||||
AND fd.identity_id = (SELECT id FROM identities WHERE name ILIKE $2 OR REPLACE(uuid::text, '-', '') = REPLACE($2, '-', '') LIMIT 1)
|
||||
GROUP BY fd.trace_id
|
||||
ORDER BY face_count DESC
|
||||
LIMIT $3
|
||||
```
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"name": "Audrey Hepburn",
|
||||
"total_traces": 457,
|
||||
"top_traces": [
|
||||
{"trace_id": 920, "face_count": 53, "start_frame": 38165, "end_frame": 38321,
|
||||
"representative": "/api/v1/file/{uuid}/trace/920/thumbnail"},
|
||||
...
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4.5 `cut_details` — 場景資訊
|
||||
|
||||
**SQL**:
|
||||
```sql
|
||||
SELECT chunk_id, start_frame, end_frame,
|
||||
ROUND(start_frame::numeric / fps, 2) as start_time,
|
||||
ROUND(end_frame::numeric / fps, 2) as end_time,
|
||||
text_content, summary_text
|
||||
FROM chunk
|
||||
WHERE file_uuid = $1 AND chunk_id = $2 AND chunk_type = 'cut'
|
||||
```
|
||||
|
||||
**Response**:
|
||||
```json
|
||||
{
|
||||
"cut_id": "cut_264",
|
||||
"frame_range": [38164, 38324],
|
||||
"duration_sec": 6.4,
|
||||
"summary": "Audrey Hepburn and Cary Grant engage in a brief verbal exchange...",
|
||||
"identities_present": ["Audrey Hepburn", "Cary Grant"]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4.6 `file_info` — 影片基本資訊
|
||||
|
||||
**SQL**:
|
||||
```sql
|
||||
SELECT file_name, file_path, duration, width, height, fps,
|
||||
(SELECT COUNT(*) FROM face_detections WHERE file_uuid = $1) as total_faces,
|
||||
(SELECT COUNT(DISTINCT trace_id) FROM face_detections WHERE file_uuid = $1 AND trace_id IS NOT NULL) as total_traces,
|
||||
(SELECT COUNT(*) FROM chunk WHERE file_uuid = $1 AND chunk_type = 'cut') as total_cuts
|
||||
FROM videos
|
||||
WHERE file_uuid = $1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4.7 `mutual_gaze` — 互看偵測(未來)
|
||||
|
||||
**依賴**:pose 資料寫入 `face_detections.metadata->>'pose_yaw'`。
|
||||
|
||||
**SQL**:
|
||||
```sql
|
||||
SELECT fd_a.frame_number,
|
||||
(fd_a.metadata->>'pose_yaw')::float8 as yaw_a,
|
||||
(fd_b.metadata->>'pose_yaw')::float8 as yaw_b
|
||||
FROM face_detections fd_a
|
||||
JOIN face_detections fd_b ON fd_a.file_uuid = fd_b.file_uuid
|
||||
AND fd_a.frame_number = fd_b.frame_number
|
||||
WHERE fd_a.file_uuid = $1
|
||||
AND fd_a.identity_id = $2 AND fd_b.identity_id = $3
|
||||
AND (fd_a.metadata->>'pose_yaw')::float8 > 0.05
|
||||
AND (fd_b.metadata->>'pose_yaw')::float8 < -0.05
|
||||
ORDER BY fd_a.frame_number ASC
|
||||
LIMIT 1
|
||||
```
|
||||
|
||||
**Mutual Gaze 判斷邏輯**:
|
||||
```
|
||||
if face_a is LEFT of face_b (bbox.x_a < bbox.x_b):
|
||||
mutual_gaze = (yaw_a > GAZE_THRESHOLD) AND (yaw_b < -GAZE_THRESHOLD)
|
||||
if face_a is RIGHT of face_b:
|
||||
mutual_gaze = (yaw_a < -GAZE_THRESHOLD) AND (yaw_b > GAZE_THRESHOLD)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4.8 `interaction_network` — 互動網絡(未來)
|
||||
|
||||
**依賴**:TKG `CO_OCCURS_WITH` edges 完整。
|
||||
|
||||
**SQL**:
|
||||
```sql
|
||||
SELECT src_i.name as identity_a, tgt_i.name as identity_b,
|
||||
COUNT(DISTINCT te.id) as cooccurrence_count,
|
||||
MIN((te.properties->>'first_frame')::int) as first_frame
|
||||
FROM tkg_edges te
|
||||
JOIN tkg_nodes src_n ON src_n.id = te.source_node_id
|
||||
JOIN tkg_nodes tgt_n ON tgt_n.id = te.target_node_id
|
||||
JOIN face_detections fd_src ON fd_src.file_uuid = te.file_uuid AND fd_src.trace_id = REPLACE(src_n.external_id, 'trace_', '')::int
|
||||
JOIN face_detections fd_tgt ON fd_tgt.file_uuid = te.file_uuid AND fd_tgt.trace_id = REPLACE(tgt_n.external_id, 'trace_', '')::int
|
||||
JOIN identities src_i ON src_i.id = fd_src.identity_id
|
||||
JOIN identities tgt_i ON tgt_i.id = fd_tgt.identity_id
|
||||
WHERE te.file_uuid = $1
|
||||
AND te.edge_type = 'CO_OCCURS_WITH'
|
||||
AND src_n.node_type = 'face_trace' AND tgt_n.node_type = 'face_trace'
|
||||
AND src_i.name != tgt_i.name
|
||||
GROUP BY src_i.name, tgt_i.name
|
||||
ORDER BY cooccurrence_count DESC
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Gemma4 整合
|
||||
|
||||
### 已驗證功能
|
||||
|
||||
| 測試 | 結果 |
|
||||
|------|------|
|
||||
| Function calling 觸發 | ✅ 正確呼叫 tkg_query |
|
||||
| 中文問題理解 | ✅ 「男女主第一次同框」→ first_cooccurrence |
|
||||
| 參數填充 | ✅ 正確填入 file_uuid、query_type |
|
||||
| 多輪對話(tool result → answer) | ✅ 模型正確消化資料後回答 |
|
||||
| 推論型問題(「最重要的配角」) | ✅ 選擇 top_identities + 自行推理 |
|
||||
|
||||
### 已知限制
|
||||
|
||||
| 問題 | 解決方案 |
|
||||
|------|---------|
|
||||
| file_uuid 須由 system prompt 提供 | 在 prompt 中指定 |
|
||||
| `identity_a` 使用「女主」無法自動匹配 | require identity_a/b 明確名稱 |
|
||||
| 模型可能拒絕呼叫 tool(約 5-10%) | system prompt 明確要求「先查詢」 |
|
||||
|
||||
### System Prompt 模板
|
||||
|
||||
```
|
||||
你是 Momentry 影片分析系統。你正在分析電影 {title},file_uuid 為 {file_uuid}。
|
||||
|
||||
你有 tkg_query 工具可用,可以查詢影片的人物資料、出場時間、互動關係。
|
||||
請先使用工具查詢,再根據查詢結果回答問題。
|
||||
不要憑空猜測影片內容。
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. TKG 擴充:Pose + Mutual Gaze
|
||||
|
||||
### 7.1 現狀
|
||||
|
||||
| 元件 | 有 pose? | 有 mutual gaze? |
|
||||
|------|-----------|-----------------|
|
||||
| `face.json` | ✅ yaw/pitch/roll | ❌ 未計算 |
|
||||
| `face_detections.metadata` | ❌ 無 | ❌ 無 |
|
||||
| `tkg_nodes` (face_trace) | ❌ 無 | — |
|
||||
| `tkg_edges` (CO_OCCURS_WITH) | ❌ 無 | ❌ 無 |
|
||||
|
||||
### 7.2 目標
|
||||
|
||||
```
|
||||
face_processor → face.json (有 pose)
|
||||
↓
|
||||
tkg.rs (TKG builder)
|
||||
├── 讀取 face.json 的 pose 資料
|
||||
├── 算 avg_yaw/pitch/roll → 寫入 face_trace node
|
||||
├── 對同框 face_trace 配對,判斷 mutual_gaze
|
||||
└── 寫入 CO_OCCURS_WITH edge properties
|
||||
↓
|
||||
tkg_edges.properties.mutual_gaze = true
|
||||
↓
|
||||
API 查詢 → 互看 + 面積最大 → 代表 frame
|
||||
```
|
||||
|
||||
### 7.3 face_trace node 新增 properties
|
||||
|
||||
```json
|
||||
{
|
||||
"frame_count": 53,
|
||||
"start_frame": 38165,
|
||||
"end_frame": 38321,
|
||||
"avg_bbox": {"x": 731, "y": 215, "width": 228, "height": 228},
|
||||
"avg_yaw": 0.014,
|
||||
"avg_pitch": 0.224,
|
||||
"avg_roll": -0.069
|
||||
}
|
||||
```
|
||||
|
||||
### 7.4 CO_OCCURS_WITH edge 新增 properties
|
||||
|
||||
```json
|
||||
{
|
||||
"first_frame": 38187,
|
||||
"frame_count": 156,
|
||||
"mutual_gaze": true,
|
||||
"yaw_a_avg": 0.021,
|
||||
"yaw_b_avg": -0.421,
|
||||
"gaze_angle_delta": 0.442
|
||||
}
|
||||
```
|
||||
|
||||
### 7.5 Mutual Gaze 判斷邏輯
|
||||
|
||||
```python
|
||||
GAZE_THRESHOLD = 0.05 # rad
|
||||
|
||||
def detect_mutual_gaze(bbox_a, yaw_a, bbox_b, yaw_b):
|
||||
# 判斷 A 和 B 的左右位置關係
|
||||
if bbox_a.cx < bbox_b.cx:
|
||||
# A 在左,B 在右 → A 要看右,B 要看左
|
||||
return yaw_a > GAZE_THRESHOLD and yaw_b < -GAZE_THRESHOLD
|
||||
else:
|
||||
# A 在右,B 在左 → A 要看左,B 要看右
|
||||
return yaw_a < -GAZE_THRESHOLD and yaw_b > GAZE_THRESHOLD
|
||||
```
|
||||
|
||||
### 7.6 代表 Frame 選取邏輯(應用端)
|
||||
|
||||
```
|
||||
1. 查 TKG: MATCH (a)-[e:CO_OCCURS_WITH]->(b) WHERE e.mutual_gaze = true
|
||||
→ 回傳互看 frame_count 最高的 face_trace 配對
|
||||
2. 取該配對的 frame 範圍內,面積×信心最高 frame
|
||||
3. FFmpeg blurdetect → 選最清晰的作為代表
|
||||
```
|
||||
|
||||
### 7.7 Timeline
|
||||
|
||||
| Phase | 內容 | 工時 |
|
||||
|-------|------|------|
|
||||
| 1 | tkg.rs 讀取 face.json 的 pose + 寫入 node | 4-6h |
|
||||
| 2 | mutual_gaze 判斷 + 寫入 edge | 3-4h |
|
||||
| 3 | 對 Charade 重跑 TKG + 驗證 | 1h |
|
||||
| 4 | 代表 frame 選取邏輯 | 2-3h |
|
||||
|
||||
---
|
||||
|
||||
## 8. Phase 計畫
|
||||
|
||||
| Phase | Query Types | 預計工時 | 依賴 |
|
||||
|-------|------------|---------|------|
|
||||
| **1** | `top_identities`, `identity_details`, `first_cooccurrence`, `file_info` | 2-3h | 已有資料 |
|
||||
| **2** | `identity_traces`, `cut_details` | 1-2h | 已有資料 |
|
||||
| **3** | `mutual_gaze` | 2-3h | pose 入 `face_detections.metadata` |
|
||||
| **4** | `interaction_network` | 2-3h | TKG edges 完善 |
|
||||
|
||||
---
|
||||
|
||||
## 9. 版本歷史
|
||||
|
||||
| 日期 | 版本 | 作者 | 變更 |
|
||||
|------|------|------|------|
|
||||
| 2026-05-21 | 1.0 | OpenCode | 初始設計文件 |
|
||||
|
||||
*Updated: 2026-05-21*
|
||||
@@ -37,6 +37,148 @@ Stream video with highlights for a specific face trace (follows a single person
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/trace/:trace_id/representative-face`
|
||||
|
||||
Find the best single face to represent this trace. Uses a two-stage selection: SQL (area × confidence → top 10) then FFmpeg `blurdetect` (sharpness → pick the least blurry).
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/representative-face" \
|
||||
-H "X-API-Key: $KEY"
|
||||
```
|
||||
|
||||
#### Response (200)
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"file_uuid": "aeed71342a899fe4b4c57b7d41bcb692",
|
||||
"trace_id": 1939,
|
||||
"face_count": 538,
|
||||
"representative": {
|
||||
"frame_number": 68193,
|
||||
"timestamp_secs": 2727.72,
|
||||
"bbox": { "x": 347, "y": 378, "width": 427, "height": 427 },
|
||||
"confidence": 0.760,
|
||||
"quality_score": 138516,
|
||||
"blur_score": 9.46
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Response Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `trace_id` | integer | Face trace ID |
|
||||
| `face_count` | integer | Total face detections in this trace |
|
||||
| `representative.frame_number` | integer | Frame number of the selected face (primary coordinate) |
|
||||
| `representative.timestamp_secs` | float | Time in seconds (derived from `frame_number / fps`) |
|
||||
| `representative.bbox` | object | Bounding box `{x, y, width, height}` |
|
||||
| `representative.confidence` | float | Detection confidence (0.0–1.0) |
|
||||
| `representative.quality_score` | float | Pre-selection score (`area × confidence`) |
|
||||
| `representative.blur_score` | float | FFmpeg blurdetect result (lower = sharper) |
|
||||
|
||||
#### Error Responses
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/trace/:trace_id/thumbnail`
|
||||
|
||||
Extract the best face image for a trace as JPEG (320×320). Internally selects the face using the same two-stage algorithm as `representative-face`, then crops via FFmpeg. The result is cacheable for 24 hours.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
curl -s "$API/api/v1/file/$FILE_UUID/trace/1939/thumbnail" \
|
||||
-H "X-API-Key: $KEY" -o trace_1939_face.jpg
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
- **200**: `image/jpeg` binary data (320×320 cropped face)
|
||||
- **404**: File or trace not found, or no suitable face
|
||||
- **500**: FFmpeg or database error
|
||||
|
||||
---
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/identities/:identity_uuid_a/co-occur-with/:identity_uuid_b`
|
||||
|
||||
Find the first frame where two identities appear together, with representative face thumbnails for both.
|
||||
|
||||
**Auth**: Required
|
||||
**Scope**: file-level
|
||||
|
||||
#### Example
|
||||
|
||||
```bash
|
||||
# First frame where Audrey Hepburn and Cary Grant appear together
|
||||
curl -s "$API/api/v1/file/$FILE_UUID/identities/$AUDREY_UUID/co-occur-with/$CARY_UUID" \
|
||||
-H "X-API-Key: $KEY" | jq '{identity_a: .identity_a.name, identity_b: .identity_b.name, first_frame: .first_cooccurrence.frame_number}'
|
||||
```
|
||||
|
||||
#### Response (200)
|
||||
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"file_uuid": "aeed71342a899fe4b4c57b7d41bcb692",
|
||||
"identity_a": {
|
||||
"identity_uuid": "c3545906-c82d-4b66-aa1d-150bc02decce",
|
||||
"name": "Audrey Hepburn",
|
||||
"trace_id": 920
|
||||
},
|
||||
"identity_b": {
|
||||
"identity_uuid": "2b0ddefe-e2a9-4533-9308-b375594604d5",
|
||||
"name": "Cary Grant",
|
||||
"trace_id": 919
|
||||
},
|
||||
"first_cooccurrence": {
|
||||
"frame_number": 38165,
|
||||
"timestamp_secs": 1526.60,
|
||||
"total_cooccurrence_frames": 3136,
|
||||
"representative_face_a": {
|
||||
"frame_number": 38199,
|
||||
"bbox": { "x": 122, "y": 339, "width": 176, "height": 176 },
|
||||
"confidence": 0.832,
|
||||
"thumbnail_url": "/api/v1/file/aeed71342.../trace/920/thumbnail"
|
||||
},
|
||||
"representative_face_b": {
|
||||
"frame_number": 38291,
|
||||
"bbox": { "x": 511, "y": 315, "width": 192, "height": 192 },
|
||||
"confidence": 0.791,
|
||||
"thumbnail_url": "/api/v1/file/aeed71342.../trace/919/thumbnail"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Response Fields
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `identity_a.name` | string | First identity name |
|
||||
| `identity_b.name` | string | Second identity name |
|
||||
| `first_cooccurrence.frame_number` | int | First frame where both appear |
|
||||
| `first_cooccurrence.timestamp_secs` | float | Time in seconds |
|
||||
| `first_cooccurrence.total_cooccurrence_frames` | int | Total frames with both present |
|
||||
| `first_cooccurrence.representative_face_a/b` | object | Best face thumbnail data for each identity |
|
||||
|
||||
#### Error Responses
|
||||
|
||||
| HTTP | When |
|
||||
|------|------|
|
||||
| `404` | File or identity not found |
|
||||
| `404` | The two identities never co-occur in this file |
|
||||
| `500` | Database or FFmpeg error |
|
||||
|
||||
### `GET /api/v1/file/:file_uuid/video/bbox`
|
||||
|
||||
Stream video with bounding box overlay for all detected objects/faces.
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use axum::{
|
||||
body::Body,
|
||||
extract::{Path, Query, State},
|
||||
http::StatusCode,
|
||||
response::Json,
|
||||
http::{header, StatusCode},
|
||||
response::{IntoResponse, Json, Response},
|
||||
routing::{get, post},
|
||||
Router,
|
||||
};
|
||||
@@ -16,6 +17,22 @@ pub fn trace_agent_routes() -> Router<crate::api::types::AppState> {
|
||||
"/api/v1/file/:file_uuid/trace/:trace_id/faces",
|
||||
get(list_trace_faces),
|
||||
)
|
||||
.route(
|
||||
"/api/v1/file/:file_uuid/trace/:trace_id/representative-face",
|
||||
get(get_representative_face),
|
||||
)
|
||||
.route(
|
||||
"/api/v1/file/:file_uuid/trace/:trace_id/thumbnail",
|
||||
get(get_trace_thumbnail),
|
||||
)
|
||||
.route(
|
||||
"/api/v1/file/:file_uuid/identities/:identity_uuid_a/co-occur-with/:identity_uuid_b",
|
||||
get(get_cooccurrence),
|
||||
)
|
||||
.route(
|
||||
"/api/v1/file/:file_uuid/tkg/rebuild",
|
||||
post(rebuild_tkg),
|
||||
)
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
@@ -328,3 +345,441 @@ async fn list_trace_faces(
|
||||
faces,
|
||||
}))
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct RepFaceBbox {
|
||||
x: i32,
|
||||
y: i32,
|
||||
width: i32,
|
||||
height: i32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct RepFaceResult {
|
||||
frame_number: i64,
|
||||
timestamp_secs: f64,
|
||||
bbox: RepFaceBbox,
|
||||
confidence: f64,
|
||||
quality_score: f64,
|
||||
blur_score: f64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct RepFaceResponse {
|
||||
success: bool,
|
||||
file_uuid: String,
|
||||
trace_id: i32,
|
||||
face_count: i64,
|
||||
representative: RepFaceResult,
|
||||
}
|
||||
|
||||
/// Internal result of `select_rep_face`: the winning detection plus the
/// video metadata callers need to build timestamps or crop a thumbnail.
struct RepFaceSelection {
    /// Frame number of the chosen detection.
    frame: i64,
    /// Bounding-box x offset.
    x: i32,
    /// Bounding-box y offset.
    y: i32,
    /// Bounding-box width.
    w: i32,
    /// Bounding-box height.
    h: i32,
    /// Detector confidence of the chosen detection.
    conf: f64,
    /// Lowest `blurdetect` "blur mean" measured among the candidates.
    blur: f64,
    /// `w * h * conf` of the chosen detection.
    score: f64,
    /// Path of the source video, as stored in the videos table.
    video_path: String,
    /// Frame rate used to convert frame numbers to seconds.
    fps: f64,
    /// Total number of detections in the trace.
    face_count: i64,
}
|
||||
|
||||
async fn select_rep_face<F, T>(
|
||||
pool: &sqlx::PgPool,
|
||||
file_uuid: &str,
|
||||
trace_id: i32,
|
||||
err_fn: F,
|
||||
) -> Result<RepFaceSelection, T>
|
||||
where
|
||||
F: Fn(anyhow::Error) -> T,
|
||||
{
|
||||
use crate::core::db::schema;
|
||||
let fd_table = schema::table_name("face_detections");
|
||||
let video_table = schema::table_name("videos");
|
||||
|
||||
let fps: f64 = sqlx::query_scalar(&format!(
|
||||
"SELECT COALESCE(fps, 25.0) FROM {} WHERE file_uuid = $1", video_table
|
||||
))
|
||||
.bind(file_uuid)
|
||||
.fetch_optional(pool)
|
||||
.await
|
||||
.map_err(|e| err_fn(anyhow::anyhow!("{}", e)))?
|
||||
.unwrap_or(25.0);
|
||||
|
||||
let face_count: (i64,) = sqlx::query_as(&format!(
|
||||
"SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND trace_id = $2", fd_table
|
||||
))
|
||||
.bind(file_uuid)
|
||||
.bind(trace_id)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.map_err(|e| err_fn(anyhow::anyhow!("{}", e)))?;
|
||||
|
||||
struct Candidate { frame: i64, x: i32, y: i32, w: i32, h: i32, conf: f64, score: f64 }
|
||||
|
||||
let rows = sqlx::query_as::<_, (i64, i32, i32, i32, i32, f64)>(&format!(
|
||||
"SELECT frame_number::bigint, x, y, width, height, confidence::float8 \
|
||||
FROM {} WHERE file_uuid = $1 AND trace_id = $2 AND confidence > 0.7 \
|
||||
AND ((metadata->>'qc_ok')::boolean IS NULL OR (metadata->>'qc_ok')::boolean = true) \
|
||||
ORDER BY (width::float8 * height::float8) * confidence::float8 DESC LIMIT 10",
|
||||
fd_table
|
||||
))
|
||||
.bind(file_uuid).bind(trace_id)
|
||||
.fetch_all(pool)
|
||||
.await
|
||||
.map_err(|e| err_fn(anyhow::anyhow!("{}", e)))?;
|
||||
|
||||
if rows.is_empty() {
|
||||
return Err(err_fn(anyhow::anyhow!("No suitable face found")));
|
||||
}
|
||||
|
||||
let candidates: Vec<Candidate> = rows.into_iter()
|
||||
.map(|(frame, x, y, w, h, conf)| {
|
||||
let score = (w as f64 * h as f64) * conf;
|
||||
Candidate { frame, x, y, w, h, conf, score }
|
||||
})
|
||||
.collect();
|
||||
|
||||
let video_path: String = sqlx::query_scalar(&format!(
|
||||
"SELECT file_path FROM {} WHERE file_uuid = $1", video_table
|
||||
))
|
||||
.bind(file_uuid)
|
||||
.fetch_optional(pool)
|
||||
.await
|
||||
.map_err(|e| err_fn(anyhow::anyhow!("{}", e)))?
|
||||
.ok_or_else(|| err_fn(anyhow::anyhow!("Video not found")))?;
|
||||
|
||||
let mut best = candidates[0].frame;
|
||||
let mut best_blur = f64::MAX;
|
||||
let mut best_idx = 0usize;
|
||||
|
||||
for (i, c) in candidates.iter().enumerate() {
|
||||
let seek = c.frame as f64 / fps;
|
||||
if let Ok(output) = tokio::process::Command::new("ffmpeg")
|
||||
.args(["-ss", &format!("{:.2}", seek), "-i", &video_path,
|
||||
"-vframes", "1", "-vf", &format!("crop={}:{}:{}:{},blurdetect", c.w, c.h, c.x, c.y),
|
||||
"-f", "null", "-"])
|
||||
.output().await
|
||||
{
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
for line in stderr.lines() {
|
||||
if let Some(blur_str) = line.split("blur mean: ").nth(1) {
|
||||
if let Ok(blur) = blur_str.trim().parse::<f64>() {
|
||||
if blur < best_blur { best_blur = blur; best = c.frame; best_idx = i; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let chosen = &candidates[best_idx];
|
||||
Ok(RepFaceSelection {
|
||||
frame: chosen.frame, x: chosen.x, y: chosen.y, w: chosen.w, h: chosen.h,
|
||||
conf: chosen.conf, blur: best_blur, score: chosen.score,
|
||||
video_path, fps, face_count: face_count.0,
|
||||
})
|
||||
}
|
||||
|
||||
async fn get_representative_face(
|
||||
State(state): State<crate::api::types::AppState>,
|
||||
Path((file_uuid, trace_id)): Path<(String, i32)>,
|
||||
) -> Result<Json<RepFaceResponse>, (StatusCode, Json<serde_json::Value>)> {
|
||||
let sel = select_rep_face(state.db.pool(), &file_uuid, trace_id, |e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
}).await?;
|
||||
|
||||
Ok(Json(RepFaceResponse {
|
||||
success: true,
|
||||
file_uuid,
|
||||
trace_id,
|
||||
face_count: sel.face_count,
|
||||
representative: RepFaceResult {
|
||||
frame_number: sel.frame,
|
||||
timestamp_secs: sel.frame as f64 / sel.fps,
|
||||
bbox: RepFaceBbox { x: sel.x, y: sel.y, width: sel.w, height: sel.h },
|
||||
confidence: sel.conf,
|
||||
quality_score: sel.score,
|
||||
blur_score: sel.blur,
|
||||
},
|
||||
}))
|
||||
}
|
||||
|
||||
async fn get_trace_thumbnail(
|
||||
State(state): State<crate::api::types::AppState>,
|
||||
Path((file_uuid, trace_id)): Path<(String, i32)>,
|
||||
) -> Result<Response, (StatusCode, Json<serde_json::Value>)> {
|
||||
let sel = select_rep_face(state.db.pool(), &file_uuid, trace_id, |e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
}).await?;
|
||||
|
||||
let seek = sel.frame as f64 / sel.fps;
|
||||
let tmp = std::env::temp_dir().join(format!("trace_{}_{}.jpg", file_uuid, trace_id));
|
||||
|
||||
let status = tokio::process::Command::new("ffmpeg")
|
||||
.args([
|
||||
"-ss", &format!("{:.2}", seek),
|
||||
"-i", &sel.video_path,
|
||||
"-vframes", "1",
|
||||
"-vf", &format!("crop={}:{}:{}:{},scale=320:320", sel.w, sel.h, sel.x, sel.y),
|
||||
"-q:v", "2",
|
||||
"-y", &tmp.to_string_lossy().to_string(),
|
||||
])
|
||||
.output()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
})?;
|
||||
|
||||
if !status.status.success() {
|
||||
return Err((StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": "FFmpeg failed"}))));
|
||||
}
|
||||
|
||||
let bytes = tokio::fs::read(&tmp).await.map_err(|e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
})?;
|
||||
|
||||
let _ = tokio::fs::remove_file(&tmp).await;
|
||||
|
||||
Ok(Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, "image/jpeg")
|
||||
.header(header::CACHE_CONTROL, "public, max-age=86400")
|
||||
.body(Body::from(bytes))
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct CoOccurIdentity {
|
||||
identity_uuid: String,
|
||||
name: String,
|
||||
trace_id: i32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct CoOccurRepFace {
|
||||
frame_number: i64,
|
||||
bbox: RepFaceBbox,
|
||||
confidence: f64,
|
||||
thumbnail_url: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct CoOccurrence {
|
||||
frame_number: i64,
|
||||
timestamp_secs: f64,
|
||||
total_cooccurrence_frames: i64,
|
||||
representative_face_a: Option<CoOccurRepFace>,
|
||||
representative_face_b: Option<CoOccurRepFace>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct CoOccurResponse {
|
||||
success: bool,
|
||||
file_uuid: String,
|
||||
identity_a: CoOccurIdentity,
|
||||
identity_b: CoOccurIdentity,
|
||||
first_cooccurrence: CoOccurrence,
|
||||
}
|
||||
|
||||
async fn get_cooccurrence(
|
||||
State(state): State<crate::api::types::AppState>,
|
||||
Path((file_uuid, identity_uuid_a, identity_uuid_b)): Path<(String, String, String)>,
|
||||
) -> Result<Json<CoOccurResponse>, (StatusCode, Json<serde_json::Value>)> {
|
||||
use crate::core::db::schema;
|
||||
let id_table = schema::table_name("identities");
|
||||
let fd_table = schema::table_name("face_detections");
|
||||
|
||||
// Stage 1: Get identity names and IDs
|
||||
let id_a = sqlx::query_as::<_, (i32, String)>(&format!(
|
||||
"SELECT id, name FROM {} WHERE uuid::text = $1 OR REPLACE(uuid::text, '-', '') = $1",
|
||||
id_table
|
||||
))
|
||||
.bind(&identity_uuid_a)
|
||||
.fetch_optional(state.db.pool())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
})?
|
||||
.ok_or_else(|| {
|
||||
(StatusCode::NOT_FOUND, Json(serde_json::json!({"error": "Identity A not found"})))
|
||||
})?;
|
||||
|
||||
let id_b = sqlx::query_as::<_, (i32, String)>(&format!(
|
||||
"SELECT id, name FROM {} WHERE uuid::text = $1 OR REPLACE(uuid::text, '-', '') = $1",
|
||||
id_table
|
||||
))
|
||||
.bind(&identity_uuid_b)
|
||||
.fetch_optional(state.db.pool())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
})?
|
||||
.ok_or_else(|| {
|
||||
(StatusCode::NOT_FOUND, Json(serde_json::json!({"error": "Identity B not found"})))
|
||||
})?;
|
||||
|
||||
// Stage 2: Find first frame where both identity_ids appear
|
||||
let cooccur: Option<(i64,)> = sqlx::query_as(
|
||||
&format!(
|
||||
"SELECT MIN(fd.frame_number)::bigint FROM {} fd \
|
||||
WHERE fd.file_uuid = $1 AND fd.identity_id = $2 \
|
||||
AND fd.frame_number IN ( \
|
||||
SELECT frame_number FROM {} \
|
||||
WHERE file_uuid = $1 AND identity_id = $3 \
|
||||
)",
|
||||
fd_table, fd_table
|
||||
)
|
||||
)
|
||||
.bind(&file_uuid)
|
||||
.bind(id_a.0)
|
||||
.bind(id_b.0)
|
||||
.fetch_optional(state.db.pool())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
})?;
|
||||
|
||||
let (first_frame,) = cooccur.ok_or_else(|| {
|
||||
(StatusCode::NOT_FOUND, Json(serde_json::json!({"error": "These two identities never appear together in this file"})))
|
||||
})?;
|
||||
|
||||
// Get fps for timestamp
|
||||
let video_table = schema::table_name("videos");
|
||||
let fps: f64 = sqlx::query_scalar(&format!(
|
||||
"SELECT COALESCE(fps, 25.0) FROM {} WHERE file_uuid = $1", video_table
|
||||
))
|
||||
.bind(&file_uuid)
|
||||
.fetch_optional(state.db.pool())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
})?
|
||||
.unwrap_or(25.0);
|
||||
|
||||
// Stage 3: Get trace_ids for both at this frame
|
||||
let trace_a: Option<(i32,)> = sqlx::query_as(
|
||||
&format!("SELECT trace_id FROM {} WHERE file_uuid = $1 AND frame_number = $2 AND identity_id = $3 AND trace_id IS NOT NULL LIMIT 1", fd_table)
|
||||
)
|
||||
.bind(&file_uuid).bind(first_frame).bind(id_a.0)
|
||||
.fetch_optional(state.db.pool()).await
|
||||
.map_err(|e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
})?;
|
||||
|
||||
let trace_b: Option<(i32,)> = sqlx::query_as(
|
||||
&format!("SELECT trace_id FROM {} WHERE file_uuid = $1 AND frame_number = $2 AND identity_id = $3 AND trace_id IS NOT NULL LIMIT 1", fd_table)
|
||||
)
|
||||
.bind(&file_uuid).bind(first_frame).bind(id_b.0)
|
||||
.fetch_optional(state.db.pool()).await
|
||||
.map_err(|e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
})?;
|
||||
|
||||
// Stage 4: Get representative faces for both traces (reusing select_rep_face)
|
||||
let rep_a = if let Some((tid,)) = trace_a {
|
||||
select_rep_face(state.db.pool(), &file_uuid, tid, |e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
}).await.ok().map(|sel| CoOccurRepFace {
|
||||
frame_number: sel.frame,
|
||||
bbox: RepFaceBbox { x: sel.x, y: sel.y, width: sel.w, height: sel.h },
|
||||
confidence: sel.conf,
|
||||
thumbnail_url: format!("/api/v1/file/{}/trace/{}/thumbnail", file_uuid, tid),
|
||||
})
|
||||
} else { None };
|
||||
|
||||
let rep_b = if let Some((tid,)) = trace_b {
|
||||
select_rep_face(state.db.pool(), &file_uuid, tid, |e| {
|
||||
(StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": e.to_string()})))
|
||||
}).await.ok().map(|sel| CoOccurRepFace {
|
||||
frame_number: sel.frame,
|
||||
bbox: RepFaceBbox { x: sel.x, y: sel.y, width: sel.w, height: sel.h },
|
||||
confidence: sel.conf,
|
||||
thumbnail_url: format!("/api/v1/file/{}/trace/{}/thumbnail", file_uuid, tid),
|
||||
})
|
||||
} else { None };
|
||||
|
||||
// Total co-occurrence frames (from TKG if available, otherwise from face_detections)
|
||||
let total_cooccurrence_frames: i64 = sqlx::query_scalar(
|
||||
&format!(
|
||||
"SELECT COUNT(DISTINCT fd.frame_number)::bigint FROM {} fd \
|
||||
WHERE fd.file_uuid = $1 AND fd.identity_id = $2 \
|
||||
AND fd.frame_number IN ( \
|
||||
SELECT frame_number FROM {} \
|
||||
WHERE file_uuid = $1 AND identity_id = $3 \
|
||||
)",
|
||||
fd_table, fd_table
|
||||
)
|
||||
)
|
||||
.bind(&file_uuid).bind(id_a.0).bind(id_b.0)
|
||||
.fetch_one(state.db.pool()).await
|
||||
.unwrap_or(0);
|
||||
|
||||
Ok(Json(CoOccurResponse {
|
||||
success: true,
|
||||
file_uuid,
|
||||
identity_a: CoOccurIdentity {
|
||||
identity_uuid: identity_uuid_a,
|
||||
name: id_a.1,
|
||||
trace_id: trace_a.map(|t| t.0).unwrap_or(0),
|
||||
},
|
||||
identity_b: CoOccurIdentity {
|
||||
identity_uuid: identity_uuid_b,
|
||||
name: id_b.1,
|
||||
trace_id: trace_b.map(|t| t.0).unwrap_or(0),
|
||||
},
|
||||
first_cooccurrence: CoOccurrence {
|
||||
frame_number: first_frame,
|
||||
timestamp_secs: first_frame as f64 / fps,
|
||||
total_cooccurrence_frames,
|
||||
representative_face_a: rep_a,
|
||||
representative_face_b: rep_b,
|
||||
},
|
||||
}))
|
||||
}
|
||||
|
||||
use crate::core::config::OUTPUT_DIR;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct TkgRebuildResponse {
|
||||
success: bool,
|
||||
file_uuid: String,
|
||||
result: Option<serde_json::Value>,
|
||||
error: Option<String>,
|
||||
}
|
||||
|
||||
async fn rebuild_tkg(
|
||||
State(state): State<crate::api::types::AppState>,
|
||||
Path(file_uuid): Path<String>,
|
||||
) -> Json<TkgRebuildResponse> {
|
||||
let result = crate::core::processor::tkg::build_tkg(
|
||||
&state.db,
|
||||
&file_uuid,
|
||||
&OUTPUT_DIR,
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(r) => Json(TkgRebuildResponse {
|
||||
success: true,
|
||||
file_uuid,
|
||||
result: Some(serde_json::json!({
|
||||
"face_trace_nodes": r.face_trace_nodes,
|
||||
"object_nodes": r.object_nodes,
|
||||
"speaker_nodes": r.speaker_nodes,
|
||||
"co_occurrence_edges": r.co_occurrence_edges,
|
||||
"speaker_face_edges": r.speaker_face_edges,
|
||||
"face_face_edges": r.face_face_edges,
|
||||
})),
|
||||
error: None,
|
||||
}),
|
||||
Err(e) => Json(TkgRebuildResponse {
|
||||
success: false,
|
||||
file_uuid,
|
||||
result: None,
|
||||
error: Some(e.to_string()),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,6 +15,92 @@ fn t(name: &str) -> String {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Pose data from face.json ────────────────────────────────────────
|
||||
|
||||
/// One face entry read from `<file_uuid>.face.json`: its frame, bounding box
/// and head-pose angles.
#[derive(Debug, Clone)]
struct FacePose {
    /// Frame number the face was found in.
    frame: i64,
    /// Bounding-box x offset.
    x: f64,
    /// Bounding-box y offset.
    y: f64,
    /// Bounding-box width.
    w: f64,
    /// Bounding-box height.
    h: f64,
    /// Head yaw as stored in face.json.
    yaw: f64,
    /// Head pitch as stored in face.json.
    pitch: f64,
    /// Head roll as stored in face.json.
    roll: f64,
}
|
||||
|
||||
fn load_face_pose_data(output_dir: &str, file_uuid: &str) -> Result<Vec<FacePose>> {
|
||||
let path = Path::new(output_dir).join(format!("{}.face.json", file_uuid));
|
||||
let content = std::fs::read_to_string(&path)
|
||||
.with_context(|| format!("Failed to read face.json: {}", path.display()))?;
|
||||
let json: serde_json::Value = serde_json::from_str(&content)?;
|
||||
|
||||
let mut poses = Vec::new();
|
||||
if let Some(frames) = json.get("frames").and_then(|v| v.as_array()) {
|
||||
for frame_entry in frames {
|
||||
let frame_num = frame_entry.get("frame").and_then(|v| v.as_i64()).unwrap_or(0);
|
||||
if let Some(faces) = frame_entry.get("faces").and_then(|v| v.as_array()) {
|
||||
for face in faces {
|
||||
let bbox = match face.get("bbox") {
|
||||
Some(b) => b,
|
||||
None => continue,
|
||||
};
|
||||
let pose = match face.get("pose") {
|
||||
Some(p) => p,
|
||||
None => continue,
|
||||
};
|
||||
poses.push(FacePose {
|
||||
frame: frame_num,
|
||||
x: bbox.get("x").and_then(|v| v.as_f64()).unwrap_or(0.0),
|
||||
y: bbox.get("y").and_then(|v| v.as_f64()).unwrap_or(0.0),
|
||||
w: bbox.get("width").and_then(|v| v.as_f64()).unwrap_or(0.0),
|
||||
h: bbox.get("height").and_then(|v| v.as_f64()).unwrap_or(0.0),
|
||||
yaw: pose.get("yaw").and_then(|v| v.as_f64()).unwrap_or(0.0),
|
||||
pitch: pose.get("pitch").and_then(|v| v.as_f64()).unwrap_or(0.0),
|
||||
roll: pose.get("roll").and_then(|v| v.as_f64()).unwrap_or(0.0),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(poses)
|
||||
}
|
||||
|
||||
/// Match a face from face_detections (frame, x, y, w, h) to its pose in face.json
|
||||
/// Uses bbox center distance to find the best match when multiple faces per frame.
|
||||
fn get_pose_for_face(frame: i64, x: f64, y: f64, w: f64, h: f64, poses: &[FacePose]) -> Option<(f64, f64, f64)> {
|
||||
let cx = x + w / 2.0;
|
||||
let cy = y + h / 2.0;
|
||||
let mut best_dist = f64::MAX;
|
||||
let mut result = None;
|
||||
for p in poses.iter().filter(|p| p.frame == frame) {
|
||||
let pcx = p.x + p.w / 2.0;
|
||||
let pcy = p.y + p.h / 2.0;
|
||||
let dist = (cx - pcx).abs() + (cy - pcy).abs();
|
||||
if dist < best_dist {
|
||||
best_dist = dist;
|
||||
result = Some((p.yaw, p.pitch, p.roll));
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Decides whether two faces in the same frame are looking at each other,
/// using only horizontal position and yaw.
///
/// The face whose bounding-box centre lies further left must have
/// `yaw > threshold` (looking right) and the other `yaw < -threshold`
/// (looking left). When the centres coincide, A is treated as the right-hand face.
fn detect_mutual_gaze(
    bbox_a_x: f64, bbox_a_w: f64, yaw_a: f64,
    bbox_b_x: f64, bbox_b_w: f64, yaw_b: f64,
    threshold: f64,
) -> bool {
    let center_a = bbox_a_x + bbox_a_w / 2.0;
    let center_b = bbox_b_x + bbox_b_w / 2.0;

    // Order the two yaws by on-screen position: (left face, right face).
    let (left_yaw, right_yaw) = if center_a < center_b {
        (yaw_a, yaw_b)
    } else {
        (yaw_b, yaw_a)
    };

    left_yaw > threshold && right_yaw < -threshold
}
|
||||
|
||||
// ── Input data structs ────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
@@ -108,13 +194,16 @@ pub struct TkgResult {
|
||||
|
||||
pub async fn build_tkg(db: &PostgresDb, file_uuid: &str, output_dir: &str) -> Result<TkgResult> {
|
||||
let pool = db.pool();
|
||||
let n_face = build_face_trace_nodes(pool, file_uuid).await?;
|
||||
let pose_data = load_face_pose_data(output_dir, file_uuid).unwrap_or_default();
|
||||
tracing::info!("[TKG] Loaded {} pose entries from face.json", pose_data.len());
|
||||
|
||||
let n_face = build_face_trace_nodes(pool, file_uuid, &pose_data).await?;
|
||||
let n_objects = build_yolo_object_nodes(pool, file_uuid, output_dir).await?;
|
||||
let n_speakers = build_speaker_nodes(pool, file_uuid, output_dir).await?;
|
||||
|
||||
let e_co = build_co_occurrence_edges(pool, file_uuid, output_dir).await?;
|
||||
let e_sf = build_speaker_face_edges(pool, file_uuid, output_dir).await?;
|
||||
let e_ff = build_face_face_edges(pool, file_uuid).await?;
|
||||
let e_ff = build_face_face_edges(pool, file_uuid, &pose_data).await?;
|
||||
|
||||
Ok(TkgResult {
|
||||
face_trace_nodes: n_face,
|
||||
@@ -128,16 +217,16 @@ pub async fn build_tkg(db: &PostgresDb, file_uuid: &str, output_dir: &str) -> Re
|
||||
|
||||
// ── Node builders ─────────────────────────────────────────────────
|
||||
|
||||
async fn build_face_trace_nodes(pool: &PgPool, file_uuid: &str) -> Result<usize> {
|
||||
async fn build_face_trace_nodes(pool: &PgPool, file_uuid: &str, pose_data: &[FacePose]) -> Result<usize> {
|
||||
let face_table = t("face_detections");
|
||||
let nodes_table = t("tkg_nodes");
|
||||
|
||||
let rows = sqlx::query_as::<_, FaceTraceRow>(&format!(
|
||||
r#"
|
||||
SELECT trace_id,
|
||||
SELECT trace_id::bigint,
|
||||
COUNT(*)::bigint as frame_count,
|
||||
MIN(frame_number) as start_f,
|
||||
MAX(frame_number) as end_f,
|
||||
MIN(frame_number)::bigint as start_f,
|
||||
MAX(frame_number)::bigint as end_f,
|
||||
AVG(x::float8) as avg_x,
|
||||
AVG(y::float8) as avg_y,
|
||||
AVG(width::float8) as avg_w,
|
||||
@@ -153,10 +242,53 @@ async fn build_face_trace_nodes(pool: &PgPool, file_uuid: &str) -> Result<usize>
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
|
||||
// Load per-frame data for pose matching
|
||||
let frame_rows: Vec<(i64, i64, f64, f64, f64, f64)> = sqlx::query_as(
|
||||
&format!(
|
||||
"SELECT trace_id::bigint, frame_number::bigint, x::float8, y::float8, width::float8, height::float8 \
|
||||
FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL ORDER BY trace_id, frame_number",
|
||||
face_table
|
||||
)
|
||||
)
|
||||
.bind(file_uuid)
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
|
||||
// Group by trace_id: trace_id → Vec<(frame, x, y, w, h)>
|
||||
let mut trace_frames: HashMap<i64, Vec<(i64, f64, f64, f64, f64)>> = HashMap::new();
|
||||
for (tid, frame, x, y, w, h) in &frame_rows {
|
||||
trace_frames.entry(*tid).or_default().push((*frame, *x, *y, *w, *h));
|
||||
}
|
||||
|
||||
let mut count = 0;
|
||||
for row in &rows {
|
||||
let external_id = format!("trace_{}", row.trace_id);
|
||||
let label = format!("Face Trace {}", row.trace_id);
|
||||
let tid = row.trace_id;
|
||||
let external_id = format!("trace_{}", tid);
|
||||
let label = format!("Face Trace {}", tid);
|
||||
|
||||
// Compute average pose for this trace
|
||||
let mut yaw_sum = 0.0f64;
|
||||
let mut pitch_sum = 0.0f64;
|
||||
let mut roll_sum = 0.0f64;
|
||||
let mut pose_count = 0i64;
|
||||
|
||||
if let Some(frames) = trace_frames.get(&tid) {
|
||||
for (frame, x, y, w, h) in frames {
|
||||
if let Some((yaw, pitch, roll)) = get_pose_for_face(*frame, *x, *y, *w, *h, pose_data) {
|
||||
yaw_sum += yaw;
|
||||
pitch_sum += pitch;
|
||||
roll_sum += roll;
|
||||
pose_count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let (avg_yaw, avg_pitch, avg_roll) = if pose_count > 0 {
|
||||
(yaw_sum / pose_count as f64, pitch_sum / pose_count as f64, roll_sum / pose_count as f64)
|
||||
} else {
|
||||
(0.0, 0.0, 0.0)
|
||||
};
|
||||
|
||||
let props = serde_json::json!({
|
||||
"frame_count": row.frame_count,
|
||||
"start_frame": row.start_f,
|
||||
@@ -166,7 +298,11 @@ async fn build_face_trace_nodes(pool: &PgPool, file_uuid: &str) -> Result<usize>
|
||||
"y": row.avg_y.unwrap_or(0.0).round() as i64,
|
||||
"width": row.avg_w.unwrap_or(0.0).round() as i64,
|
||||
"height": row.avg_h.unwrap_or(0.0).round() as i64,
|
||||
}
|
||||
},
|
||||
"avg_yaw": (avg_yaw * 1000.0).round() / 1000.0,
|
||||
"avg_pitch": (avg_pitch * 1000.0).round() / 1000.0,
|
||||
"avg_roll": (avg_roll * 1000.0).round() / 1000.0,
|
||||
"pose_count": pose_count,
|
||||
});
|
||||
|
||||
sqlx::query(&format!(
|
||||
@@ -312,7 +448,7 @@ async fn build_co_occurrence_edges(
|
||||
let edges_table = t("tkg_edges");
|
||||
|
||||
let face_rows = sqlx::query_as::<_, FaceDetectionRow>(&format!(
|
||||
r#"SELECT trace_id, frame_number, x, y, width, height
|
||||
r#"SELECT trace_id::bigint, frame_number::bigint, x::float8, y::float8, width::float8, height::float8
|
||||
FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL
|
||||
ORDER BY frame_number"#,
|
||||
face_table
|
||||
@@ -429,7 +565,7 @@ async fn build_speaker_face_edges(
|
||||
let edges_table = t("tkg_edges");
|
||||
|
||||
let traces = sqlx::query_as::<_, (i64, i64, i64)>(&format!(
|
||||
r#"SELECT trace_id, MIN(frame_number) as start_f, MAX(frame_number) as end_f
|
||||
r#"SELECT trace_id::bigint, MIN(frame_number)::bigint as start_f, MAX(frame_number)::bigint as end_f
|
||||
FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL
|
||||
GROUP BY trace_id"#,
|
||||
face_table
|
||||
@@ -533,14 +669,15 @@ async fn build_speaker_face_edges(
|
||||
Ok(edge_count)
|
||||
}
|
||||
|
||||
async fn build_face_face_edges(pool: &PgPool, file_uuid: &str) -> Result<usize> {
|
||||
async fn build_face_face_edges(pool: &PgPool, file_uuid: &str, pose_data: &[FacePose]) -> Result<usize> {
|
||||
let face_table = t("face_detections");
|
||||
let nodes_table = t("tkg_nodes");
|
||||
let edges_table = t("tkg_edges");
|
||||
|
||||
// Use SQL JOIN for fast co-occurrence detection
|
||||
let rows: Vec<(i64, i64, i64)> = sqlx::query_as(&format!(
|
||||
r#"
|
||||
SELECT a.trace_id AS tid_a, b.trace_id AS tid_b, a.frame_number
|
||||
SELECT a.trace_id::bigint AS tid_a, b.trace_id::bigint AS tid_b, a.frame_number::bigint
|
||||
FROM {} a
|
||||
JOIN {} b
|
||||
ON a.file_uuid = b.file_uuid
|
||||
@@ -557,53 +694,123 @@ async fn build_face_face_edges(pool: &PgPool, file_uuid: &str) -> Result<usize>
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
|
||||
if rows.is_empty() {
|
||||
return Ok(0);
|
||||
// Also load per-frame bbox for mutual_gaze lookups
|
||||
let bbox_data: Vec<(i64, i64, f64, f64, f64, f64)> = sqlx::query_as(
|
||||
&format!(
|
||||
"SELECT trace_id::bigint, frame_number::bigint, x::float8, y::float8, width::float8, height::float8 \
|
||||
FROM {} WHERE file_uuid = $1 AND trace_id IS NOT NULL ORDER BY trace_id, frame_number",
|
||||
face_table
|
||||
)
|
||||
)
|
||||
.bind(file_uuid)
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
|
||||
let mut frame_map: HashMap<(i64, i64), (f64, f64, f64, f64)> = HashMap::new(); // (trace_id, frame) → (x, y, w, h)
|
||||
for (tid, frame, x, y, w, h) in &bbox_data {
|
||||
frame_map.insert((*tid, *frame), (*x, *y, *w, *h));
|
||||
}
|
||||
|
||||
// Deduplicate by pair
|
||||
let mut pair_frames: HashMap<(i64, i64), Vec<i64>> = HashMap::new();
|
||||
// Group by pair
|
||||
let mut pair_frames: HashMap<(i64, i64), Vec<(i64, bool)>> = HashMap::new();
|
||||
for (tid_a, tid_b, frame) in &rows {
|
||||
let key = if *tid_a < *tid_b {
|
||||
(*tid_a, *tid_b)
|
||||
} else {
|
||||
(*tid_b, *tid_a)
|
||||
let key = (*tid_a.min(tid_b), *tid_a.max(tid_b));
|
||||
let bbox_a = frame_map.get(&(*tid_a, *frame));
|
||||
let bbox_b = frame_map.get(&(*tid_b, *frame));
|
||||
|
||||
let gaze = match (bbox_a, bbox_b) {
|
||||
(Some(&(xa, ya, wa, ha)), Some(&(xb, yb, wb, hb))) => {
|
||||
get_pose_for_face(*frame, xa, ya, wa, ha, pose_data)
|
||||
.and_then(|(yaw_a, _, _)| {
|
||||
get_pose_for_face(*frame, xb, yb, wb, hb, pose_data)
|
||||
.map(|(yaw_b, _, _)| detect_mutual_gaze(xa, wa, yaw_a, xb, wb, yaw_b, 0.05))
|
||||
})
|
||||
.unwrap_or(false)
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
pair_frames.entry(key).or_default().push(*frame);
|
||||
pair_frames.entry(key).or_default().push((*frame, gaze));
|
||||
}
|
||||
|
||||
let mut edge_count = 0;
|
||||
for ((tid_a, tid_b), frames) in &pair_frames {
|
||||
// Cache node IDs to avoid repeated queries
|
||||
let mut node_id_cache: HashMap<i64, i64> = HashMap::new();
|
||||
for ((tid_a, tid_b), frame_data) in &pair_frames {
|
||||
let ext_a = format!("trace_{}", tid_a);
|
||||
let ext_b = format!("trace_{}", tid_b);
|
||||
|
||||
let n_a: Option<(i64,)> = sqlx::query_as(&format!(
|
||||
"SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2",
|
||||
nodes_table
|
||||
))
|
||||
.bind(file_uuid)
|
||||
.bind(&ext_a)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
|
||||
let n_b: Option<(i64,)> = sqlx::query_as(&format!(
|
||||
"SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2",
|
||||
nodes_table
|
||||
))
|
||||
.bind(file_uuid)
|
||||
.bind(&ext_b)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
|
||||
let (n_a_id, n_b_id) = match (n_a, n_b) {
|
||||
(Some((a,)), Some((b,))) => (a, b),
|
||||
_ => continue,
|
||||
let n_a_id = match node_id_cache.get(tid_a) {
|
||||
Some(id) => *id,
|
||||
None => {
|
||||
if let Some((id,)) = sqlx::query_as::<_, (i64,)>(&format!(
|
||||
"SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2",
|
||||
nodes_table
|
||||
))
|
||||
.bind(file_uuid).bind(&ext_a).fetch_optional(pool).await?
|
||||
{
|
||||
node_id_cache.insert(*tid_a, id);
|
||||
id
|
||||
} else { continue; }
|
||||
}
|
||||
};
|
||||
|
||||
let edge_props = serde_json::json!({
|
||||
"first_frame": frames[0],
|
||||
"frame_count": frames.len() as i64,
|
||||
});
|
||||
let n_b_id = match node_id_cache.get(tid_b) {
|
||||
Some(id) => *id,
|
||||
None => {
|
||||
if let Some((id,)) = sqlx::query_as::<_, (i64,)>(&format!(
|
||||
"SELECT id FROM {} WHERE file_uuid=$1 AND node_type='face_trace' AND external_id=$2",
|
||||
nodes_table
|
||||
))
|
||||
.bind(file_uuid).bind(&ext_b).fetch_optional(pool).await?
|
||||
{
|
||||
node_id_cache.insert(*tid_b, id);
|
||||
id
|
||||
} else { continue; }
|
||||
}
|
||||
};
|
||||
|
||||
let frames: Vec<i64> = frame_data.iter().map(|(f, _)| *f).collect();
|
||||
let gaze_frames: Vec<i64> = frame_data.iter().filter(|(_, g)| *g).map(|(f, _)| *f).collect();
|
||||
let gaze_count = gaze_frames.len() as i64;
|
||||
let has_gaze = gaze_count > 0;
|
||||
|
||||
let edge_props = if has_gaze {
|
||||
// Compute average yaw values for gaze frames
|
||||
let mut yaw_a_sum = 0.0f64;
|
||||
let mut yaw_b_sum = 0.0f64;
|
||||
let mut gaze_sample = 0i64;
|
||||
for (frame, _) in frame_data.iter().filter(|(_, g)| *g) {
|
||||
let bbox_a = frame_map.get(&(*tid_a, *frame));
|
||||
let bbox_b = frame_map.get(&(*tid_b, *frame));
|
||||
if let (Some(&(xa, ya, wa, ha)), Some(&(xb, yb, wb, hb))) = (bbox_a, bbox_b) {
|
||||
let pose_a = get_pose_for_face(*frame, xa, ya, wa, ha, pose_data);
|
||||
let pose_b = get_pose_for_face(*frame, xb, yb, wb, hb, pose_data);
|
||||
if let (Some((ya, _, _)), Some((yb, _, _))) = (pose_a, pose_b) {
|
||||
yaw_a_sum += ya;
|
||||
yaw_b_sum += yb;
|
||||
gaze_sample += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
let (avg_ya, avg_yb) = if gaze_sample > 0 {
|
||||
(yaw_a_sum / gaze_sample as f64, yaw_b_sum / gaze_sample as f64)
|
||||
} else { (0.0, 0.0) };
|
||||
|
||||
serde_json::json!({
|
||||
"first_frame": frames[0],
|
||||
"frame_count": frames.len() as i64,
|
||||
"mutual_gaze": true,
|
||||
"gaze_frame_count": gaze_count,
|
||||
"yaw_a_avg": (avg_ya * 1000.0).round() / 1000.0,
|
||||
"yaw_b_avg": (avg_yb * 1000.0).round() / 1000.0,
|
||||
})
|
||||
} else {
|
||||
serde_json::json!({
|
||||
"first_frame": frames[0],
|
||||
"frame_count": frames.len() as i64,
|
||||
"mutual_gaze": false,
|
||||
})
|
||||
};
|
||||
|
||||
sqlx::query(&format!(
|
||||
r#"
|
||||
|
||||
Reference in New Issue
Block a user