Files
momentry_core/src/api/agent_search.rs
Accusys 17e4e15860 feat: add Vision LLM integration (CLIP + Qwen3-VL cascade)
- Add Qwen3-VL dynamic management (start/stop/status CLI)
- Add CLIP + Qwen3-VL cascade detection strategy
- Add Vision CLI commands (vision start/stop/status, detect)
- Add cascade_vision processor module
- Add clip processor module
- Add qwen_vl_manager module

Changes:
- scripts/start_qwen3vl.sh, stop_qwen3vl.sh: Qwen3-VL management scripts
- src/core/vision/: Qwen3-VL manager module
- src/core/processor/cascade_vision.rs: CLIP + Qwen3-VL cascade logic
- src/core/processor/clip.rs: CLIP classification and detection
- src/api/clip_api.rs: CLIP API endpoints
- src/cli/vision.rs: Vision CLI implementation
- src/cli/args.rs: Add Vision and Detect commands
- src/main.rs: Integrate Vision CLI
- src/core/mod.rs: Add vision module
- src/core/processor/mod.rs: Add cascade_vision module
2026-06-13 16:25:52 +08:00

1010 lines
44 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
use axum::{extract::State, http::StatusCode, response::Json, routing::post, Router};
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Mutex;
use std::time::Instant;
use crate::api::types::AppState;
use crate::core::db::schema;
use crate::core::llm::function_calling::{
self, call_llm_vision, ChatMessage, LlmResponse, ToolCall, ToolDef,
};
use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
// ── Conversation Manager ─────────────────────────────────────────
/// In-memory state kept for one agent chat session.
struct Conversation {
/// Message history stored for the session (replaced wholesale by `save_messages`).
messages: Vec<ChatMessage>,
/// Instant at which the entry was inserted into the registry.
created_at: Instant,
/// Instant of the last read or write; the cleanup thread drops entries idle for 1800 s or more.
last_active: Instant,
}
/// Process-wide registry of live conversations, keyed by conversation id.
///
/// Initialising the registry also starts a detached background thread that
/// wakes every 60 seconds and evicts conversations whose `last_active` is
/// 1800 seconds or more in the past.
static CONVERSATIONS: Lazy<Mutex<HashMap<String, Conversation>>> = Lazy::new(|| {
    // Eviction worker; it sleeps first, so the registry is fully initialised
    // by the time it takes the lock.
    std::thread::spawn(|| {
        let sweep_interval = std::time::Duration::from_secs(60);
        loop {
            std::thread::sleep(sweep_interval);
            let mut registry = CONVERSATIONS.lock().unwrap();
            let sweep_time = Instant::now();
            registry.retain(|_, entry| {
                sweep_time.duration_since(entry.last_active).as_secs() < 1800
            });
        }
    });
    Mutex::new(HashMap::new())
});
/// Looks up a conversation by id, creating a new one when needed.
///
/// Returns the conversation id together with a copy of its stored messages.
/// When `conv_id` is `None` or not present in the registry, a fresh entry is
/// inserted under a new id made of the first 16 hex digits of a v4 UUID, and
/// an empty history is returned.
fn get_or_create_conv(conv_id: Option<&str>) -> (String, Vec<ChatMessage>) {
    let mut registry = CONVERSATIONS.lock().unwrap();

    // Known id: refresh the idle timer and hand back a snapshot of the history.
    if let Some(cid) = conv_id {
        match registry.get_mut(cid) {
            Some(existing) => {
                existing.last_active = Instant::now();
                return (cid.to_string(), existing.messages.clone());
            }
            None => {}
        }
    }

    // Unknown or missing id: mint a new one (hyphens dropped, first 16 chars kept).
    let fresh_id: String = uuid::Uuid::new_v4()
        .to_string()
        .chars()
        .filter(|c| *c != '-')
        .take(16)
        .collect();
    let fresh = Conversation {
        messages: Vec::new(),
        created_at: Instant::now(),
        last_active: Instant::now(),
    };
    registry.insert(fresh_id.clone(), fresh);
    (fresh_id, Vec::new())
}
/// Replaces the stored history of `conv_id` with `messages` and refreshes its
/// idle timer. Does nothing when the id is not in the registry.
fn save_messages(conv_id: &str, messages: &[ChatMessage]) {
    let mut registry = CONVERSATIONS.lock().unwrap();
    match registry.get_mut(conv_id) {
        Some(entry) => {
            entry.messages = messages.to_vec();
            entry.last_active = Instant::now();
        }
        None => {}
    }
}
// ── Request / Response ───────────────────────────────────────────
/// JSON request body accepted by the agent search endpoint.
#[derive(Debug, Deserialize)]
pub struct AgentSearchRequest {
/// The user's question, passed to the tool loop as the user message.
pub query: String,
/// Id of an existing conversation to continue; a new conversation is created when absent or unknown.
pub conversation_id: Option<String>,
/// Optional file UUID supplied by the client.
/// NOTE(review): not read by the handler visible in this file — confirm whether it is still needed.
pub file_uuid: Option<String>,
}
/// JSON response body returned by the agent search endpoint.
#[derive(Debug, Serialize)]
pub struct AgentSearchResponse {
/// Success flag; the handler in this file always sets it to `true`.
pub success: bool,
/// Id the client should send back to continue this conversation.
pub conversation_id: String,
/// Final text produced by the tool loop (or its error / max-rounds message).
pub answer: String,
/// Follow-up hints for the user; omitted from the JSON when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub suggestions: Option<Vec<String>>,
/// One entry per executed tool call (`tool` name and raw `result`); omitted from the JSON when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub sources: Option<Vec<serde_json::Value>>,
}
// ── Tool Definitions ──────────────────────────────────────────────
/// System prompt sent to the LLM at the start of every agent conversation.
///
/// It describes (in Traditional Chinese) which tool to use for which kind of
/// question, when to ask the user a clarifying question, how to format the
/// answer, and when to stop calling tools.
///
/// The stop rule previously read "2 轪", a typo for "2 輪" (two rounds); it is
/// corrected here so the instruction is unambiguous to the model.
const SYSTEM_PROMPT: &str = r#"你是 Momentry 影片分析助手。回答用戶關於影片內容的問題。
## 工具使用規則
1. 先確認用戶在問哪部影片 — 使用 find_file 或 list_files
2. 人物問題優先使用 tkg_query
3. 人物台詞/發言問題使用 identities_search輸入人名→回傳台詞片段
4. 人物對話互動(誰跟誰說話)使用 tkg_query 的 speaker_interaction
5. 人物台詞內容使用 tkg_query 的 speaker_dialogue
6. 用文字反查人物使用 identity_text輸入關鍵字→找出誰說/提到這段話)
7. 語意/內容問題使用 smart_search 或 universal_search
8. 畫面分析使用 analyze_frame — 可以分析影片中的任何畫面內容(場景、人物表情、動作、物件等)
9. **可以同時呼叫多個工具,但需符合以下條件:**
- ✅ 查詢多部影片的相同資訊3部影片的人物列表
- ✅ 需要組合多個來源的資訊才能回答file_info + tkg_query
- ❌ 不要為了「嘗試所有可能」而盲目並行呼叫
- ❌ 如果單一工具已返回足夠答案,不需要額外呼叫
## 引導規則(優化版)
- **搜尋優先原則**
1. **所有問題都先嘗試搜尋,不要過早判斷用戶是否說了片名**
2. 根據搜尋結果和答案性質決定是否反問:
- **列举型問題**(找出所有、列出)→ ✅ 不反問,列出所有結果
- **指定型問題**(这部、那个)→ ⚠️ 反問選擇具體哪個
- **統計型問題**(多少、幾個)→ ✅ 不反問,統計所有結果
- **分析型問題**(分析、描述)→ ⚠️ 視問題表述決定
- **反問條件(精確)**
1. **答案需要分辨才反問**,不是「找到多部影片就反問」
2. 判断标准:
- ✅ 如果問題要求「所有」「列出」→ 答案不需要分辨 → 不反問
- ⚠️ 如果問題要求「这部」「那个」→ 答案需要分辨 → 反問
- ⚠️ 如果問題不明確 → 根據常理判断是否需要分辨
- **反問優化**
1. 反問時提供智能 suggestions依問題類型調整
2. 人物問題 → suggestions: ["演員名", "角色名", "年代"]
3. 內容問題 → suggestions: ["片名", "年代", "主題關鍵字"]
4. 畫面問題 → suggestions: ["片名", "時間範圍", "場景描述"]
- **特殊情況**
- 如果影片的 has_data 為 false → 不要推薦,引導選擇 has_data=true
- 如果搜尋結果直接包含答案 → 直接回答,不額外呼叫工具
- 如果找不到影片 → 反問提供更多資訊(片名、演員、年份)
- **回答格式**
- 不要輸出 JSON用自然語言回答
- 引用資料時附上具體數字frame 編號、時間秒數)
## 回答規則(優化版)
- 回答長度依問題類型調整:
- 簡單查詢(如「列出影片」)→ 簡潔列表回答1-2句
- 分析問題(如「描述情節」)→ 詳細回答3-5句
- 計數問題(如「有幾個場景」)→ 直接回答數字 + 簡短說明
- 回答格式:
- ✅ 如果找到影片,附上 file_uuid用戶之後可能需要
- ✅ 對於人物問題,說出角色名和演員名(如果有)
- ✅ 引用資料時附上具體數字frame 編號、時間秒數)
- ❌ 不要輸出 JSON 格式,用自然語言回答
- ❌ 不要編造資料,如果找不到就明確說「找不到」
## 停止規則(重要)
- **如果已經找到足夠資訊回答用戶問題,立即停止呼叫工具,直接回答**
- **如果連續 2 輪呼叫工具都返回空結果或相同資訊,停止並告知用戶「找不到更多相關資訊」**
- **如果用戶問題不明確或範圍過大,停止並反問用戶(提供 suggestions**
- **如果單一工具呼叫返回完整答案,不需要額外呼叫其他工具補充**
- **優化效率:避免重複呼叫相同工具或查詢相同內容**
- **成本控制:主動判斷是否需要繼續,不要盲目嘗試所有工具**"#;
/// Builds the list of tool definitions advertised to the LLM.
///
/// Each definition carries the tool name, a description, a JSON map of
/// parameter schemas and the list of required parameter names. The `pool`
/// argument is accepted but not used by this function.
fn make_tools(pool: &sqlx::PgPool) -> Vec<ToolDef> {
    // File discovery.
    let find_file = function_calling::make_tool(
        "find_file",
        "透過關鍵字搜尋影片(片名、演員、年份)。回傳符合的影片列表。",
        serde_json::json!({
            "query": {"type": "string", "description": "搜尋關鍵字(片名、演員名、年份)"}
        }),
        vec!["query"],
    );
    let list_files = function_calling::make_tool(
        "list_files",
        "列出近期註冊的影片。",
        serde_json::json!({
            "limit": {"type": "integer", "description": "回傳筆數上限", "default": 10}
        }),
        vec![],
    );

    // Temporal knowledge graph queries.
    let tkg_query = function_calling::make_tool(
        "tkg_query",
        "查詢影片的人物互動、配對、同框、台詞資料。query_type 包括top_identities人物排名、first_cooccurrence第一次同框、identity_details人物詳細、mutual_gaze互看、interaction_network互動網絡、identity_traces出場片段、file_info影片資訊、speaker_dialogue人物台詞、speaker_interaction兩人對話互動",
        serde_json::json!({
            "file_uuid": {"type": "string", "description": "影片 UUID"},
            "query_type": {
                "type": "string",
                "enum": ["top_identities", "first_cooccurrence", "identity_details", "mutual_gaze", "interaction_network", "identity_traces", "file_info", "speaker_dialogue", "speaker_interaction"],
                "description": "查詢類型"
            },
            "identity_name": {"type": "string", "description": "人物名稱(配合 identity_details / identity_traces / speaker_dialogue / speaker_interaction"},
            "identity_b": {"type": "string", "description": "第二人物名稱(配合 first_cooccurrence / mutual_gaze / speaker_interaction"},
            "limit": {"type": "integer", "default": 5}
        }),
        vec!["file_uuid", "query_type"],
    );

    // Text search tools.
    let smart_search = function_calling::make_tool(
        "smart_search",
        "語意搜尋 chunk 文字內容。適合需要理解意圖的查詢。",
        serde_json::json!({
            "file_uuid": {"type": "string", "description": "限制搜尋範圍(可選)"},
            "query": {"type": "string", "description": "搜尋關鍵字"},
            "limit": {"type": "integer", "default": 5}
        }),
        vec!["query"],
    );
    let identity_text = function_calling::make_tool(
        "identity_text",
        "搜尋文字關鍵字找出有提及該內容的影片人物。適合回答「誰說了OOO」、「誰跟OOO有關」。不是查詢人物的台詞而是用文字反查人物。",
        serde_json::json!({
            "q": {"type": "string", "description": "搜尋關鍵字(台詞片段、主題等)"},
            "file_uuid": {"type": "string", "description": "限制搜尋範圍(可選)"},
            "limit": {"type": "integer", "default": 10}
        }),
        vec!["q"],
    );
    let identities_search = function_calling::make_tool(
        "identities_search",
        "查詢特定人物的台詞/發言內容。輸入人物名稱,回傳該人物在影片中說過的話。適合回答「某某人說了什麼」、「某某人的台詞」。",
        serde_json::json!({
            "q": {"type": "string", "description": "人物名稱關鍵字(姓名、角色名、別名)"},
            "file_uuid": {"type": "string", "description": "限制搜尋範圍(可選)"},
            "limit": {"type": "integer", "default": 10}
        }),
        vec!["q"],
    );

    // Single-record lookups.
    let get_identity_detail = function_calling::make_tool(
        "get_identity_detail",
        "查詢單一身份的詳細資料名字、角色、TMDb 資訊)。",
        serde_json::json!({
            "name": {"type": "string", "description": "人物名稱"}
        }),
        vec!["name"],
    );
    let get_file_info = function_calling::make_tool(
        "get_file_info",
        "查詢影片基本資訊(片名、長度、解析度)。",
        serde_json::json!({
            "file_uuid": {"type": "string", "description": "影片 UUID"}
        }),
        vec!["file_uuid"],
    );

    // Frame-level tools.
    let get_representative_frame = function_calling::make_tool(
        "get_representative_frame",
        "查詢影片最具代表性的 frame 資訊frame 編號、時間、人物)。",
        serde_json::json!({
            "file_uuid": {"type": "string", "description": "影片 UUID"}
        }),
        vec!["file_uuid"],
    );
    let analyze_frame = function_calling::make_tool(
        "analyze_frame",
        "分析影片中指定畫面的視覺內容(場景、人物表情、動作、物件等)。若不指定 frame_number會使用代表性畫面。問題會傳給視覺 LLM 分析。",
        serde_json::json!({
            "file_uuid": {"type": "string", "description": "影片 UUID"},
            "question": {"type": "string", "description": "關於畫面的問題,例如「這個場景發生什麼事?」"},
            "frame_number": {"type": "integer", "description": "指定的 frame 編號(可選)"}
        }),
        vec!["file_uuid"],
    );

    vec![
        find_file,
        list_files,
        tkg_query,
        smart_search,
        identity_text,
        identities_search,
        get_identity_detail,
        get_file_info,
        get_representative_frame,
        analyze_frame,
    ]
}
// ── Tool Executors ───────────────────────────────────────────────
/// Executes the `find_file` tool.
///
/// Runs a case-insensitive substring match of the `query` argument against
/// video file names and returns up to 10 matches, newest first, as a JSON
/// string. Each match carries `file_uuid`, `file_name` and `has_data` (true
/// when at least one face detection row exists for the file). When nothing
/// matches, the JSON has `found: false` and a hint message.
///
/// # Errors
/// Returns the database error rendered as a string.
async fn exec_find_file(pool: &sqlx::PgPool, args: &serde_json::Value) -> Result<String, String> {
    let keyword = match args.get("query") {
        Some(value) => value.as_str().unwrap_or(""),
        None => "",
    };
    let videos = schema::table_name("videos");
    let fd_table = schema::table_name("face_detections");
    let pattern = format!("%{}%", keyword);

    let sql = format!(
        "SELECT v.file_uuid::text, v.file_name, \
         (SELECT COUNT(*) FROM {} fd WHERE fd.file_uuid = v.file_uuid) > 0 AS has_data \
         FROM {} v WHERE v.file_name ILIKE $1 \
         ORDER BY v.created_at DESC LIMIT 10",
        fd_table, videos
    );
    let matches: Vec<(String, String, bool)> = sqlx::query_as(&sql)
        .bind(&pattern)
        .fetch_all(pool)
        .await
        .map_err(|e| e.to_string())?;

    let payload = if matches.is_empty() {
        serde_json::json!({"found": false, "message": "No files match the query. Try different keywords."})
    } else {
        let mut files: Vec<serde_json::Value> = Vec::with_capacity(matches.len());
        for (uuid, file_name, has_data) in matches {
            files.push(serde_json::json!({"file_uuid": uuid, "file_name": file_name, "has_data": has_data}));
        }
        serde_json::json!({"found": true, "files": files})
    };
    Ok(payload.to_string())
}
/// Executes the `list_files` tool.
///
/// Returns the most recently created videos (at most `limit`, default 10) as
/// a JSON string whose `files` array holds `file_uuid`, `file_name` and
/// `has_data` (true when at least one face detection row exists).
///
/// # Errors
/// Returns the database error rendered as a string.
async fn exec_list_files(pool: &sqlx::PgPool, args: &serde_json::Value) -> Result<String, String> {
    let max_rows = match args.get("limit") {
        Some(value) => value.as_i64().unwrap_or(10),
        None => 10,
    };
    let videos = schema::table_name("videos");
    let fd_table = schema::table_name("face_detections");

    let sql = format!(
        "SELECT v.file_uuid::text, v.file_name, \
         (SELECT COUNT(*) FROM {} fd WHERE fd.file_uuid = v.file_uuid) > 0 AS has_data \
         FROM {} v ORDER BY v.created_at DESC LIMIT $1",
        fd_table, videos
    );
    let listed: Vec<(String, String, bool)> = sqlx::query_as(&sql)
        .bind(max_rows)
        .fetch_all(pool)
        .await
        .map_err(|e| e.to_string())?;

    let mut files: Vec<serde_json::Value> = Vec::with_capacity(listed.len());
    for (uuid, file_name, has_data) in listed {
        files.push(serde_json::json!({"file_uuid": uuid, "file_name": file_name, "has_data": has_data}));
    }
    Ok(serde_json::json!({"files": files}).to_string())
}
async fn exec_tkg_query(pool: &sqlx::PgPool, args: &serde_json::Value) -> Result<String, String> {
let file_uuid = args.get("file_uuid").and_then(|v| v.as_str()).unwrap_or("");
let query_type = args
.get("query_type")
.and_then(|v| v.as_str())
.unwrap_or("");
let identity_name = args.get("identity_name").and_then(|v| v.as_str());
let identity_b = args.get("identity_b").and_then(|v| v.as_str());
let limit = args.get("limit").and_then(|v| v.as_i64()).unwrap_or(5);
let id_table = schema::table_name("identities");
let fd_table = schema::table_name("face_detections");
let videos = schema::table_name("videos");
let nodes = schema::table_name("tkg_nodes");
let edges = schema::table_name("tkg_edges");
match query_type {
"top_identities" => {
let rows: Vec<(String, String, i64)> = sqlx::query_as(&format!(
"SELECT i.uuid::text, i.name, COUNT(fd.id)::bigint AS face_count \
FROM {} fd JOIN {} i ON i.id = fd.identity_id \
WHERE fd.file_uuid = $1 AND fd.identity_id IS NOT NULL AND i.source = 'tmdb' \
GROUP BY i.uuid, i.name ORDER BY face_count DESC LIMIT $2",
fd_table, id_table
))
.bind(file_uuid)
.bind(limit)
.fetch_all(pool)
.await
.map_err(|e| e.to_string())?;
Ok(serde_json::json!({"identities": rows}).to_string())
}
"first_cooccurrence" => {
let name_a = identity_name.unwrap_or("");
let name_b = identity_b.unwrap_or("");
let row: Option<(i64, f64)> = sqlx::query_as(&format!(
"SELECT MIN(fd_a.frame_number)::bigint, \
ROUND(MIN(fd_a.frame_number)::numeric / GREATEST(MAX(v.fps)::numeric, 25.0), 2)::float8 \
FROM {} fd_a JOIN {} fd_b ON fd_a.frame_number = fd_b.frame_number \
JOIN {} v ON v.file_uuid = $1 \
WHERE fd_a.file_uuid = $1 \
AND fd_a.identity_id = (SELECT id FROM {} WHERE name ILIKE $2 LIMIT 1) \
AND fd_b.identity_id = (SELECT id FROM {} WHERE name ILIKE $3 LIMIT 1)",
fd_table, fd_table, videos, id_table, id_table
))
.bind(file_uuid).bind(name_a).bind(name_b)
.fetch_optional(pool)
.await.map_err(|e| e.to_string())?;
Ok(serde_json::json!({"first_cooccurrence": row.map(|(f, t)| serde_json::json!({"frame": f, "timestamp_secs": t}))}).to_string())
}
"identity_details" => {
let name = identity_name.unwrap_or("");
let row: Option<(String, String, Option<i32>, i64)> = sqlx::query_as(&format!(
"SELECT i.uuid::text, i.name, i.tmdb_id, \
(SELECT COUNT(*) FROM {} fd WHERE fd.identity_id = i.id AND fd.file_uuid = $1)::bigint \
FROM {} i WHERE i.name ILIKE $2 LIMIT 1",
fd_table, id_table
))
.bind(file_uuid).bind(name)
.fetch_optional(pool)
.await.map_err(|e| e.to_string())?;
Ok(serde_json::json!({"identity": row.map(|(u, n, tid, fc)| serde_json::json!({"uuid": u, "name": n, "tmdb_id": tid, "face_count": fc}))}).to_string())
}
"mutual_gaze" => {
let name_a = identity_name.unwrap_or("");
let name_b = identity_b.unwrap_or("");
let row: Option<(i64, i64, f64, f64)> = sqlx::query_as(&format!(
"SELECT (e.properties->>'first_frame')::bigint, \
(e.properties->>'gaze_frame_count')::int::bigint, \
(e.properties->>'yaw_a_avg')::float8, \
(e.properties->>'yaw_b_avg')::float8 \
FROM {} e \
JOIN {} a ON a.id = e.source_node_id \
JOIN {} b ON b.id = e.target_node_id \
JOIN {} fd_a ON fd_a.file_uuid = $1 AND fd_a.trace_id = REPLACE(a.external_id, 'trace_', '')::int \
JOIN {} fd_b ON fd_b.file_uuid = $1 AND fd_b.trace_id = REPLACE(b.external_id, 'trace_', '')::int \
JOIN {} ia ON ia.id = fd_a.identity_id \
JOIN {} ib ON ib.id = fd_b.identity_id \
WHERE e.file_uuid = $1 AND ia.name ILIKE $2 AND ib.name ILIKE $3 \
AND e.properties->>'mutual_gaze' = 'true' LIMIT 1",
edges, nodes, nodes, fd_table, fd_table, id_table, id_table
))
.bind(file_uuid).bind(name_a).bind(name_b)
.fetch_optional(pool)
.await.map_err(|e| e.to_string())?;
Ok(serde_json::json!({"mutual_gaze": row.map(|(f, gc, ya, yb)| serde_json::json!({"first_frame": f, "gaze_frame_count": gc, "yaw_a": ya, "yaw_b": yb}))}).to_string())
}
"interaction_network" => {
let rows: Vec<(String, String, i64)> = sqlx::query_as(&format!(
"SELECT ia.name, ib.name, COUNT(*)::bigint \
FROM {} e \
JOIN {} a ON a.id = e.source_node_id \
JOIN {} b ON b.id = e.target_node_id \
JOIN {} fd_a ON fd_a.trace_id = REPLACE(a.external_id, 'trace_', '')::int AND fd_a.file_uuid = $1 \
JOIN {} fd_b ON fd_b.trace_id = REPLACE(b.external_id, 'trace_', '')::int AND fd_b.file_uuid = $1 \
JOIN {} ia ON ia.id = fd_a.identity_id \
JOIN {} ib ON ib.id = fd_b.identity_id \
WHERE e.file_uuid = $1 AND e.edge_type = 'CO_OCCURS_WITH' \
AND ia.name != ib.name AND ia.source = 'tmdb' AND ib.source = 'tmdb' \
GROUP BY ia.name, ib.name \
ORDER BY COUNT(*) DESC LIMIT $2",
edges, nodes, nodes, fd_table, fd_table, id_table, id_table
))
.bind(file_uuid).bind(limit)
.fetch_all(pool)
.await.map_err(|e| e.to_string())?;
Ok(serde_json::json!({"interaction_network": rows}).to_string())
}
"identity_traces" => {
let name = identity_name.unwrap_or("");
// MIN/MAX frame_number should be bigint (i64), not int
let rows: Vec<(i32, i64, i64, i64)> = sqlx::query_as(&format!(
"SELECT fd.trace_id, COUNT(*)::bigint, MIN(fd.frame_number)::bigint, MAX(fd.frame_number)::bigint \
FROM {} fd JOIN {} i ON i.id = fd.identity_id \
WHERE fd.file_uuid = $1 AND i.name ILIKE $2 \
GROUP BY fd.trace_id ORDER BY COUNT(*) DESC LIMIT $3",
fd_table, id_table
))
.bind(file_uuid).bind(name).bind(limit)
.fetch_all(pool)
.await.map_err(|e| e.to_string())?;
Ok(serde_json::json!({"traces": rows}).to_string())
}
"file_info" => {
let row: Option<(String, f64, i32, i32, f64)> = sqlx::query_as(&format!(
"SELECT file_name, duration, width, height, fps FROM {} WHERE file_uuid = $1",
videos
))
.bind(file_uuid)
.fetch_optional(pool)
.await
.map_err(|e| e.to_string())?;
Ok(serde_json::json!({"file_info": row.map(|(n, d, w, h, f)| serde_json::json!({"file_name": n, "duration_sec": d, "width": w, "height": h, "fps": f}))}).to_string())
}
"speaker_dialogue" => {
let name = identity_name.unwrap_or("");
let rows: Vec<(String, Option<String>)> = sqlx::query_as(&format!(
"SELECT DISTINCT sn.external_id, sn.properties->>'full_text' AS full_text \
FROM {} i \
JOIN {} fd ON fd.identity_id = i.id AND ($2::text IS NULL OR fd.file_uuid = $2) \
JOIN {} fn ON fn.file_uuid = fd.file_uuid \
AND fn.node_type = 'face_trace' \
AND fn.external_id = CONCAT('trace_', fd.trace_id) \
JOIN {} e ON e.source_node_id = fn.id \
AND e.edge_type = 'SPEAKS_AS' \
AND ($2::text IS NULL OR e.file_uuid = $2) \
JOIN {} sn ON sn.id = e.target_node_id \
WHERE i.name ILIKE $1 \
LIMIT $3",
id_table, fd_table, nodes, edges, nodes
))
.bind(name)
.bind(file_uuid)
.bind(limit)
.fetch_all(pool)
.await
.map_err(|e| e.to_string())?;
Ok(
serde_json::json!({"speakers": rows.iter().map(|(sid, text)| {
serde_json::json!({"speaker_id": sid, "dialogue": text})
}).collect::<Vec<_>>()})
.to_string(),
)
}
"speaker_interaction" => {
let name_a = identity_name.unwrap_or("");
let name_b = identity_b.unwrap_or("");
if name_a.is_empty() || name_b.is_empty() {
return Ok(
serde_json::json!({"error": "identity_name and identity_b are required"})
.to_string(),
);
}
// Get both speakers' segments from TKG
let rows: Vec<(String, String, serde_json::Value)> = sqlx::query_as(&format!(
"SELECT sn.external_id, sn.properties->>'full_text' AS full_text, sn.properties->'segments' AS segments \
FROM {} i \
JOIN {} fd ON fd.identity_id = i.id AND ($3::text IS NULL OR fd.file_uuid = $3) \
JOIN {} fn ON fn.file_uuid = fd.file_uuid \
AND fn.node_type = 'face_trace' \
AND fn.external_id = CONCAT('trace_', fd.trace_id) \
JOIN {} e ON e.source_node_id = fn.id \
AND e.edge_type = 'SPEAKS_AS' \
AND ($3::text IS NULL OR e.file_uuid = $3) \
JOIN {} sn ON sn.id = e.target_node_id \
WHERE (i.name ILIKE $1 OR i.name ILIKE $2) \
ORDER BY sn.external_id",
id_table, fd_table, nodes, edges, nodes
))
.bind(name_a)
.bind(name_b)
.bind(file_uuid)
.fetch_all(pool)
.await
.map_err(|e| e.to_string())?;
let mut interactions = Vec::new();
for i in 0..rows.len() {
for j in i + 1..rows.len() {
let (sid_a, text_a, segs_a_val) = &rows[i];
let (sid_b, text_b, segs_b_val) = &rows[j];
let segs_a = segs_a_val.as_array();
let segs_b = segs_b_val.as_array();
if let (Some(a_list), Some(b_list)) = (segs_a, segs_b) {
for sa in a_list {
let sa_start = sa.get("start").and_then(|v| v.as_f64()).unwrap_or(0.0);
let sa_end = sa.get("end").and_then(|v| v.as_f64()).unwrap_or(0.0);
let sa_text = sa.get("text").and_then(|v| v.as_str()).unwrap_or("");
if sa_text.is_empty() {
continue;
}
for sb in b_list {
let sb_start =
sb.get("start").and_then(|v| v.as_f64()).unwrap_or(0.0);
let sb_end = sb.get("end").and_then(|v| v.as_f64()).unwrap_or(0.0);
let sb_text = sb.get("text").and_then(|v| v.as_str()).unwrap_or("");
if sb_text.is_empty() {
continue;
}
// Check temporal overlap
let overlap_start = sa_start.max(sb_start);
let overlap_end = sa_end.min(sb_end);
if overlap_start < overlap_end {
interactions.push(serde_json::json!({
"speaker_a": sid_a,
"speaker_b": sid_b,
"time_range_s": [overlap_start, overlap_end],
"dialogue_a": sa_text,
"dialogue_b": sb_text,
}));
}
}
}
}
}
}
interactions.sort_by(|a, b| {
let a_start = a["time_range_s"][0].as_f64().unwrap_or(0.0);
let b_start = b["time_range_s"][0].as_f64().unwrap_or(0.0);
a_start.partial_cmp(&b_start).unwrap()
});
interactions.truncate(limit as usize);
Ok(serde_json::json!({"interactions": interactions, "speaker_a_text": rows.first().map(|r| r.1.clone()), "speaker_b_text": rows.get(1).map(|r| r.1.clone())}).to_string())
}
_ => Ok(
serde_json::json!({"error": format!("Unknown query_type: {}", query_type)}).to_string(),
),
}
}
/// Executes the `smart_search` tool.
///
/// Performs a case-insensitive substring search of `query` over chunk text,
/// optionally restricted to one `file_uuid`, ordered by `start_frame` and
/// capped at `limit` rows (default 5). Returns `{"results": [...]}` as a JSON
/// string; each result is `[chunk_id, text_content, start_frame, end_frame,
/// chunk_type]`.
///
/// The query is now a single statement with every value bound: the optional
/// file filter uses the `$n::text IS NULL OR ...` form already used by the
/// sibling executors, and `LIMIT` is a bound parameter instead of being
/// spliced into the SQL text. An empty `file_uuid` is treated as "no filter".
///
/// # Errors
/// Returns the database error rendered as a string.
async fn exec_smart_search(
    _pool: &sqlx::PgPool,
    args: &serde_json::Value,
) -> Result<String, String> {
    let query = args.get("query").and_then(|v| v.as_str()).unwrap_or("");
    // Models sometimes send "" for an optional argument; that means "all files".
    let file_uuid = args
        .get("file_uuid")
        .and_then(|v| v.as_str())
        .filter(|s| !s.is_empty());
    let limit = args.get("limit").and_then(|v| v.as_i64()).unwrap_or(5);
    let chunk_table = schema::table_name("chunk");
    let like = format!("%{}%", query);
    let sql = format!(
        "SELECT chunk_id, text_content, start_frame, end_frame, chunk_type \
         FROM {} WHERE text_content ILIKE $1 \
         AND ($2::text IS NULL OR file_uuid = $2) \
         ORDER BY start_frame LIMIT $3",
        chunk_table
    );
    let rows: Vec<(String, Option<String>, i64, i64, String)> = sqlx::query_as(&sql)
        .bind(&like)
        .bind(file_uuid)
        .bind(limit)
        .fetch_all(_pool)
        .await
        .map_err(|e| e.to_string())?;
    Ok(serde_json::json!({"results": rows}).to_string())
}
/// Executes the `identity_text` tool: finds which identified people appear in
/// chunks whose text contains the keyword `q`.
///
/// Optional `file_uuid` restricts the search to one video; `limit` defaults
/// to 10 and is capped at 50. Returns `{"results": [...]}` as a JSON string,
/// ordered by chunk start time.
///
/// The keyword is matched literally: `\`, `%` and `_` are escaped with a
/// backslash before being wrapped in `%...%`. The previous `%` -> `%%`
/// replacement did not escape anything (two wildcards in a row are still
/// wildcards). The identity `source` column is decoded as optional because
/// another query in this file already treats it as nullable.
///
/// # Errors
/// Returns the database error rendered as a string.
async fn exec_identity_text(
    pool: &sqlx::PgPool,
    args: &serde_json::Value,
) -> Result<String, String> {
    /// Escapes LIKE metacharacters so the text is matched literally.
    fn escape_like(raw: &str) -> String {
        let mut out = String::with_capacity(raw.len());
        for ch in raw.chars() {
            if matches!(ch, '\\' | '%' | '_') {
                out.push('\\');
            }
            out.push(ch);
        }
        out
    }

    let q = args.get("q").and_then(|v| v.as_str()).unwrap_or("");
    let file_uuid = args.get("file_uuid").and_then(|v| v.as_str());
    let limit = args
        .get("limit")
        .and_then(|v| v.as_i64())
        .unwrap_or(10)
        .min(50);
    let chunk_table = schema::table_name("chunk");
    let fd_table = schema::table_name("face_detections");
    let id_table = schema::table_name("identities");
    let like_q = format!("%{}%", escape_like(q));
    let sql = format!(
        "SELECT c.chunk_id, c.start_time, c.end_time, c.text_content, \
         i.name AS identity_name, fd.trace_id, i.source AS identity_source \
         FROM {} c \
         JOIN {} fd ON fd.file_uuid = c.file_uuid \
         AND fd.frame_number BETWEEN c.start_frame AND c.end_frame \
         AND fd.identity_id IS NOT NULL \
         JOIN {} i ON i.id = fd.identity_id \
         WHERE ($1::text IS NULL OR c.file_uuid = $1) \
         AND (LOWER(c.text_content) LIKE LOWER($2) OR LOWER(c.content::text) LIKE LOWER($2)) \
         ORDER BY c.start_time \
         LIMIT $3",
        chunk_table, fd_table, id_table
    );
    let rows: Vec<(
        String,
        f64,
        f64,
        Option<String>,
        String,
        Option<i32>,
        Option<String>,
    )> = sqlx::query_as(&sql)
        .bind(file_uuid)
        .bind(&like_q)
        .bind(limit)
        .fetch_all(pool)
        .await
        .map_err(|e| e.to_string())?;
    let results: Vec<serde_json::Value> = rows
        .iter()
        .map(|(chunk_id, st, et, txt, name, tid, src)| {
            serde_json::json!({
                "chunk_id": chunk_id,
                "start_time": st,
                "end_time": et,
                "text": txt,
                "identity_name": name,
                "trace_id": tid,
                "source": src
            })
        })
        .collect();
    Ok(serde_json::json!({"results": results}).to_string())
}
/// Executes the `identities_search` tool: returns the text chunks during
/// which a person matching `q` (by name or by an alias in
/// `metadata->'aliases'`) is on screen.
///
/// Optional `file_uuid` restricts the search to one video; `limit` defaults
/// to 10 and is capped at 50. Returns `{"results": [...]}` as a JSON string
/// with one entry per distinct (identity name, chunk) pair.
///
/// The name keyword is matched literally: `\`, `%` and `_` are escaped with a
/// backslash before being wrapped in `%...%`. The previous `%` -> `%%`
/// replacement left the wildcards active.
///
/// # Errors
/// Returns the database error rendered as a string.
async fn exec_identities_search(
    pool: &sqlx::PgPool,
    args: &serde_json::Value,
) -> Result<String, String> {
    /// Escapes LIKE metacharacters so the text is matched literally.
    fn escape_like(raw: &str) -> String {
        let mut out = String::with_capacity(raw.len());
        for ch in raw.chars() {
            if matches!(ch, '\\' | '%' | '_') {
                out.push('\\');
            }
            out.push(ch);
        }
        out
    }

    let q = args.get("q").and_then(|v| v.as_str()).unwrap_or("");
    let file_uuid = args.get("file_uuid").and_then(|v| v.as_str());
    let limit = args
        .get("limit")
        .and_then(|v| v.as_i64())
        .unwrap_or(10)
        .min(50);
    let id_table = schema::table_name("identities");
    let fd_table = schema::table_name("face_detections");
    let chunk_table = schema::table_name("chunk");
    let like_q = format!("%{}%", escape_like(q));
    let sql = format!(
        "SELECT DISTINCT ON (i.name, c.chunk_id) \
         i.name, c.chunk_id, c.start_time, c.end_time, c.text_content, fd.trace_id \
         FROM {} i \
         JOIN {} fd ON fd.identity_id = i.id \
         JOIN {} c ON c.file_uuid = fd.file_uuid \
         AND c.start_time <= fd.frame_number / COALESCE(c.fps, 25.0) \
         AND c.end_time >= fd.frame_number / COALESCE(c.fps, 25.0) \
         WHERE (i.name ILIKE $1 \
         OR EXISTS (SELECT 1 FROM jsonb_array_elements(i.metadata->'aliases') AS a WHERE a->>'name' ILIKE $1)) \
         AND ($2::text IS NULL OR fd.file_uuid = $2) \
         ORDER BY i.name, c.chunk_id, c.start_time \
         LIMIT $3",
        id_table, fd_table, chunk_table
    );
    let rows: Vec<(String, String, f64, f64, Option<String>, Option<i32>)> = sqlx::query_as(&sql)
        .bind(&like_q)
        .bind(file_uuid)
        .bind(limit)
        .fetch_all(pool)
        .await
        .map_err(|e| e.to_string())?;
    let results: Vec<serde_json::Value> = rows
        .iter()
        .map(|(name, chunk_id, st, et, txt, tid)| {
            serde_json::json!({
                "identity_name": name,
                "chunk_id": chunk_id,
                "start_time": st,
                "end_time": et,
                "text": txt,
                "trace_id": tid,
            })
        })
        .collect();
    Ok(serde_json::json!({"results": results}).to_string())
}
/// Executes the `get_identity_detail` tool.
///
/// Looks up the first identity whose name matches the `name` argument
/// (`ILIKE`) and returns `{"identity": {...}}` with `uuid`, `name`, `source`,
/// `tmdb_id` and `character`, or `{"identity": null}` when nothing matches.
///
/// # Errors
/// Returns the database error rendered as a string.
async fn exec_get_identity_detail(
    pool: &sqlx::PgPool,
    args: &serde_json::Value,
) -> Result<String, String> {
    let wanted = match args.get("name") {
        Some(value) => value.as_str().unwrap_or(""),
        None => "",
    };
    let id_table = schema::table_name("identities");
    let sql = format!(
        "SELECT uuid::text, name, source, tmdb_id, metadata->>'tmdb_character' FROM {} WHERE name ILIKE $1 LIMIT 1",
        id_table
    );
    let found: Option<(String, String, Option<String>, Option<i32>, Option<String>)> =
        sqlx::query_as(&sql)
            .bind(wanted)
            .fetch_optional(pool)
            .await
            .map_err(|e| e.to_string())?;

    let identity = match found {
        Some((uuid, name, source, tmdb_id, character)) => Some(
            serde_json::json!({"uuid": uuid, "name": name, "source": source, "tmdb_id": tmdb_id, "character": character}),
        ),
        None => None,
    };
    Ok(serde_json::json!({"identity": identity}).to_string())
}
/// Executes the `get_file_info` tool.
///
/// Returns `{"file_info": {...}}` with `file_name`, `duration_sec`, `width`,
/// `height` and `fps` for the given `file_uuid`, or `{"file_info": null}`
/// when the video does not exist.
///
/// The numeric columns are decoded as optional: `duration` and `fps` are
/// wrapped in `COALESCE` elsewhere in this file, so they can be NULL, and a
/// NULL previously turned the whole lookup into a decode error. Missing
/// values are now reported as JSON `null`.
///
/// # Errors
/// Returns the database error rendered as a string.
async fn exec_get_file_info(
    pool: &sqlx::PgPool,
    args: &serde_json::Value,
) -> Result<String, String> {
    let file_uuid = args.get("file_uuid").and_then(|v| v.as_str()).unwrap_or("");
    let videos = schema::table_name("videos");
    let row: Option<(String, Option<f64>, Option<i32>, Option<i32>, Option<f64>)> =
        sqlx::query_as(&format!(
            "SELECT file_name, duration, width, height, fps FROM {} WHERE file_uuid = $1",
            videos
        ))
        .bind(file_uuid)
        .fetch_optional(pool)
        .await
        .map_err(|e| e.to_string())?;
    let file_info = row.map(|(n, d, w, h, f)| {
        serde_json::json!({"file_name": n, "duration_sec": d, "width": w, "height": h, "fps": f})
    });
    Ok(serde_json::json!({"file_info": file_info}).to_string())
}
async fn exec_get_representative_frame(
pool: &sqlx::PgPool,
args: &serde_json::Value,
) -> Result<String, String> {
let file_uuid = args.get("file_uuid").and_then(|v| v.as_str()).unwrap_or("");
match crate::core::processor::tkg::query_auto_representative_frame(pool, file_uuid).await {
Ok(r) => Ok(serde_json::json!({
"frame_number": r.frame_number,
"face_quality": r.face_quality,
"main_identities": r.main_identities,
"traces": r.traces,
})
.to_string()),
Err(e) => Ok(serde_json::json!({"error": e.to_string()}).to_string()),
}
}
async fn exec_analyze_frame(
pool: &sqlx::PgPool,
args: &serde_json::Value,
) -> Result<String, String> {
let file_uuid = args.get("file_uuid").and_then(|v| v.as_str()).unwrap_or("");
let question = args
.get("question")
.and_then(|v| v.as_str())
.unwrap_or("請描述這個畫面中的內容");
if file_uuid.is_empty() {
return Ok(serde_json::json!({"error": "file_uuid is required"}).to_string());
}
let videos = schema::table_name("videos");
let (video_path, fps): (String, f64) = sqlx::query_as(&format!(
"SELECT file_path, COALESCE(fps, 25.0) FROM {} WHERE file_uuid = $1",
videos
))
.bind(file_uuid)
.fetch_optional(pool)
.await
.map_err(|e| e.to_string())?
.ok_or_else(|| "Video not found".to_string())?;
let frame_number = match args.get("frame_number").and_then(|v| v.as_i64()) {
Some(f) => f,
None => {
match crate::core::processor::tkg::query_auto_representative_frame(pool, file_uuid)
.await
{
Ok(r) => r.frame_number,
Err(_) => {
let duration: f64 = sqlx::query_scalar(&format!(
"SELECT COALESCE(duration, 0) FROM {} WHERE file_uuid = $1",
videos
))
.bind(file_uuid)
.fetch_optional(pool)
.await
.map_err(|e| e.to_string())?
.unwrap_or(0.0);
if duration > 0.0 {
((duration / 2.0) * fps) as i64
} else {
0
}
}
}
}
};
let timestamp_secs = frame_number as f64 / fps;
let ffmpeg_path = std::env::var("MOMENTRY_FFMPEG").unwrap_or_else(|_| {
let full = "/opt/homebrew/opt/ffmpeg-full/bin/ffmpeg";
if std::path::Path::new(full).exists() {
full.to_string()
} else {
"ffmpeg".to_string()
}
});
let output = tokio::process::Command::new(&ffmpeg_path)
.args([
"-ss",
&format!("{:.3}", timestamp_secs),
"-i",
&video_path,
"-vframes",
"1",
"-f",
"image2pipe",
"-vcodec",
"mjpeg",
"-",
])
.output()
.await
.map_err(|e| format!("ffmpeg execution error: {}", e))?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Ok(serde_json::json!({"error": format!("ffmpeg failed: {}", stderr)}).to_string());
}
let base64_img = BASE64.encode(&output.stdout);
let system_prompt =
"你是一個專業的影片畫面分析助手。請根據提供的畫面以及用戶的問題,詳細描述畫面中的內容,包括場景、人物、動作、表情、物件等。請用繁體中文回答。";
let vision_result = call_llm_vision(system_prompt, question, vec![base64_img], 1024, 120)
.await
.map_err(|e| e.to_string())?;
Ok(serde_json::json!({
"frame_number": frame_number,
"timestamp_secs": timestamp_secs,
"analysis": vision_result,
})
.to_string())
}
// ── Tool Router ───────────────────────────────────────────────────
async fn execute_tool(pool: &sqlx::PgPool, tool_call: &ToolCall) -> (String, String, String) {
let name = tool_call.function.name.clone();
let tool_call_id = tool_call.id.clone().unwrap_or_default();
let args: serde_json::Value =
match serde_json::from_str(&tool_call.function.arguments) {
Ok(v) => v,
Err(e) => return (tool_call_id, name, serde_json::json!({"error": format!("Invalid arguments: {}", e)}).to_string()),
};
let result = match name.as_str() {
"find_file" => exec_find_file(pool, &args).await,
"list_files" => exec_list_files(pool, &args).await,
"tkg_query" => exec_tkg_query(pool, &args).await,
"smart_search" => exec_smart_search(pool, &args).await,
"identity_text" => exec_identity_text(pool, &args).await,
"identities_search" => exec_identities_search(pool, &args).await,
"get_identity_detail" => exec_get_identity_detail(pool, &args).await,
"get_file_info" => exec_get_file_info(pool, &args).await,
"get_representative_frame" => exec_get_representative_frame(pool, &args).await,
"analyze_frame" => exec_analyze_frame(pool, &args).await,
_ => Err(format!("Unknown tool: {}", name)),
};
let content = match result {
Ok(s) => s,
Err(e) => serde_json::json!({"error": e}).to_string(),
};
(tool_call_id, name, content)
}
// ── Tool Loop ─────────────────────────────────────────────────────
/// Maximum number of LLM round-trips `run_tool_loop` performs for one request
/// before it gives up and returns its fixed "max rounds reached" message.
const MAX_ROUNDS: u32 = 15;
/// Drives the LLM tool-calling loop for one user query.
///
/// Starting from `system_prompt`, the prior `history` and `user_query`, the
/// model is called up to `MAX_ROUNDS` times. Each round either yields a final
/// text answer, which ends the loop, or a batch of tool calls, which are
/// executed in order and fed back to the model as tool results.
///
/// Returns `(answer, messages, sources)`:
/// - `answer`: the model's text, an error message if the LLM call failed, or
///   a fixed notice when the round limit is hit;
/// - `messages`: the full conversation, including the system prompt;
/// - `sources`: one `{"tool", "result"}` entry per executed tool call.
///
/// Changes relative to the previous version:
/// - the final assistant answer is now appended to `messages`. It used to be
///   returned but never recorded, so the saved history lacked the assistant's
///   reply and follow-up questions lost their context;
/// - the tool list is built once instead of being rebuilt every round.
async fn run_tool_loop(
    pool: &sqlx::PgPool,
    system_prompt: &str,
    user_query: &str,
    history: Vec<ChatMessage>,
) -> (String, Vec<ChatMessage>, Vec<serde_json::Value>) {
    let mut messages = function_calling::build_conversation(system_prompt, user_query, history);
    let mut sources = Vec::new();
    // The tool set does not change between rounds.
    let tools = make_tools(pool);
    for round in 0..MAX_ROUNDS {
        tracing::info!(
            "[AGENT] Round {} started, message_count: {}, tools_available: {}",
            round + 1,
            messages.len(),
            tools.len()
        );
        match function_calling::call_llm(messages.clone(), Some(tools.clone()), 2048, 120).await {
            Ok(LlmResponse::Text(text)) => {
                tracing::info!(
                    "[AGENT] Loop finished: rounds_used={}, total_tools_called={}, answer_length={} chars",
                    round + 1,
                    sources.len(),
                    text.len()
                );
                // Record the answer so the next turn of the conversation sees it.
                messages.push(ChatMessage {
                    role: "assistant".to_string(),
                    content: Some(text.clone()),
                    tool_calls: None,
                    tool_call_id: None,
                    name: None,
                });
                return (text, messages, sources);
            }
            Ok(LlmResponse::ToolCalls(calls)) => {
                // The assistant turn that requested the tools must precede their results.
                messages.push(ChatMessage {
                    role: "assistant".to_string(),
                    content: None,
                    tool_calls: Some(calls.clone()),
                    tool_call_id: None,
                    name: None,
                });
                for call in &calls {
                    let (tool_call_id, name, content) = execute_tool(pool, call).await;
                    tracing::info!(
                        "[AGENT] Tool called: {}, result_size: {} chars, round: {}",
                        name,
                        content.len(),
                        round + 1
                    );
                    sources.push(serde_json::json!({"tool": name, "result": content}));
                    messages.push(function_calling::make_tool_result(
                        &tool_call_id, &name, &content,
                    ));
                }
            }
            Err(e) => {
                tracing::error!("[AGENT] LLM call failed: {}", e);
                return (format!("系統錯誤:{}", e), messages, sources);
            }
        }
    }
    tracing::warn!(
        "[AGENT] Max rounds reached: rounds_used={}, total_tools_called={}",
        MAX_ROUNDS,
        sources.len()
    );
    (
        "已達到最大查詢次數,請縮小問題範圍後重新詢問。".to_string(),
        messages,
        sources,
    )
}
// ── Handler ───────────────────────────────────────────────────────
/// Handler for `POST /api/v1/agents/search`.
///
/// Resolves (or creates) the conversation named in the request, runs the
/// tool loop for the user's query, stores the resulting message history and
/// returns the answer together with the tool results used as sources.
///
/// When the answer contains a question mark (ASCII `?` or fullwidth `？`,
/// U+FF1F) it is assumed to be a clarifying question and a fixed list of
/// suggestions is attached.
///
/// The fullwidth question mark is written as an escape: the literal had been
/// reduced to an empty `''`, which is not a valid `char` literal.
async fn agent_search(
    State(state): State<AppState>,
    Json(req): Json<AgentSearchRequest>,
) -> Result<Json<AgentSearchResponse>, (StatusCode, Json<serde_json::Value>)> {
    let (conv_id, history) = get_or_create_conv(req.conversation_id.as_deref());
    let (answer, messages, sources) =
        run_tool_loop(state.db.pool(), SYSTEM_PROMPT, &req.query, history).await;
    // Save messages (skip system prompt — build_conversation re-adds it)
    let history: Vec<ChatMessage> = messages.into_iter().skip(1).collect();
    save_messages(&conv_id, &history);
    // U+FF1F is the fullwidth question mark used in CJK text.
    let needs_input = answer.contains('\u{FF1F}') || answer.contains('?');
    let suggestions = if needs_input {
        Some(vec![
            "演員名".to_string(),
            "電影片名".to_string(),
            "年份".to_string(),
        ])
    } else {
        None
    };
    Ok(Json(AgentSearchResponse {
        success: true,
        conversation_id: conv_id,
        answer,
        suggestions,
        sources: Some(sources),
    }))
}
// ── Routes ─────────────────────────────────────────────────────────
/// Builds the router exposing the agent search endpoint
/// (`POST /api/v1/agents/search`).
pub fn agent_search_routes() -> Router<AppState> {
    let search_handler = post(agent_search);
    Router::new().route("/api/v1/agents/search", search_handler)
}