feat: add Vision LLM integration (CLIP + Qwen3-VL cascade)

- Add Qwen3-VL dynamic management (start/stop/status CLI)
- Add CLIP + Qwen3-VL cascade detection strategy
- Add Vision CLI commands (vision start/stop/status, detect)
- Add cascade_vision processor module
- Add clip processor module
- Add qwen_vl_manager module

Changes:
- scripts/start_qwen3vl.sh, stop_qwen3vl.sh: Qwen3-VL management scripts
- src/core/vision/: Qwen3-VL manager module
- src/core/processor/cascade_vision.rs: CLIP + Qwen3-VL cascade logic
- src/core/processor/clip.rs: CLIP classification and detection
- src/api/clip_api.rs: CLIP API endpoints
- src/cli/vision.rs: Vision CLI implementation
- src/cli/args.rs: Add Vision and Detect commands
- src/main.rs: Integrate Vision CLI
- src/core/mod.rs: Add vision module
- src/core/processor/mod.rs: Add cascade_vision module
2026-06-13 16:25:52 +08:00
parent 834b0d4865
commit 17e4e15860
37 changed files with 2185 additions and 294 deletions
--- a/src/api/agent_search.rs
+++ b/src/api/agent_search.rs
@@ -91,19 +91,63 @@ const SYSTEM_PROMPT: &str = r#"你是 Momentry 影片分析助手。回答用戶
 6. 用文字反查人物使用 identity_text（輸入關鍵字→找出誰說/提到這段話）
 7. 語意/內容問題使用 smart_search 或 universal_search
 8. 畫面分析使用 analyze_frame — 可以分析影片中的任何畫面內容（場景、人物表情、動作、物件等）
-9. 可以同時呼叫多個工具
+9. **可以同時呼叫多個工具，但需符合以下條件：**
+   - ✅ 查詢多部影片的相同資訊（如：3部影片的人物列表）
+   - ✅ 需要組合多個來源的資訊才能回答（如：file_info + tkg_query）
+   - ❌ 不要為了「嘗試所有可能」而盲目並行呼叫
+   - ❌ 如果單一工具已返回足夠答案，不需要額外呼叫

-## 引導規則
- 如果用戶沒說片名 → 用 find_file 搜尋，如果名稱不明確就反問
- 反問時提供 suggestions，例如演員名、年代
- **如果影片的 has_data 為 false，代表尚未完成處理，不要推薦用戶使用。引導用戶選擇 has_data=true 的影片**
- 不要輸出 JSON，用自然語言回答
- 引用資料時附上具體數字（frame 編號、時間秒數）
+## 引導規則（優化版）
+- **搜尋優先原則**：
+  1. **所有問題都先嘗試搜尋，不要過早判斷用戶是否說了片名**
+  2. 根據搜尋結果和答案性質決定是否反問：
+     - **列舉型問題**（找出所有、列出）→ ✅ 不反問，列出所有結果
+     - **指定型問題**（這部、那個）→ ⚠️ 反問選擇具體哪個
+     - **統計型問題**（多少、幾個）→ ✅ 不反問，統計所有結果
+     - **分析型問題**（分析、描述）→ ⚠️ 視問題表述決定

-## 回答規則
- 回答要簡潔但完整
- 如果找到影片，附上 file_uuid（用戶之後可能需要）
- 對於人物問題，說出角色名和演員名"#;
+- **反問條件（精確）**：
+  1. **答案需要分辨才反問**，不是「找到多部影片就反問」
+  2. 判斷標準：
+     - ✅ 如果問題要求「所有」「列出」→ 答案不需要分辨 → 不反問
+     - ⚠️ 如果問題要求「這部」「那個」→ 答案需要分辨 → 反問
+     - ⚠️ 如果問題不明確 → 根據常理判斷是否需要分辨
+
+- **反問優化**：
+  1. 反問時提供智能 suggestions（依問題類型調整）
+  2. 人物問題 → suggestions: ["演員名", "角色名", "年代"]
+  3. 內容問題 → suggestions: ["片名", "年代", "主題關鍵字"]
+  4. 畫面問題 → suggestions: ["片名", "時間範圍", "場景描述"]
+
+- **特殊情況**：
+  - 如果影片的 has_data 為 false → 不要推薦，引導選擇 has_data=true
+  - 如果搜尋結果直接包含答案 → 直接回答，不額外呼叫工具
+  - 如果找不到影片 → 反問提供更多資訊（片名、演員、年份）
+
+- **回答格式**：
+  - 不要輸出 JSON，用自然語言回答
+  - 引用資料時附上具體數字（frame 編號、時間秒數）
+
+## 回答規則（優化版）
+- 回答長度依問題類型調整：
+  - 簡單查詢（如「列出影片」）→ 簡潔列表回答（1-2句）
+  - 分析問題（如「描述情節」）→ 詳細回答（3-5句）
+  - 計數問題（如「有幾個場景」）→ 直接回答數字 + 簡短說明
+
+- 回答格式：
+  - ✅ 如果找到影片，附上 file_uuid（用戶之後可能需要）
+  - ✅ 對於人物問題，說出角色名和演員名（如果有）
+  - ✅ 引用資料時附上具體數字（frame 編號、時間秒數）
+  - ❌ 不要輸出 JSON 格式，用自然語言回答
+  - ❌ 不要編造資料，如果找不到就明確說「找不到」
+
+## 停止規則（重要）
+- **如果已經找到足夠資訊回答用戶問題，立即停止呼叫工具，直接回答**
+- **如果連續 2 輪呼叫工具都返回空結果或相同資訊，停止並告知用戶「找不到更多相關資訊」**
+- **如果用戶問題不明確或範圍過大，停止並反問用戶（提供 suggestions）**
+- **如果單一工具呼叫返回完整答案，不需要額外呼叫其他工具補充**
+- **優化效率：避免重複呼叫相同工具或查詢相同內容**
+- **成本控制：主動判斷是否需要繼續，不要盲目嘗試所有工具**"#;

 fn make_tools(pool: &sqlx::PgPool) -> Vec<ToolDef> {
    vec![
@@ -825,8 +869,12 @@ async fn exec_analyze_frame(

 async fn execute_tool(pool: &sqlx::PgPool, tool_call: &ToolCall) -> (String, String, String) {
    let name = tool_call.function.name.clone();
+    let tool_call_id = tool_call.id.clone().unwrap_or_default();
    let args: serde_json::Value =
-        serde_json::from_str(&tool_call.function.arguments).unwrap_or_default();
+        match serde_json::from_str(&tool_call.function.arguments) {
+            Ok(v) => v,
+            Err(e) => return (tool_call_id, name, serde_json::json!({"error": format!("Invalid arguments: {}", e)}).to_string()),
+        };
    let result = match name.as_str() {
        "find_file" => exec_find_file(pool, &args).await,
        "list_files" => exec_list_files(pool, &args).await,
@@ -844,31 +892,42 @@ async fn execute_tool(pool: &sqlx::PgPool, tool_call: &ToolCall) -> (String, Str
        Ok(s) => s,
        Err(e) => serde_json::json!({"error": e}).to_string(),
    };
-    let tool_call_id = tool_call.id.clone().unwrap_or_default();
    (tool_call_id, name, content)
 }

 // ── Tool Loop ─────────────────────────────────────────────────────

-const MAX_ROUNDS: u32 = 5;
+const MAX_ROUNDS: u32 = 15;

 async fn run_tool_loop(
    pool: &sqlx::PgPool,
    system_prompt: &str,
    user_query: &str,
    history: Vec<ChatMessage>,
-) -> (String, Vec<serde_json::Value>) {
+) -> (String, Vec<ChatMessage>, Vec<serde_json::Value>) {
    let mut messages = function_calling::build_conversation(system_prompt, user_query, history);
    let mut sources = Vec::new();

    for round in 0..MAX_ROUNDS {
-        let tools = Some(make_tools(pool));
-        match function_calling::call_llm(messages.clone(), tools, 2048, 120).await {
+        let tools = make_tools(pool);
+        tracing::info!(
+            "[AGENT] Round {} started, message_count: {}, tools_available: {}",
+            round + 1,
+            messages.len(),
+            tools.len()
+        );
+        
+        match function_calling::call_llm(messages.clone(), Some(tools.clone()), 2048, 120).await {
            Ok(LlmResponse::Text(text)) => {
-                return (text, sources);
+                tracing::info!(
+                    "[AGENT] Loop finished: rounds_used={}, total_tools_called={}, answer_length={} chars",
+                    round + 1,
+                    sources.len(),
+                    text.len()
+                );
+                return (text, messages, sources);
            }
            Ok(LlmResponse::ToolCalls(calls)) => {
-                // Push assistant message with tool_calls so Gemma4 remembers
                messages.push(ChatMessage {
                    role: "assistant".to_string(),
                    content: None,
@@ -878,21 +937,32 @@ async fn run_tool_loop(
                });
                for call in &calls {
                    let (tool_call_id, name, content) = execute_tool(pool, call).await;
+                    tracing::info!(
+                        "[AGENT] Tool called: {}, result_size: {} chars, round: {}",
+                        name,
+                        content.len(),
+                        round + 1
+                    );
                    sources.push(serde_json::json!({"tool": name, "result": content}));
                    messages.push(function_calling::make_tool_result(
-                        &tool_call_id,
-                        &name,
-                        &content,
+                        &tool_call_id, &name, &content,
                    ));
                }
            }
            Err(e) => {
-                return (format!("系統錯誤：{}", e), sources);
+                tracing::error!("[AGENT] LLM call failed: {}", e);
+                return (format!("系統錯誤：{}", e), messages, sources);
            }
        }
    }
+    tracing::warn!(
+        "[AGENT] Max rounds reached: rounds_used={}, total_tools_called={}",
+        MAX_ROUNDS,
+        sources.len()
+    );
    (
        "已達到最大查詢次數，請縮小問題範圍後重新詢問。".to_string(),
+        messages,
        sources,
    )
 }
@@ -905,12 +975,12 @@ async fn agent_search(
 ) -> Result<Json<AgentSearchResponse>, (StatusCode, Json<serde_json::Value>)> {
    let (conv_id, history) = get_or_create_conv(req.conversation_id.as_deref());

-    let (answer, sources) =
+    let (answer, messages, sources) =
        run_tool_loop(state.db.pool(), SYSTEM_PROMPT, &req.query, history).await;

-    // Save updated messages for conversation continuation
-    let new_msgs = function_calling::build_conversation(SYSTEM_PROMPT, &req.query, vec![]);
-    save_messages(&conv_id, &new_msgs);
+    // Save messages (skip system prompt — build_conversation re-adds it)
+    let history: Vec<ChatMessage> = messages.into_iter().skip(1).collect();
+    save_messages(&conv_id, &history);

    let needs_input = answer.contains('？') || answer.contains('?');
    let suggestions = if needs_input {