From f65ac89e6aa6c4fafbf6a6af2ae92ee0b2334d27 Mon Sep 17 00:00:00 2001 From: Warren Date: Wed, 6 May 2026 17:13:32 +0800 Subject: [PATCH] deploy: Gemma 4 31B llama-server running on M5 Max (192.168.110.201:8081) --- .../DEPLOY/GEM4_LLM_DEPLOY_PLAN_V1.0.0.md | 159 +++++++++++++++++ .../results/exp_008/labels.json | 160 +++++++++++++----- .../results/exp_008/metrics.json | 2 +- experiments/identity_clustering/runner_v2.py | 30 +++- 4 files changed, 308 insertions(+), 43 deletions(-) create mode 100644 docs_v1.0/API_V1.0.0/DEPLOY/GEM4_LLM_DEPLOY_PLAN_V1.0.0.md diff --git a/docs_v1.0/API_V1.0.0/DEPLOY/GEM4_LLM_DEPLOY_PLAN_V1.0.0.md b/docs_v1.0/API_V1.0.0/DEPLOY/GEM4_LLM_DEPLOY_PLAN_V1.0.0.md new file mode 100644 index 0000000..dcdc309 --- /dev/null +++ b/docs_v1.0/API_V1.0.0/DEPLOY/GEM4_LLM_DEPLOY_PLAN_V1.0.0.md @@ -0,0 +1,159 @@ +--- +document_type: "deployment_plan" +service: "MOMENTRY_CORE" +title: "Gemma 4 LLM 部署計劃 — M5 Max MacBook Pro" +date: "2026-05-06" +version: "V1.0" +status: "draft" +owner: "Warren" +created_by: "OpenCode" +--- + +# Gemma 4 LLM 部署計劃 — M5 Max + +## 1. 環境 + +| 項目 | 規格 | +|------|------| +| 機型 | MacBook Pro M5 Max | +| 統一記憶體 | 48 GB | +| 架構 | arm64 (Apple Silicon) | +| SSH | `accusys@10.10.10.10` | +| 外網 | ❌ 無(需透過本機 scp) | +| 本機 | M4 Mac,有外網,已有 llama.cpp | + +## 2. 模型選擇 + +| 版本 | 參數 | Q5_K_M 大小 | 預估速度 | 備註 | +|------|------|------------|---------|------| +| **Gemma 4 31B-it** | 33B | ~20 GB | 15-25 tok/s | 多模態,可處理圖像 | +| Gemma 4 26B-A4B-it | 27B MoE | ~15 GB | 25-40 tok/s | MoE,更快 | +| Gemma 4 E4B-it | 8B | ~5 GB | 60+ tok/s | 最快,品質較低 | + +**推薦**: Gemma 4 31B-it (Q5_K_M)。48GB 記憶體綽綽有餘。 + +## 3. 
部署步驟 + +### Step 1: 本機下載模型 + +```bash +# 登入 HuggingFace(需 access token) +huggingface-cli login + +# 下載 Gemma 4 31B GGUF +huggingface-cli download bartowski/gemma-4-31B-it-GGUF \ + gemma-4-31b-it-Q5_K_M.gguf \ + --local-dir ~/llama.cpp/models/ +``` + +### Step 2: 準備 llama.cpp binary + +```bash +# llama.cpp 已安裝於本機 /opt/homebrew/bin/llama-server +# 收集依賴的 dylib +mkdir -p /tmp/llama_bundle/bin /tmp/llama_bundle/lib +cp /opt/homebrew/bin/llama-server /tmp/llama_bundle/bin/ +cp /opt/homebrew/lib/libggml*.dylib /tmp/llama_bundle/lib/ +cp /opt/homebrew/lib/libllama*.dylib /tmp/llama_bundle/lib/ +``` + +### Step 3: scp 到 M5 Max + +```bash +# 傳送 binary +scp -r /tmp/llama_bundle/* accusys@10.10.10.10:~/bin/ + +# 傳送模型 +scp ~/llama.cpp/models/gemma-4-31b-it-Q5_K_M.gguf \ + accusys@10.10.10.10:~/models/ + +# 替代方案:改用 rsync 傳送模型(與上方 scp 擇一即可;加 --partial 可續傳) +rsync -avz --progress ~/llama.cpp/models/ accusys@10.10.10.10:~/models/ +``` + +### Step 4: M5 Max 上啟動 + +```bash +ssh accusys@10.10.10.10 + +# 設定 library path +export DYLD_LIBRARY_PATH=~/bin/lib:$DYLD_LIBRARY_PATH + +# 啟動 llama-server +~/bin/bin/llama-server \ + -m ~/models/gemma-4-31b-it-Q5_K_M.gguf \ + --host 0.0.0.0 \ + --port 8081 \ + --n-gpu-layers 999 \ + --ctx-size 8192 \ + --threads 10 \ + --parallel 2 \ + --mlock +``` + +## 4. 記憶體分配 + +``` +48 GB total + ├─ 20 GB Gemma 4 31B Q5_K_M + ├─ 4 GB PostgreSQL + ├─ 1 GB Redis + ├─ 1 GB MongoDB + Qdrant + ├─ 2 GB swift_face / face_processor (burst) + ├─ 3 GB llama-server overhead + └─ 17 GB 剩餘 (OS + buffer) +``` + +## 5. Momentry 整合 + +更新 `.env` 或 config: + +```bash +MOMENTRY_LLM_ENDPOINT=http://10.10.10.10:8081/v1 +MOMENTRY_LLM_MODEL=gemma-4-31b-it +``` + +Agent 端點改用 LLM: +- `POST /api/v1/agents/translate` → llama.cpp server +- `POST /api/v1/agents/identity/suggest` → llama.cpp server +- `POST /api/v1/agents/5w1h/analyze` → llama.cpp server +- `POST /api/v1/agents/suggest/merge` → llama.cpp server + +## 6. 
測試驗證 + +```bash +# Health check +curl http://10.10.10.10:8081/health + +# Inference test +curl http://10.10.10.10:8081/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gemma-4-31b-it", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 100 + }' +``` + +## 7. 啟動腳本(M5 Max 上) + +```bash +#!/bin/bash +# ~/start_llm.sh +export DYLD_LIBRARY_PATH=~/bin/lib:$DYLD_LIBRARY_PATH +exec ~/bin/bin/llama-server \ + -m ~/models/gemma-4-31b-it-Q5_K_M.gguf \ + --host 0.0.0.0 --port 8081 \ + --n-gpu-layers 999 --ctx-size 8192 \ + --threads 10 --parallel 2 --mlock \ + >> ~/llama.log 2>&1 +``` + +## 8. 風險與備案 + +| 風險 | 備案 | +|------|------| +| GGUF 下載失敗(HF gated) | 用 ollama pull + ollama export to GGUF | +| M5 Max Metal 不相容 | 改用 CPU only (`--n-gpu-layers 0`) | +| 31B 太大速度太慢 | 改用 26B-A4B (MoE, 更快) | +| scp 傳輸中斷 | 用 rsync --partial 續傳 | diff --git a/experiments/identity_clustering/results/exp_008/labels.json b/experiments/identity_clustering/results/exp_008/labels.json index 638d81e..156a275 100644 --- a/experiments/identity_clustering/results/exp_008/labels.json +++ b/experiments/identity_clustering/results/exp_008/labels.json @@ -7,8 +7,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_001", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 1, @@ -18,8 +22,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_002", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 2, @@ -29,8 +37,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_003", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 3, @@ -40,8 +52,12 @@ ], "dominant_speaker": null, "speaker_score": 
0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_004", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 4, @@ -51,8 +67,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_005", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 5, @@ -62,8 +82,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_006", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 6, @@ -73,8 +97,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_007", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 7, @@ -84,8 +112,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_008", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 8, @@ -95,8 +127,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_009", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 9, @@ -106,8 +142,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_010", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 10, @@ -117,8 +157,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_011", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 11, @@ -128,8 +172,12 @@ ], 
"dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_012", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 12, @@ -139,8 +187,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_013", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 13, @@ -150,8 +202,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_014", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 14, @@ -161,8 +217,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_015", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 15, @@ -172,8 +232,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_016", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 16, @@ -183,8 +247,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_017", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 17, @@ -194,8 +262,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_018", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 18, @@ -205,8 +277,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_019", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": 
"auto_temp" }, { "cluster_id": 19, @@ -216,8 +292,12 @@ ], "dominant_speaker": null, "speaker_score": 0, - "binding": null, - "binding_stage": null + "binding": { + "name": "Stranger_020", + "source": "auto_temp", + "trace_count": 1 + }, + "binding_stage": "auto_temp" }, { "cluster_id": 20, diff --git a/experiments/identity_clustering/results/exp_008/metrics.json b/experiments/identity_clustering/results/exp_008/metrics.json index f9dea5d..02bfd76 100644 --- a/experiments/identity_clustering/results/exp_008/metrics.json +++ b/experiments/identity_clustering/results/exp_008/metrics.json @@ -5,6 +5,6 @@ "stage2_clusters": 20, "stage2_unbound_clustered": 20, "total_clusters": 677, - "execution_time_s": 15.544250011444092, + "execution_time_s": 16.848892211914062, "coverage": 1.0 } \ No newline at end of file diff --git a/experiments/identity_clustering/runner_v2.py b/experiments/identity_clustering/runner_v2.py index 1e9e5d4..f20e9d2 100644 --- a/experiments/identity_clustering/runner_v2.py +++ b/experiments/identity_clustering/runner_v2.py @@ -406,6 +406,31 @@ def run_experiment(config: dict) -> dict: "dominant_speaker": next(iter(speaker_overlaps.get(t["trace_id"], {}).keys()), None) if t["trace_id"] in speaker_overlaps else None, }) + # --- Temp Identity: assign names to unbound clusters --- + temp_count = 0 + for label in all_labels: + if label.get("binding") is not None: + continue # already has known identity + tids = label.get("trace_ids", []) + if len(tids) < 1: + continue + + # Create temp identity for all unbound clusters (even singletons as "strangers") + if len(tids) >= 1: + temp_count += 1 + if len(tids) >= 2: + temp_name = f"Person_{temp_count:03d}" + else: + temp_name = f"Stranger_{temp_count:03d}" + label["binding"] = { + "name": temp_name, + "source": "auto_temp", + "trace_count": len(tids), + } + label["binding_stage"] = "auto_temp" + if temp_count > 0: + print(f" Temp identities created: {temp_count}") + # Metrics metrics = { "total_traces": 
len(traces), @@ -438,9 +463,10 @@ def run_experiment(config: dict) -> dict: if row: identity_id = row[0] else: + source = binding.get("source", "auto") cur2.execute( - f"INSERT INTO {SCHEMA}.identities (name, identity_type, source, status) VALUES (%s,'people','auto','pending') RETURNING id", - (identity_name,)) + f"INSERT INTO {SCHEMA}.identities (name, identity_type, source, status) VALUES (%s,'people',%s,'pending') RETURNING id", + (identity_name, source)) identity_id = cur2.fetchone()[0] # Bind all faces in each trace to the identity