deploy: Gemma 4 31B llama-server running on M5 Max (192.168.110.201:8081)
This commit is contained in:
159
docs_v1.0/API_V1.0.0/DEPLOY/GEM4_LLM_DEPLOY_PLAN_V1.0.0.md
Normal file
159
docs_v1.0/API_V1.0.0/DEPLOY/GEM4_LLM_DEPLOY_PLAN_V1.0.0.md
Normal file
@@ -0,0 +1,159 @@
|
||||
---
|
||||
document_type: "deployment_plan"
|
||||
service: "MOMENTRY_CORE"
|
||||
title: "Gemma 4 LLM 部署計劃 — M5 Max MacBook Pro"
|
||||
date: "2026-05-06"
|
||||
version: "V1.0"
|
||||
status: "draft"
|
||||
owner: "Warren"
|
||||
created_by: "OpenCode"
|
||||
---
|
||||
|
||||
# Gemma 4 LLM 部署計劃 — M5 Max
|
||||
|
||||
## 1. 環境
|
||||
|
||||
| 項目 | 規格 |
|
||||
|------|------|
|
||||
| 機型 | MacBook Pro M5 Max |
|
||||
| 統一記憶體 | 48 GB |
|
||||
| 架構 | arm64 (Apple Silicon) |
|
||||
| SSH | `accusys@10.10.10.10` |
|
||||
| 外網 | ❌ 無(需透過本機 scp) |
|
||||
| 本機 | M4 Mac,有外網,已有 llama.cpp |
|
||||
|
||||
## 2. 模型選擇
|
||||
|
||||
| 版本 | 參數 | Q5_K_M 大小 | 預估速度 | 備註 |
|
||||
|------|------|------------|---------|------|
|
||||
| **Gemma 4 31B-it** | 33B | ~20 GB | 15-25 tok/s | 多模態,可處理圖像 |
|
||||
| Gemma 4 26B-A4B-it | 27B MoE | ~15 GB | 25-40 tok/s | MoE,更快 |
|
||||
| Gemma 4 E4B-it | 8B | ~5 GB | 60+ tok/s | 最快,品質較低 |
|
||||
|
||||
**推薦**: Gemma 4 31B-it (Q5_K_M)。48GB 記憶體綽綽有餘。
|
||||
|
||||
## 3. 部署步驟
|
||||
|
||||
### Step 1: 本機下載模型
|
||||
|
||||
```bash
|
||||
# 登入 HuggingFace(需 access token)
|
||||
huggingface-cli login
|
||||
|
||||
# 下載 Gemma 4 31B GGUF
|
||||
huggingface-cli download bartowski/gemma-4-31B-it-GGUF \
|
||||
gemma-4-31b-it-Q5_K_M.gguf \
|
||||
--local-dir ~/llama.cpp/models/
|
||||
```
|
||||
|
||||
### Step 2: 準備 llama.cpp binary
|
||||
|
||||
```bash
|
||||
# llama.cpp 已安裝於本機 /opt/homebrew/bin/llama-server
|
||||
# 收集依賴的 dylib
|
||||
mkdir -p /tmp/llama_bundle/bin /tmp/llama_bundle/lib
|
||||
cp /opt/homebrew/bin/llama-server /tmp/llama_bundle/bin/
|
||||
cp /opt/homebrew/lib/libggml*.dylib /tmp/llama_bundle/lib/
|
||||
cp /opt/homebrew/lib/libllama*.dylib /tmp/llama_bundle/lib/
|
||||
```
|
||||
|
||||
### Step 3: scp 到 M5 Max
|
||||
|
||||
```bash
|
||||
# 傳送 binary
|
||||
scp -r /tmp/llama_bundle/* accusys@10.10.10.10:~/bin/
|
||||
|
||||
# 傳送模型
|
||||
scp ~/llama.cpp/models/gemma-4-31b-it-Q5_K_M.gguf \
|
||||
accusys@10.10.10.10:~/models/
|
||||
|
||||
# 替代方案:改用 rsync 傳送整個模型目錄(取代上方 scp;若傳輸中斷需續傳,請加上 --partial)
|
||||
rsync -avz --progress ~/llama.cpp/models/ accusys@10.10.10.10:~/models/
|
||||
```
|
||||
|
||||
### Step 4: M5 Max 上啟動
|
||||
|
||||
```bash
|
||||
ssh accusys@10.10.10.10
|
||||
|
||||
# 設定 library path
|
||||
export DYLD_LIBRARY_PATH=~/bin/lib:$DYLD_LIBRARY_PATH
|
||||
|
||||
# 啟動 llama-server
|
||||
~/bin/bin/llama-server \
|
||||
-m ~/models/gemma-4-31b-it-Q5_K_M.gguf \
|
||||
--host 0.0.0.0 \
|
||||
--port 8081 \
|
||||
--n-gpu-layers 999 \
|
||||
--ctx-size 8192 \
|
||||
--threads 10 \
|
||||
--parallel 2 \
|
||||
--mlock
|
||||
```
|
||||
|
||||
## 4. 記憶體分配
|
||||
|
||||
```
|
||||
48 GB total
|
||||
├─ 20 GB Gemma 4 31B Q5_K_M
|
||||
├─ 4 GB PostgreSQL
|
||||
├─ 1 GB Redis
|
||||
├─ 1 GB MongoDB + Qdrant
|
||||
├─ 2 GB swift_face / face_processor (burst)
|
||||
├─ 3 GB llama-server overhead
|
||||
└─ 17 GB 剩餘 (OS + buffer)
|
||||
```
|
||||
|
||||
## 5. Momentry 整合
|
||||
|
||||
更新 `.env` 或 config:
|
||||
|
||||
```bash
|
||||
MOMENTRY_LLM_ENDPOINT=http://10.10.10.10:8081/v1
|
||||
MOMENTRY_LLM_MODEL=gemma-4-31b-it
|
||||
```
|
||||
|
||||
Agent 端點改用 LLM:
|
||||
- `POST /api/v1/agents/translate` → llama.cpp server
|
||||
- `POST /api/v1/agents/identity/suggest` → llama.cpp server
|
||||
- `POST /api/v1/agents/5w1h/analyze` → llama.cpp server
|
||||
- `POST /api/v1/agents/suggest/merge` → llama.cpp server
|
||||
|
||||
## 6. 測試驗證
|
||||
|
||||
```bash
|
||||
# Health check
|
||||
curl http://10.10.10.10:8081/health
|
||||
|
||||
# Inference test
|
||||
curl http://10.10.10.10:8081/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "gemma-4-31b-it",
|
||||
"messages": [{"role": "user", "content": "Hello"}],
|
||||
"max_tokens": 100
|
||||
}'
|
||||
```
|
||||
|
||||
## 7. 啟動腳本(M5 Max 上)
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# ~/start_llm.sh
|
||||
export DYLD_LIBRARY_PATH=~/bin/lib:$DYLD_LIBRARY_PATH
|
||||
exec ~/bin/bin/llama-server \
|
||||
-m ~/models/gemma-4-31b-it-Q5_K_M.gguf \
|
||||
--host 0.0.0.0 --port 8081 \
|
||||
--n-gpu-layers 999 --ctx-size 8192 \
|
||||
--threads 10 --parallel 2 --mlock \
|
||||
>> ~/llama.log 2>&1
|
||||
```
|
||||
|
||||
## 8. 風險與備案
|
||||
|
||||
| 風險 | 備案 |
|
||||
|------|------|
|
||||
| GGUF 下載失敗(HF gated) | 改用 ollama pull,再從 `~/.ollama/models/blobs/` 取出 GGUF blob(ollama CLI 沒有 export 指令;相容性待驗證) |
|
||||
| M5 Max Metal 不相容 | 改用 CPU only (`--n-gpu-layers 0`) |
|
||||
| 31B 太大速度太慢 | 改用 26B-A4B (MoE, 更快) |
|
||||
| scp 傳輸中斷 | 用 rsync --partial 續傳 |
|
||||
@@ -7,8 +7,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_001",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 1,
|
||||
@@ -18,8 +22,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_002",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 2,
|
||||
@@ -29,8 +37,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_003",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 3,
|
||||
@@ -40,8 +52,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_004",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 4,
|
||||
@@ -51,8 +67,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_005",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 5,
|
||||
@@ -62,8 +82,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_006",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 6,
|
||||
@@ -73,8 +97,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_007",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 7,
|
||||
@@ -84,8 +112,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_008",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 8,
|
||||
@@ -95,8 +127,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_009",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 9,
|
||||
@@ -106,8 +142,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_010",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 10,
|
||||
@@ -117,8 +157,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_011",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 11,
|
||||
@@ -128,8 +172,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_012",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 12,
|
||||
@@ -139,8 +187,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_013",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 13,
|
||||
@@ -150,8 +202,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_014",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 14,
|
||||
@@ -161,8 +217,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_015",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 15,
|
||||
@@ -172,8 +232,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_016",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 16,
|
||||
@@ -183,8 +247,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_017",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 17,
|
||||
@@ -194,8 +262,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_018",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 18,
|
||||
@@ -205,8 +277,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_019",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 19,
|
||||
@@ -216,8 +292,12 @@
|
||||
],
|
||||
"dominant_speaker": null,
|
||||
"speaker_score": 0,
|
||||
"binding": null,
|
||||
"binding_stage": null
|
||||
"binding": {
|
||||
"name": "Stranger_020",
|
||||
"source": "auto_temp",
|
||||
"trace_count": 1
|
||||
},
|
||||
"binding_stage": "auto_temp"
|
||||
},
|
||||
{
|
||||
"cluster_id": 20,
|
||||
|
||||
@@ -5,6 +5,6 @@
|
||||
"stage2_clusters": 20,
|
||||
"stage2_unbound_clustered": 20,
|
||||
"total_clusters": 677,
|
||||
"execution_time_s": 15.544250011444092,
|
||||
"execution_time_s": 16.848892211914062,
|
||||
"coverage": 1.0
|
||||
}
|
||||
@@ -406,6 +406,31 @@ def run_experiment(config: dict) -> dict:
|
||||
"dominant_speaker": next(iter(speaker_overlaps.get(t["trace_id"], {}).keys()), None) if t["trace_id"] in speaker_overlaps else None,
|
||||
})
|
||||
|
||||
# --- Temp Identity: assign names to unbound clusters ---
|
||||
temp_count = 0
|
||||
for label in all_labels:
|
||||
if label.get("binding") is not None:
|
||||
continue # already has known identity
|
||||
tids = label.get("trace_ids", [])
|
||||
if len(tids) < 1:
|
||||
continue
|
||||
|
||||
# Create temp identity for all unbound clusters (even singletons as "strangers")
|
||||
if len(tids) >= 1:
|
||||
temp_count += 1
|
||||
if len(tids) >= 2:
|
||||
temp_name = f"Person_{temp_count:03d}"
|
||||
else:
|
||||
temp_name = f"Stranger_{temp_count:03d}"
|
||||
label["binding"] = {
|
||||
"name": temp_name,
|
||||
"source": "auto_temp",
|
||||
"trace_count": len(tids),
|
||||
}
|
||||
label["binding_stage"] = "auto_temp"
|
||||
if temp_count > 0:
|
||||
print(f" Temp identities created: {temp_count}")
|
||||
|
||||
# Metrics
|
||||
metrics = {
|
||||
"total_traces": len(traces),
|
||||
@@ -438,9 +463,10 @@ def run_experiment(config: dict) -> dict:
|
||||
if row:
|
||||
identity_id = row[0]
|
||||
else:
|
||||
source = binding.get("source", "auto")
|
||||
cur2.execute(
|
||||
f"INSERT INTO {SCHEMA}.identities (name, identity_type, source, status) VALUES (%s,'people','auto','pending') RETURNING id",
|
||||
(identity_name,))
|
||||
f"INSERT INTO {SCHEMA}.identities (name, identity_type, source, status) VALUES (%s,'people',%s,'pending') RETURNING id",
|
||||
(identity_name, source))
|
||||
identity_id = cur2.fetchone()[0]
|
||||
|
||||
# Bind all faces in each trace to the identity
|
||||
|
||||
Reference in New Issue
Block a user