ollama source for Momentry Core verification

This commit is contained in:
Accusys
2026-05-22 17:19:10 +08:00
commit 0b31ff9135
2020 changed files with 1413145 additions and 0 deletions

7
llm/llm_darwin.go Normal file
View File

@@ -0,0 +1,7 @@
package llm
import (
"syscall"
)
// LlamaServerSysProcAttr is the set of OS-level process attributes for the
// llama server process. On this platform no attributes are set, so it points
// at a zero-valued syscall.SysProcAttr.
var LlamaServerSysProcAttr = new(syscall.SysProcAttr)

7
llm/llm_linux.go Normal file
View File

@@ -0,0 +1,7 @@
package llm
import (
"syscall"
)
// LlamaServerSysProcAttr is the set of OS-level process attributes for the
// llama server process. On this platform no attributes are set, so it points
// at a zero-valued syscall.SysProcAttr.
var LlamaServerSysProcAttr = new(syscall.SysProcAttr)

23
llm/llm_windows.go Normal file
View File

@@ -0,0 +1,23 @@
package llm
import (
"syscall"
)
// Process creation flags that are OR-ed together into
// LlamaServerSysProcAttr.CreationFlags in this file. The names mirror the
// Win32 constants, which is why they are not in Go MixedCaps form.
const (
	// CREATE_DEFAULT_ERROR_MODE enables the default error handling so that,
	// for example, a missing DLL produces a dialog instead of a silent exit.
	CREATE_DEFAULT_ERROR_MODE = 0x04000000
	// ABOVE_NORMAL_PRIORITY_CLASS raises the process priority so it is not
	// starved of CPU when running as a background service.
	ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
	// CREATE_NO_WINDOW: NOTE(review) per the Win32 process creation flags
	// documentation this runs a console program without a console window;
	// confirm against the CreateProcess reference.
	CREATE_NO_WINDOW = 0x08000000
)
// LlamaServerSysProcAttr is the set of Windows process attributes for the
// llama server process. Only CreationFlags is populated.
var LlamaServerSysProcAttr = &syscall.SysProcAttr{
	// Wire up the default error handling logic. If for some reason a DLL is
	// missing in the path, this will pop up a GUI dialog explaining the fault
	// so the user can either fix their PATH or report a bug. Without this
	// setting, the process exits immediately with a generic exit status but no
	// way to (easily) figure out what the actual missing DLL was.
	//
	// Setting Above Normal priority class ensures that when running as a
	// "background service" with "programs" given best priority, we aren't
	// starved of CPU cycles.
	CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
}

1951
llm/server.go Normal file

File diff suppressed because it is too large Load Diff

281
llm/server_test.go Normal file
View File

@@ -0,0 +1,281 @@
package llm
import (
"context"
"errors"
"fmt"
"strings"
"testing"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
"golang.org/x/sync/semaphore"
)
// TestLLMServerFitGPU is a table-driven test of ollamaServer.createLayout.
//
// Each case lists the per-layer weight sizes, the GPUs that are available
// (with their free memory), the NumGPU option and whether a full load is
// required. It then states either the expected assignment of layers to
// devices or the sentinel error that createLayout must report.
func TestLLMServerFitGPU(t *testing.T) {
	// minMemory is added on top of the usable free memory of (almost) every
	// GPU below. NOTE(review): presumably this matches the per-GPU overhead
	// that createLayout reserves; confirm against server.go.
	minMemory := 457 * format.MebiByte

	tests := []struct {
		name        string
		gpus        []ml.DeviceInfo
		layers      []int
		numGPU      int
		requireFull bool
		expected    ml.GPULayersList
		expectedErr error
	}{
		{
			name:        "No GPU",
			layers:      []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:      -1,
			expected:    ml.GPULayersList{},
			requireFull: true, // Should not try to evict even though we can't load any layers
		},
		{
			name:     "Full single GPU",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
		},
		{
			name:     "Partial single GPU",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
		},
		{
			name:     "Single GPU with numGPU 1",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
		},
		{
			name:     "Single GPU with numGPU 0",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   0,
			expected: ml.GPULayersList{},
		},
		{
			name:     "Single GPU with numGPU 999",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:   999,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
		},
		{
			name:     "Multi GPU fits on one",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
		},
		{
			name:     "Multi GPU split",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
		},
		{
			name:     "Multi GPU partial",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
		},
		{
			name:     "Multi GPU numGPU 1",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
		},
		{
			name:     "Multi GPU numGPU 2",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   2,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
		},
		{
			name:     "Multi GPU numGPU 999",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   999,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
		},
		{
			name:     "Multi GPU different libraries",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
		},
		{
			name:        "requireFull",
			gpus:        []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:      -1,
			requireFull: true,
			expectedErr: ErrLoadRequiredFull,
		},
		{
			name:        "requireFull numGPU",
			gpus:        []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(256 * format.MebiByte)}},
			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:      4,
			requireFull: true,
			expectedErr: ErrLoadRequiredFull,
		},
		{
			name:     "iGPU",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
		},
		{
			name:     "iGPU + dGPU",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
		},
		{
			name:     "iGPU + dGPU fits on one",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1}}},
		},
		{
			name:     "iGPU + dGPU partial",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
		},
		{
			name:     "iGPU + dGPU numGPU 1",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:   1,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
		},
		{
			name:     "iGPU + dGPU numGPU 999",
			gpus:     []ml.DeviceInfo{{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: uint64(128*format.MebiByte + minMemory)}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Integrated: true, FreeMemory: uint64(256*format.MebiByte + minMemory)}},
			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:   999,
			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1, 2, 3}}},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// The same system memory figures are used for every case.
			var systemInfo ml.SystemInfo
			systemInfo.TotalMemory = format.GibiByte
			systemInfo.FreeMemory = 512 * format.MebiByte
			systemInfo.FreeSwap = 256 * format.MebiByte

			s := &ollamaServer{
				llmServer: llmServer{
					totalLayers: uint64(len(tt.layers)),
					options: api.Options{
						Runner: api.Runner{
							NumGPU: tt.numGPU,
						},
					},
				},
			}

			// Layer sizes are recorded as CPU weights only; every GPU starts
			// with zeroed per-layer weight and cache tables.
			s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
				Weights: make([]uint64, s.totalLayers),
				Cache:   make([]uint64, s.totalLayers),
			}, GPUs: make([]ml.DeviceMemory, len(tt.gpus))}

			for i := range tt.layers {
				s.mem.CPU.Weights[i] = uint64(tt.layers[i])
			}

			for i := range s.mem.GPUs {
				s.mem.GPUs[i].DeviceID = tt.gpus[i].DeviceID
				s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
				s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
			}

			gpuLayers, err := s.createLayout(systemInfo, tt.gpus, s.mem, tt.requireFull, 0)
			// errors.Is keeps the check valid if the sentinel is ever wrapped
			// with additional context; it also treats nil/nil as a match.
			if !errors.Is(err, tt.expectedErr) {
				t.Fatalf("createLayout returned error %v, want %v", err, tt.expectedErr)
			}
			if gpuLayers.Hash() != tt.expected.Hash() {
				t.Errorf("createLayout assigned %v, want %v", gpuLayers, tt.expected)
			}
		})
	}
}
// TestLLMServerCompletionFormat checks which values of CompletionRequest.Format
// are rejected by Completion with an "invalid format" error and which ones are
// let through. Accepted formats are detected by cancelling the context first
// and expecting context.Canceled back.
func TestLLMServerCompletionFormat(t *testing.T) {
	// This test was written to fix an already deployed issue. It is a bit
	// of a mess, but it is good enough until the Completion method can be
	// refactored to be more testable.
	ctx, cancel := context.WithCancel(t.Context())
	srv := &llmServer{
		sem: semaphore.NewWeighted(1), // required to prevent nil panic
	}

	// complete sends a request that carries nothing but the given format.
	complete := func(format []byte) error {
		return srv.Completion(ctx, CompletionRequest{
			Options: new(api.Options),
			Format:  format,
		}, nil)
	}

	rejected := []string{
		"X",   // invalid format
		`"X"`, // invalid JSON Schema
	}
	for _, format := range rejected {
		err := complete([]byte(format))
		want := fmt.Sprintf("invalid format: %q; expected \"json\" or a valid JSON Schema", format)
		if err == nil || !strings.Contains(err.Error(), want) {
			t.Fatalf("err = %v; want %q", err, want)
		}
	}

	cancel() // prevent further processing if request makes it past the format check

	accepted := [][]byte{
		// "missing"
		[]byte(``),
		[]byte(`""`),
		[]byte(`null`),

		// JSON
		[]byte(`"json"`),
		[]byte(`{"type":"object"}`),

		nil, // missing format
	}
	for _, format := range accepted {
		if err := complete(format); !errors.Is(err, context.Canceled) {
			t.Fatalf("Completion: err = %v; expected context.Canceled", err)
		}
	}
}

31
llm/server_wait_test.go Normal file
View File

@@ -0,0 +1,31 @@
package llm
import (
"context"
"strings"
"testing"
)
// TestWaitUntilRunningUsesStatusMessageWhenDoneErrIsNil verifies that, when
// the server's done channel is already closed and only the StatusWriter holds
// an error message, WaitUntilRunning returns an error that contains that
// message and is not the result of wrapping a nil error.
func TestWaitUntilRunningUsesStatusMessageWhenDoneErrIsNil(t *testing.T) {
	exited := make(chan struct{})
	close(exited)

	sw := new(StatusWriter)
	sw.SetLastError("llama_init_from_model: failed to initialize the context: failed to initialize Metal backend")

	srv := &llmServer{done: exited, status: sw}

	err := srv.WaitUntilRunning(context.Background())
	switch {
	case err == nil:
		t.Fatal("expected error")
	case strings.Contains(err.Error(), "%!w(<nil>)"):
		// This text is what fmt produces when %w is given a nil error.
		t.Fatalf("unexpected wrapped nil error: %q", err)
	case !strings.Contains(err.Error(), srv.status.LastError()):
		t.Fatalf("error %q does not include status message %q", err, srv.status.LastError())
	}
}

107
llm/status.go Normal file
View File

@@ -0,0 +1,107 @@
package llm
import (
"bytes"
"io"
"strings"
"sync/atomic"
)
// StatusWriter is an io.Writer that passes everything through to an
// underlying writer while capturing error messages from the llama runner
// process.
//
// A nil *StatusWriter and the zero value are both usable: with no underlying
// writer the output is discarded, and the zero value still captures errors.
type StatusWriter struct {
	// out receives every byte slice handed to Write, unmodified.
	out io.Writer

	// StartRunner wires both Stdout and Stderr to the same StatusWriter, and
	// os/exec serializes Write calls in that case.
	//
	// lastErrMsg only ever holds a string.
	lastErrMsg atomic.Value
}

// maxCapturedErrorBytes bounds the accumulated error text kept by AppendError.
// When the limit is exceeded the oldest text is dropped.
const maxCapturedErrorBytes = 8 * 1024

// NewStatusWriter returns a StatusWriter that forwards all writes to out.
func NewStatusWriter(out io.Writer) *StatusWriter {
	return &StatusWriter{
		out: out,
	}
}

// LastError returns the captured error text. It returns "" when nothing has
// been captured or when w is nil.
func (w *StatusWriter) LastError() string {
	if w == nil {
		return ""
	}

	if v := w.lastErrMsg.Load(); v != nil {
		return v.(string)
	}

	return ""
}

// SetLastError replaces the captured error text with msg. It is a no-op on a
// nil receiver.
func (w *StatusWriter) SetLastError(msg string) {
	if w == nil {
		return
	}

	w.lastErrMsg.Store(msg)
}

// AppendError adds msg as a new line after any previously captured text. It
// is a no-op when w is nil or msg is empty.
//
// If the combined text exceeds maxCapturedErrorBytes, only the trailing
// maxCapturedErrorBytes are kept, and the (likely partial) first line of that
// tail is discarded so the result starts on a line boundary.
func (w *StatusWriter) AppendError(msg string) {
	if w == nil || msg == "" {
		return
	}

	if current := w.LastError(); current != "" {
		msg = current + "\n" + msg
	}

	if len(msg) > maxCapturedErrorBytes {
		msg = msg[len(msg)-maxCapturedErrorBytes:]
		if i := strings.IndexByte(msg, '\n'); i >= 0 {
			msg = msg[i+1:]
		}
	}

	w.SetLastError(msg)
}

// TODO - regex matching to detect errors like
// libcublasLt.so.11: cannot open shared object file: No such file or directory
// TODO - if we later see error lines split across multiple Write calls in real
// logs, add a small rolling buffer here to capture those fragments.

// errorPrefixes lists the substrings that mark the start of an error message
// in the runner's output. Matching is case-sensitive.
var errorPrefixes = []string{
	"mlx:",
	"MLX:",
	"panic:",
	"fatal error:",
	"error:",
	"Error:",
	"CUDA error",
	"ROCm error",
	"cudaMalloc failed",
	"\"ERR\"",
	"error loading model",
	"GGML_ASSERT",
	"Deepseek2 does not support K-shift",
	"signal arrived during cgo execution",
	"llama_init_from_model:",
}

// firstErrorLine returns the error message found in b, or "" if b contains
// none of errorPrefixes. The message starts at the earliest occurrence of any
// prefix and runs to the end of that line, with trailing spaces, tabs and
// carriage returns removed from the text that follows the prefix.
func firstErrorLine(b []byte) string {
	start, prefixLen := -1, 0
	for _, prefix := range errorPrefixes {
		if i := bytes.Index(b, []byte(prefix)); i >= 0 && (start < 0 || i < start) {
			start, prefixLen = i, len(prefix)
		}
	}
	if start < 0 {
		return ""
	}

	rest := b[start+prefixLen:]
	if j := bytes.IndexByte(rest, '\n'); j >= 0 {
		rest = rest[:j]
	}

	return string(b[start:start+prefixLen]) + string(bytes.TrimRight(rest, " \t\r"))
}

// Write implements io.Writer. It records the first error message found in b
// (if any) via AppendError and then forwards b unchanged to the underlying
// writer, returning that writer's result.
//
// When w is nil or has no underlying writer, b is discarded and Write reports
// len(b), nil instead of panicking, matching the nil-safety of the other
// methods.
func (w *StatusWriter) Write(b []byte) (int, error) {
	if msg := firstErrorLine(b); msg != "" {
		w.AppendError(msg)
	}

	if w == nil || w.out == nil {
		return len(b), nil
	}

	return w.out.Write(b)
}

68
llm/status_test.go Normal file
View File

@@ -0,0 +1,68 @@
package llm
import (
"io"
"testing"
)
// TestStatusWriterCapturesErrorLine feeds single log chunks to a fresh
// StatusWriter and checks that LastError returns exactly the error line,
// without its trailing newline and without any surrounding log lines.
func TestStatusWriterCapturesErrorLine(t *testing.T) {
	// A multi-line chunk in which the panic header is neither the first line
	// nor the only line containing a known prefix.
	const panicLog = "time=2026-05-01T15:36:45.053Z level=INFO source=pipeline.go:71 msg=\"peak memory\" size=\"8.26 GiB\"\n" +
		"panic: mlx: Failed to compile kernel: nvrtc: error: invalid value for --gpu-architecture (-arch)\n" +
		"\t. at /go/src/github.com/ollama/ollama/build/_deps/mlx-c-src/mlx/c/transforms.cpp:15\n\n" +
		"goroutine 31 [running]:\n" +
		"golang.org/x/sync/errgroup.(*Group).Go.func1()\n" +
		"\tgolang.org/x/sync@v0.17.0/errgroup/errgroup.go:93 +0x50\n"

	type testCase struct {
		name string
		log  string
		want string
	}
	cases := []testCase{
		{
			name: "llama init",
			log:  "llama_init_from_model: failed to initialize the context: failed to initialize Metal backend\n",
			want: "llama_init_from_model: failed to initialize the context: failed to initialize Metal backend",
		},
		{
			name: "cobra error",
			log:  "Error: foo baz bar\n",
			want: "Error: foo baz bar",
		},
		{
			name: "uppercase mlx",
			log:  "MLX: there was an error\n",
			want: "MLX: there was an error",
		},
		{
			name: "panic header",
			log:  panicLog,
			want: "panic: mlx: Failed to compile kernel: nvrtc: error: invalid value for --gpu-architecture (-arch)",
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			sw := NewStatusWriter(io.Discard)
			_, err := sw.Write([]byte(tc.log))
			if err != nil {
				t.Fatal(err)
			}

			got := sw.LastError()
			if got != tc.want {
				t.Fatalf("LastError = %q, want %q", got, tc.want)
			}
		})
	}
}
// TestStatusWriterAccumulatesErrorLines checks that error lines arriving in
// separate Write calls are kept together, newline-separated and in arrival
// order, rather than the later one replacing the earlier one.
func TestStatusWriterAccumulatesErrorLines(t *testing.T) {
	sw := NewStatusWriter(io.Discard)

	chunks := []string{
		"error: failed to initialize the Metal library\n",
		"GGML_ASSERT([rsets->data count] == 0) failed\n",
	}
	for _, chunk := range chunks {
		if _, err := sw.Write([]byte(chunk)); err != nil {
			t.Fatal(err)
		}
	}

	const want = "error: failed to initialize the Metal library\nGGML_ASSERT([rsets->data count] == 0) failed"
	got := sw.LastError()
	if got != want {
		t.Fatalf("LastError = %q, want %q", got, want)
	}
}