ollama source for Momentry Core verification

2026-05-22 17:19:10 +08:00
commit 0b31ff9135
2020 changed files with 1413145 additions and 0 deletions
--- a/ml/nn/attention.go
+++ b/ml/nn/attention.go
@@ -0,0 +1,84 @@
+package nn
+
+import (
+	"fmt"
+
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+)
+
+// Attention implements scaled dot-product attention for transformer models:
+// Attention(Q, K, V) = softmax(QK^T/√d_k)V
+//
+// Parameters:
+//   - ctx: Context for tensor operations
+//   - query: Query tensor (Q) with shape [d_k, heads, seq_len_q]
+//   - key: Key tensor (K) with shape [d_k, kv_heads, seq_len_k], can be nil to read from cache only
+//   - value: Value tensor (V) with shape [d_v, kv_heads, seq_len_k], can be nil to read from cache only
+//   - scale: Scaling factor, typically 1/√d_k where d_k is the key dimension
+//   - cache: KV cache to store key/value and get past history, can be nil to only use provided key/value
+//
+// Returns:
+//
+//	Attention output with shape [d_v, heads, seq_len_q]
+func Attention(ctx ml.Context, query, key, value ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
+	return AttentionWithVMLA(ctx, query, key, value, nil, nil, scale, cache)
+}
+
+func AttentionWithSinks(ctx ml.Context, query, key, value, sinks ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
+	return AttentionWithVMLA(ctx, query, key, value, sinks, nil, scale, cache)
+}
+
+// AttentionWithVMLA is the full attention entry point that Attention and
+// AttentionWithSinks delegate to.
+//
+// When key and value are both non-nil, their dimensions are checked against
+// each other and against query, they are passed to ctx.Forward and, if cache is
+// non-nil, stored in it. When cache is non-nil, the key, value and mask used for
+// the computation are then read back from the cache.
+//
+// If query implements ml.ScaledDotProductAttention, the computation is delegated
+// to that implementation together with sinks and vmla. Otherwise attention is
+// assembled from individual tensor operations; on that path sinks is not used
+// and vmla, when non-nil, is multiplied with the attention output.
+//
+// It panics if the key/value dimensions are inconsistent, or if cache is nil
+// and key or value is missing.
+func AttentionWithVMLA(ctx ml.Context, query, key, value, sinks ml.Tensor, vmla ml.Tensor, scale float64, cache kvcache.Cache) ml.Tensor {
+	ctx.Forward(query)
+
+	switch {
+	case key != nil && value != nil:
+		if qDim, kDim := query.Dim(0), key.Dim(0); qDim != kDim {
+			panic(fmt.Errorf("d_k in attention operation does not match between query(%v) and key(%v)", qDim, kDim))
+		}
+
+		if kHeads, vHeads := key.Dim(1), value.Dim(1); kHeads != vHeads {
+			panic(fmt.Errorf("kv_heads in attention operation does not match between key(%v) and value(%v)", kHeads, vHeads))
+		}
+
+		if kLen, vLen := key.Dim(2), value.Dim(2); kLen != vLen {
+			panic(fmt.Errorf("seq_len_k in attention operation does not match between key(%v) and value(%v)", kLen, vLen))
+		}
+
+		ctx.Forward(key, value)
+		if cache != nil {
+			cache.Put(ctx, key, value)
+		}
+	case cache == nil:
+		// Without a cache there is nowhere else to get key/value from.
+		panic("key & value tensors must be provided if cache is nil")
+	}
+
+	// With a cache present, attention runs over what the cache returns rather
+	// than over the tensors passed in.
+	var mask ml.Tensor
+	if cache != nil {
+		key, value, mask = cache.Get(ctx)
+	}
+
+	if fused, ok := query.(ml.ScaledDotProductAttention); ok {
+		// The last argument reports whether a cache was involved.
+		return fused.ScaledDotProductAttention(ctx, key, value, mask, sinks, vmla, scale, cache != nil)
+	}
+
+	// Fallback assembled from individual tensor operations. sinks is not
+	// consulted on this path.
+	q := query.Permute(ctx, 0, 2, 1, 3)
+	k := key.Permute(ctx, 0, 2, 1, 3)
+	v := value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	// softmax(scale * QK^T + mask)
+	scores := k.MulmatFullPrec(ctx, q).Scale(ctx, scale)
+	if mask != nil {
+		scores = scores.Add(ctx, mask)
+	}
+	scores = scores.Softmax(ctx)
+
+	out := v.Mulmat(ctx, scores)
+	if vmla != nil {
+		out = vmla.Mulmat(ctx, out)
+	}
+
+	return out.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+}
--- a/ml/nn/convolution.go
+++ b/ml/nn/convolution.go
@@ -0,0 +1,30 @@
+package nn
+
+import "github.com/ollama/ollama/ml"
+
+// Conv2D is a 2D convolution layer. Its tensors are tagged for the GGUF entries
+// "weight" and "bias"; Bias may be nil.
+type Conv2D struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+
+// Forward applies Weight.Conv2D to t and, when Bias is set, adds the bias to
+// the result. s0, s1, p0, p1, d0 and d1 are handed to Conv2D unchanged
+// (presumably per-axis stride, padding and dilation; confirm against
+// ml.Tensor.Conv2D).
+func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
+	out := m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
+	if m.Bias == nil {
+		return out
+	}
+
+	// Bias shape is (out_channels,) while the output shape is
+	// (width, height, out_channels, batch), so reshape it before adding.
+	bias := m.Bias.Reshape(ctx, 1, 1, -1)
+	return out.Add(ctx, bias)
+}
+
+// Conv3D is a 3D convolution layer. Its tensors are tagged for the GGUF entries
+// "weight" and "bias"; Bias may be nil.
+type Conv3D struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+
+// Forward applies Weight.Conv3D to t and, when Bias is set, adds it to the
+// result as-is (no reshape). c and the s*, p*, d* arguments are handed to
+// Conv3D unchanged (presumably channel count, strides, paddings and dilations;
+// confirm against ml.Tensor.Conv3D).
+func (m *Conv3D) Forward(ctx ml.Context, t ml.Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) ml.Tensor {
+	out := m.Weight.Conv3D(ctx, t, c, s0, s1, s2, p0, p1, p2, d0, d1, d2)
+	if m.Bias == nil {
+		return out
+	}
+	return out.Add(ctx, m.Bias)
+}
--- a/ml/nn/embedding.go
+++ b/ml/nn/embedding.go
@@ -0,0 +1,11 @@
+package nn
+
+import "github.com/ollama/ollama/ml"
+
+// Embedding is a lookup table whose tensor is tagged for the GGUF entry
+// "weight".
+type Embedding struct {
+	Weight ml.Tensor `gguf:"weight"`
+}
+
+// Forward returns the result of Weight.Rows for hiddenState, i.e. the rows of
+// Weight selected by hiddenState (presumably a tensor of row indices such as
+// token IDs; confirm against callers).
+func (m *Embedding) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
+	rows := m.Weight.Rows(ctx, hiddenState)
+	return rows
+}
--- a/ml/nn/linear.go
+++ b/ml/nn/linear.go
@@ -0,0 +1,31 @@
+package nn
+
+import "github.com/ollama/ollama/ml"
+
+// Linear is a linear layer with an optional bias. Its tensors are tagged for
+// the GGUF entries "weight" and "bias"; Bias may be nil.
+type Linear struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+
+// Forward computes Weight.Mulmat(t) and adds Bias to the product when Bias is
+// non-nil.
+func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
+	out := m.Weight.Mulmat(ctx, t)
+	if m.Bias == nil {
+		return out
+	}
+
+	return out.Add(ctx, m.Bias)
+}
+
+// LinearBatch is the index-driven counterpart of Linear: both the product and
+// the bias addition take an extra indices tensor. Its tensors are tagged for
+// the GGUF entries "weight" and "bias"; Bias may be nil.
+type LinearBatch struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+
+// Forward computes Weight.MulmatID(t, indices) and, when Bias is non-nil, adds
+// it with AddID using the same indices (presumably indices pick which stacked
+// weight/bias applies to each input; confirm against ml.Tensor.MulmatID).
+func (m *LinearBatch) Forward(ctx ml.Context, t, indices ml.Tensor) ml.Tensor {
+	out := m.Weight.MulmatID(ctx, t, indices)
+	if m.Bias == nil {
+		return out
+	}
+
+	return out.AddID(ctx, m.Bias, indices)
+}
--- a/ml/nn/normalization.go
+++ b/ml/nn/normalization.go
@@ -0,0 +1,22 @@
+package nn
+
+import (
+	"github.com/ollama/ollama/ml"
+)
+
+// LayerNorm holds the parameters for layer normalization. Its tensors are
+// tagged for the GGUF entries "weight" and "bias".
+type LayerNorm struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+
+// Forward returns t.LayerNorm evaluated with this layer's Weight and Bias and
+// the supplied eps.
+func (m *LayerNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
+	weight, bias := m.Weight, m.Bias
+	return t.LayerNorm(ctx, weight, bias, eps)
+}
+
+// RMSNorm holds the parameter for RMS normalization. Its tensor is tagged for
+// the GGUF entry "weight".
+type RMSNorm struct {
+	Weight ml.Tensor `gguf:"weight"`
+}
+
+// Forward returns t.RMSNorm evaluated with this layer's Weight and the supplied
+// eps.
+func (m *RMSNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
+	weight := m.Weight
+	return t.RMSNorm(ctx, weight, eps)
+}
--- a/ml/nn/pooling/pooling.go
+++ b/ml/nn/pooling/pooling.go
@@ -0,0 +1,41 @@
+package pooling
+
+import (
+	"github.com/ollama/ollama/ml"
+)
+
+// Type selects how hidden states are pooled.
+type Type uint32
+
+const (
+	// TypeNone is the zero value and means no pooling was selected.
+	TypeNone Type = iota
+	// TypeMean selects mean pooling.
+	TypeMean
+	// TypeCLS selects the first position.
+	TypeCLS
+	// TypeLast selects the last position.
+	TypeLast
+)
+
+// String returns a human-readable name for t: "None", "Mean", "CLS" or "Last"
+// for the declared constants, and "Unknown" for any other value.
+func (t Type) String() string {
+	switch t {
+	case TypeNone:
+		// TypeNone is a declared value, so it gets its own name instead of
+		// being reported as "Unknown" like a genuinely invalid Type.
+		return "None"
+	case TypeMean:
+		return "Mean"
+	case TypeCLS:
+		return "CLS"
+	case TypeLast:
+		return "Last"
+	default:
+		return "Unknown"
+	}
+}
+
+// Forward pools hiddenStates according to t:
+//   - TypeCLS returns the slice [0, 1) along dimension 1
+//   - TypeLast returns the slice [Dim(1)-1, Dim(1)) along dimension 1
+//   - TypeMean swaps the first two dimensions, takes the Mean, and swaps them
+//     back
+//
+// It panics with "unknown pooling type" for any other value, including
+// TypeNone.
+func (t Type) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
+	switch t {
+	case TypeCLS:
+		return hiddenStates.Slice(ctx, 1, 0, 1, 1)
+	case TypeLast:
+		n := hiddenStates.Dim(1)
+		return hiddenStates.Slice(ctx, 1, n-1, n, 1)
+	case TypeMean:
+		// Mean is applied after swapping dimensions 0 and 1; the swap is
+		// undone afterwards so the caller gets the original layout back.
+		swapped := hiddenStates.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+		averaged := swapped.Mean(ctx)
+		return averaged.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+	}
+
+	panic("unknown pooling type")
+}
--- a/ml/nn/pooling/pooling_test.go
+++ b/ml/nn/pooling/pooling_test.go
@@ -0,0 +1,64 @@
+package pooling_test
+
+import (
+	"bytes"
+	"os"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	fsggml "github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/backend/ggml"
+	"github.com/ollama/ollama/ml/nn/pooling"
+)
+
+// setup writes a minimal GGUF file (architecture "test", one block, a single
+// one-element tensor) into tb's temporary directory and opens it as a ggml
+// backend with memory allocation enabled. Any failure aborts the test via
+// tb.Fatal. The n parameter is not used by the body.
+func setup(tb testing.TB, n int) ml.Backend {
+	tb.Helper()
+
+	file, err := os.CreateTemp(tb.TempDir(), "*.bin")
+	if err != nil {
+		tb.Fatal(err)
+	}
+	defer file.Close()
+
+	metadata := fsggml.KV{
+		"general.architecture": "test",
+		"test.block_count":     uint32(1),
+	}
+	tensors := []*fsggml.Tensor{
+		{Name: "blk.0.weight", Shape: []uint64{1}, WriterTo: bytes.NewBuffer(make([]byte, 4))},
+	}
+	if err := fsggml.WriteGGUF(file, metadata, tensors); err != nil {
+		tb.Fatal(err)
+	}
+
+	backend, err := ggml.New(file.Name(), ml.BackendParams{AllocMemory: true})
+	if err != nil {
+		tb.Fatal(err)
+	}
+
+	return backend
+}
+
+// TestForward runs every pooling type over the values 0..15 reshaped to 8x2
+// and compares the computed floats with the expected output for that type.
+func TestForward(t *testing.T) {
+	expected := map[pooling.Type][]float32{
+		pooling.TypeMean: {4, 5, 6, 7, 8, 9, 10, 11},
+		pooling.TypeCLS:  {0, 1, 2, 3, 4, 5, 6, 7},
+		pooling.TypeLast: {8, 9, 10, 11, 12, 13, 14, 15},
+	}
+	for poolType, want := range expected {
+		name := poolType.String()
+		t.Run(name, func(t *testing.T) {
+			backend := setup(t, 99)
+			defer backend.Close()
+
+			ctx := backend.NewContext()
+			defer ctx.Close()
+
+			input := ctx.Input().Arange(0, 16, 1, ml.DTypeF32).Reshape(ctx, 8, 2)
+			pooled := poolType.Forward(ctx, input)
+
+			ctx.Forward(pooled).Compute(pooled)
+			got := pooled.Floats()
+			if diff := cmp.Diff(want, got); diff != "" {
+				t.Error(diff)
+			}
+		})
+	}
+}
--- a/ml/nn/rope.go
+++ b/ml/nn/rope.go
@@ -0,0 +1,20 @@
+package nn
+
+import (
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn/rope"
+)
+
+// fastRoPE is an interface for tensors that support fast rotary positional embedding.
+// RoPE in this package type-asserts its input tensor to fastRoPE and forwards
+// all of its arguments to this method.
+type fastRoPE interface {
+	RoPE(ctx ml.Context, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor
+}
+
+// RoPE applies rotary positional embedding to tensor `t` by delegating to the
+// tensor's own RoPE method, passing positions, dim, base, scale and options
+// through unchanged.
+//
+// It panics if t does not implement fastRoPE.
+func RoPE(ctx ml.Context, t, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor {
+	fast, ok := t.(fastRoPE)
+	if !ok {
+		panic("RoPE not implemented for this tensor type")
+	}
+
+	return fast.RoPE(ctx, positions, dim, base, scale, options...)
+}
--- a/ml/nn/rope/options.go
+++ b/ml/nn/rope/options.go
@@ -0,0 +1,92 @@
+// Package rope provides options for RoPE
+package rope
+
+import "github.com/ollama/ollama/ml"
+
+// Options collects the optional parameters of the RoPE function. The With*
+// functions in this package each return a func(*Options) that fills in part of
+// it: Type is assigned or OR-ed, Factors holds custom rope factors, and the
+// nested structs group the YaRN and MRoPE settings.
+type Options struct {
+	Type    int
+	Factors ml.Tensor
+
+	// YaRN options
+	YaRN struct {
+		OriginalContextLength int
+		ExtrapolationFactor   float32
+		AttentionFactor       float32
+		BetaFast              float32
+		BetaSlow              float32
+	}
+
+	// MRoPE options
+	MRoPE struct {
+		Sections []int
+	}
+}
+
+// WithTypeNeoX sets RoPE type to NeoX
+func WithTypeNeoX() func(*Options) {
+	return func(opts *Options) {
+		opts.Type = 2
+	}
+}
+
+// WithFactors returns an option that installs custom rope factors. A nil
+// factors tensor leaves Options.Factors untouched.
+func WithFactors(factors ml.Tensor) func(*Options) {
+	return func(o *Options) {
+		if factors == nil {
+			return
+		}
+		o.Factors = factors
+	}
+}
+
+// WithOriginalContextLength returns an option that stores n as the YaRN
+// original context length.
+func WithOriginalContextLength(n int) func(*Options) {
+	return func(o *Options) {
+		o.YaRN.OriginalContextLength = n
+	}
+}
+
+// WithExtrapolationFactor returns an option that stores extrapolationFactor as
+// the YaRN extrapolation factor.
+func WithExtrapolationFactor(extrapolationFactor float32) func(*Options) {
+	return func(o *Options) {
+		o.YaRN.ExtrapolationFactor = extrapolationFactor
+	}
+}
+
+// WithAttentionFactor returns an option that stores attentionFactor as the
+// YaRN attention factor.
+func WithAttentionFactor(attentionFactor float32) func(*Options) {
+	return func(o *Options) {
+		o.YaRN.AttentionFactor = attentionFactor
+	}
+}
+
+// WithBetaFast returns an option that stores betaFast as the YaRN BetaFast
+// value.
+func WithBetaFast(betaFast float32) func(*Options) {
+	return func(o *Options) {
+		o.YaRN.BetaFast = betaFast
+	}
+}
+
+// WithBetaSlow returns an option that stores betaSlow as the YaRN BetaSlow
+// value.
+func WithBetaSlow(betaSlow float32) func(*Options) {
+	return func(o *Options) {
+		o.YaRN.BetaSlow = betaSlow
+	}
+}
+
+func WithMRoPE(sections []int) func(*Options) {
+	return func(opts *Options) {
+		opts.Type |= 1 << 3
+		opts.MRoPE.Sections = sections
+	}
+}
+
+func WithVision(sections []int) func(*Options) {
+	return func(opts *Options) {
+		opts.Type |= 1<<3 | 1<<4
+		opts.MRoPE.Sections = sections
+	}
+}
+
+func WithInterleaveMRoPE(sections []int) func(*Options) {
+	return func(opts *Options) {
+		opts.Type |= 1<<3 | 1<<5
+		opts.MRoPE.Sections = sections
+	}
+}