ollama source for Momentry Core verification

2026-05-22 17:19:10 +08:00
commit 0b31ff9135
2020 changed files with 1413145 additions and 0 deletions
--- a/model/models/laguna/model.go
+++ b/model/models/laguna/model.go
@@ -0,0 +1,444 @@
+package laguna
+
+import (
+	"fmt"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/rope"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+	"github.com/ollama/ollama/tokenizer"
+)
+
+// Cache type identifiers that Model.Forward hands to
+// kvcache.WrapperCache.SetLayerType. New builds the wrapper with the
+// sliding-window cache first and the causal cache second, which is the same
+// order as the values below.
+// NOTE(review): presumably the value is the index of the wrapped cache —
+// confirm against the kvcache package.
+const (
+	cacheTypeSWA = iota
+	cacheTypeCausal
+)
+
+// Options holds the hyperparameters that newOptions reads from the model
+// configuration and that the layers consult during the forward pass.
+type Options struct {
+	// Embedding width and per-head width.
+	hiddenSize int
+	headDim    int
+
+	// Query head count per layer (see numHeadsForLayer) and the key/value
+	// head count shared by all layers.
+	numHeads   []int
+	numKVHeads int
+
+	// Epsilon handed to every RMSNorm.
+	eps float32
+
+	// Sliding-window size and the per-layer flag read by layerIsSliding.
+	slidingWindow        int
+	slidingWindowPattern []bool
+
+	// RoPE parameters for layers that are not sliding-window layers.
+	fullRopeDim                   int
+	fullRopeBase                  float32
+	fullRopeScale                 float32
+	fullRopeOriginalContextLength int
+	fullRopeAttentionFactor       float32
+	fullRopeBetaFast              float32
+	fullRopeBetaSlow              float32
+
+	// RoPE parameters for sliding-window layers.
+	swaRopeDim   int
+	swaRopeBase  float32
+	swaRopeScale float32
+
+	// Mixture-of-experts routing configuration (see layerUsesMoE).
+	numExperts          int
+	numExpertsUsed      int
+	normTopKProb        bool
+	routedScalingFactor float32
+	decoderSparseStep   int
+	denseLayers         map[int]bool
+}
+
+// numHeadsForLayer returns the query head count for the given layer. It uses
+// the layer's own entry when that entry exists and is positive, otherwise
+// the first entry when that is positive, otherwise 1.
+func (o *Options) numHeadsForLayer(layer int) int {
+	heads := o.numHeads
+	if layer < len(heads) {
+		if n := heads[layer]; n > 0 {
+			return n
+		}
+	}
+	if len(heads) == 0 || heads[0] <= 0 {
+		return 1
+	}
+	return heads[0]
+}
+
+// layerIsSliding reports whether the layer is flagged in
+// slidingWindowPattern. Layers past the end of the pattern are not sliding.
+func (o *Options) layerIsSliding(layer int) bool {
+	if layer >= len(o.slidingWindowPattern) {
+		return false
+	}
+	return o.slidingWindowPattern[layer]
+}
+
+// layerUsesMoE reports whether the layer's feed-forward block is the sparse
+// (mixture-of-experts) variant. Models without experts and layers listed in
+// denseLayers are always dense; otherwise every decoderSparseStep-th layer
+// (counting from 1) is sparse.
+func (o *Options) layerUsesMoE(layer int) bool {
+	switch {
+	case o.numExperts == 0, o.denseLayers[layer]:
+		return false
+	case o.decoderSparseStep <= 1:
+		// A non-positive step is treated as 1, and with a step of 1 every
+		// remaining layer is sparse.
+		return true
+	default:
+		return (layer+1)%o.decoderSparseStep == 0
+	}
+}
+
+// applyRotaryPositionEmbeddings applies NeoX-style RoPE to states at the
+// given positions. Sliding-window layers use the swa* parameters with no
+// further options; all other layers use the fullRope* parameters together
+// with the original-context-length, extrapolation, attention-factor and
+// beta options. In both cases the stored scale is passed inverted.
+func (o *Options) applyRotaryPositionEmbeddings(ctx ml.Context, layer int, states, positions ml.Tensor) ml.Tensor {
+	if o.layerIsSliding(layer) {
+		return nn.RoPE(ctx, states, positions, o.swaRopeDim, o.swaRopeBase, 1./o.swaRopeScale, rope.WithTypeNeoX())
+	}
+
+	fullOpts := []func(*rope.Options){
+		rope.WithTypeNeoX(),
+		rope.WithOriginalContextLength(o.fullRopeOriginalContextLength),
+		rope.WithExtrapolationFactor(1),
+		rope.WithAttentionFactor(o.fullRopeAttentionFactor),
+		rope.WithBetaFast(o.fullRopeBetaFast),
+		rope.WithBetaSlow(o.fullRopeBetaSlow),
+	}
+	return nn.RoPE(ctx, states, positions, o.fullRopeDim, o.fullRopeBase, 1./o.fullRopeScale, fullOpts...)
+}
+
+// Attention groups the weights of one gated self-attention block. Each field
+// is tagged with the gguf tensor name it corresponds to.
+type Attention struct {
+	Query     *nn.Linear  `gguf:"attn_q"`
+	QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"` // applied to the reshaped queries before RoPE
+	Key       *nn.Linear  `gguf:"attn_k"`
+	KeyNorm   *nn.RMSNorm `gguf:"attn_k_norm"` // applied to the reshaped keys before RoPE
+	Value     *nn.Linear  `gguf:"attn_v"`
+	Gate      *nn.Linear  `gguf:"attn_g"` // its softplus scales each head's attention output
+	Output    *nn.Linear  `gguf:"attn_output"`
+}
+
+func (sa *Attention) Forward(ctx ml.Context, layer int, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenStates.Dim(1)
+	numHeads := opts.numHeadsForLayer(layer)
+
+	query := sa.Query.Forward(ctx, hiddenStates)
+	key := sa.Key.Forward(ctx, hiddenStates)
+	value := sa.Value.Forward(ctx, hiddenStates)
+	gate := sa.Gate.Forward(ctx, hiddenStates)
+
+	query = query.Reshape(ctx, opts.headDim, numHeads, batchSize)
+	key = key.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
+	value = value.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
+
+	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
+	key = sa.KeyNorm.Forward(ctx, key, opts.eps)
+
+	query = opts.applyRotaryPositionEmbeddings(ctx, layer, query, positions)
+	key = opts.applyRotaryPositionEmbeddings(ctx, layer, key, positions)
+
+	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim)), cache)
+	// Laguna applies the per-head gate softplus in float32, then casts back.
+	gate = gate.Cast(ctx, ml.DTypeF32).Softplus(ctx).Cast(ctx, attention.DType())
+	attention = attention.Mul(ctx, gate.Reshape(ctx, 1, numHeads, batchSize))
+	attention = attention.Reshape(ctx, opts.headDim*numHeads, batchSize)
+	return sa.Output.Forward(ctx, attention)
+}
+
+// MLP is the feed-forward half of a Layer. New installs either a *dense or a
+// *sparse implementation per layer, depending on Options.layerUsesMoE.
+type MLP interface {
+	Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
+}
+
+// dense is the non-MoE feed-forward block: gate and up projections combined
+// through SILU, followed by the down projection. It is also the type of the
+// shared expert inside sparse.
+type dense struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+// Forward computes Down(Gate(x).SILU(Up(x))). The Options argument is unused
+// and exists only to satisfy the MLP interface.
+func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
+	gate := mlp.Gate.Forward(ctx, hiddenStates)
+	up := mlp.Up.Forward(ctx, hiddenStates)
+	return mlp.Down.Forward(ctx, gate.SILU(ctx, up))
+}
+
+// sparse is the mixture-of-experts feed-forward block: a router that scores
+// the experts, batched gate/up/down weights for the routed experts, and a
+// shared dense expert whose output is added for every token.
+type sparse struct {
+	Router       *nn.Linear      `gguf:"ffn_gate_inp"`
+	Gate         *nn.LinearBatch `gguf:"ffn_gate_exps"`
+	Up           *nn.LinearBatch `gguf:"ffn_up_exps"`
+	Down         *nn.LinearBatch `gguf:"ffn_down_exps"`
+	SharedExpert *dense          `gguf:",suf:_shexp"`
+	// ExpProbsBias is optional; when present it is added to the router scores
+	// for expert selection only (see topKIndices).
+	ExpProbsBias ml.Tensor       `gguf:"exp_probs_b.bias,alt:exp_probs_b"`
+}
+
+// topKIndices returns the indices of the numExpertsUsed highest-scoring
+// experts. When ExpProbsBias is set it is added to the scores first; the
+// caller's scores tensor is not replaced, so the bias affects selection only.
+func (moe *sparse) topKIndices(ctx ml.Context, scores ml.Tensor, opts *Options) ml.Tensor {
+	ranked := scores
+	if bias := moe.ExpProbsBias; bias != nil {
+		ranked = scores.Add(ctx, bias)
+	}
+	return ranked.TopK(ctx, opts.numExpertsUsed)
+}
+
+// Forward routes every column of hiddenStates through its top-k experts and
+// adds the shared expert's output.
+//
+// Router logits go through a float32 sigmoid; the selected experts' scores
+// become routing weights, optionally normalized to sum to one per column and
+// then multiplied by routedScalingFactor. The weighted expert outputs are
+// summed over the expert axis with strided views.
+func (moe *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
+	input := hiddenStates
+	columns := input.Dim(1)
+	topK := opts.numExpertsUsed
+
+	probs := moe.Router.Forward(ctx, input).Cast(ctx, ml.DTypeF32).Sigmoid(ctx)
+	chosen := moe.topKIndices(ctx, probs, opts)
+
+	weights := probs.Reshape(ctx, 1, opts.numExperts, columns).Rows(ctx, chosen)
+	if opts.normTopKProb {
+		flat := weights.Reshape(ctx, topK, columns)
+		flat = flat.Div(ctx, flat.SumRows(ctx))
+		weights = flat.Reshape(ctx, 1, topK, columns)
+	}
+	weights = weights.Scale(ctx, float64(opts.routedScalingFactor))
+
+	expertInput := input.Reshape(ctx, input.Dim(0), 1, columns)
+	up := moe.Up.Forward(ctx, expertInput, chosen)
+	activated := moe.Gate.Forward(ctx, expertInput, chosen).SILU(ctx, up)
+	weighted := moe.Down.Forward(ctx, activated, chosen).Mul(ctx, weights)
+
+	// Sum the per-expert slices: slice e starts e*Stride(1) into the tensor
+	// and all slices share the same width, outer stride and outer length.
+	width, outerStride, outerLen := weighted.Dim(0), weighted.Stride(2), weighted.Dim(2)
+	routed := weighted.View(ctx, 0, width, outerStride, outerLen)
+	for e := 1; e < topK; e++ {
+		routed = routed.Add(ctx, weighted.View(ctx, e*weighted.Stride(1), width, outerStride, outerLen))
+	}
+
+	return routed.Add(ctx, moe.SharedExpert.Forward(ctx, input, opts))
+}
+
+// Layer is one decoder block: an RMSNorm followed by the embedded attention
+// weights, then a second RMSNorm followed by the embedded feed-forward
+// implementation (dense or sparse, chosen in New).
+type Layer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	*Attention
+
+	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP
+}
+
+// Forward applies the pre-norm attention and feed-forward sub-blocks, each
+// wrapped in a residual connection. When outputs is non-nil, both the
+// attention result and its residual are reduced to the rows selected by
+// outputs before the feed-forward sub-block runs.
+func (l *Layer) Forward(ctx ml.Context, layer int, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	skip := hiddenStates
+	attn := l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
+	attn = l.Attention.Forward(ctx, layer, attn, positions, cache, opts)
+
+	if outputs != nil {
+		attn = attn.Rows(ctx, outputs)
+		skip = skip.Rows(ctx, outputs)
+	}
+
+	afterAttention := attn.Add(ctx, skip)
+
+	ffn := l.MLPNorm.Forward(ctx, afterAttention, opts.eps)
+	ffn = l.MLP.Forward(ctx, ffn, opts)
+	return ffn.Add(ctx, afterAttention)
+}
+
+// Model is the Laguna decoder: token embedding, a stack of Layers, a final
+// RMSNorm and the output projection. It embeds the tokenizer built in New and
+// the Options shared by all layers.
+type Model struct {
+	model.Base
+	tokenizer.Tokenizer
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Layers         []Layer       `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	// NOTE(review): the alt tag presumably lets the output projection reuse
+	// the token embedding tensor when no separate output tensor exists —
+	// confirm against the gguf loader.
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	*Options
+}
+
+func New(c fs.Config) (model.Model, error) {
+	if c.Bool("attention.sink_enabled") {
+		return nil, fmt.Errorf("laguna: SWA attention sinks are not supported")
+	}
+	if c.Uint("attention.gating_type") != 1 {
+		return nil, fmt.Errorf("laguna: unsupported attention gating type %d", c.Uint("attention.gating_type"))
+	}
+	if !c.Bool("attention.qk_norm") {
+		return nil, fmt.Errorf("laguna: Q/K RMSNorm is required")
+	}
+	if gating := c.Uint("expert_gating_func"); gating != 2 {
+		return nil, fmt.Errorf("laguna: unsupported expert gating function %d", gating)
+	}
+
+	numLayers := int(c.Uint("block_count"))
+	opts := newOptions(c, numLayers)
+	layers := make([]Layer, numLayers)
+	for i := range layers {
+		if opts.layerUsesMoE(i) {
+			layers[i].MLP = &sparse{}
+		} else {
+			layers[i].MLP = &dense{}
+		}
+	}
+
+	var pre []string
+	switch c.String("tokenizer.ggml.pre") {
+	case "laguna":
+		pre = []string{
+			`(?:\r?\n)+(?!\r?\n)`,
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+		}
+	default:
+		return nil, model.ErrUnsupportedTokenizer
+	}
+
+	m := Model{
+		Tokenizer: tokenizer.NewBytePairEncoding(
+			&tokenizer.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+			pre...,
+		),
+		Layers:  layers,
+		Options: opts,
+	}
+
+	m.Cache = kvcache.NewWrapperCache(
+		kvcache.NewSWACache(int32(opts.slidingWindow), m.Shift),
+		kvcache.NewCausalCache(m.Shift),
+	)
+	return &m, nil
+}
+
+// newOptions reads the model hyperparameters from the configuration.
+//
+// Dense layers are the union of the explicit "dense_layers" list and the
+// first "leading_dense_block_count" layers. RoPE scaling factors default to
+// 1 and a stored 0 is also replaced by 1, because
+// applyRotaryPositionEmbeddings divides by them.
+func newOptions(c fs.Config, numLayers int) *Options {
+	denseSet := make(map[int]bool)
+	for _, idx := range configUints(c, "dense_layers") {
+		denseSet[int(idx)] = true
+	}
+	for i, leading := uint32(0), c.Uint("leading_dense_block_count"); i < leading; i++ {
+		denseSet[int(i)] = true
+	}
+
+	nonZeroScale := func(key string) float32 {
+		if s := c.Float(key, 1); s != 0 {
+			return s
+		}
+		return 1
+	}
+	fullScale := nonZeroScale("rope.scaling.factor")
+	swaScale := nonZeroScale("rope.swa.scaling.factor")
+
+	attentionFactor := lagunaAttentionFactor(c.String("rope.scaling.type"), fullScale, c.Float("rope.scaling.attn_factor"))
+
+	// The key length is the head width and the default RoPE dimension count.
+	keyLength := c.Uint("attention.key_length")
+
+	return &Options{
+		hiddenSize: int(c.Uint("embedding_length")),
+		headDim:    int(keyLength),
+		numHeads:   expandIntArray(configUints(c, "attention.head_count"), numLayers, c.Uint("attention.head_count", 1)),
+		numKVHeads: int(c.Uint("attention.head_count_kv")),
+		eps:        c.Float("attention.layer_norm_rms_epsilon", 1e-6),
+
+		slidingWindow:        int(c.Uint("attention.sliding_window", 512)),
+		slidingWindowPattern: slidingWindowPattern(c, numLayers),
+
+		fullRopeDim:                   int(c.Uint("rope.dimension_count", keyLength)),
+		fullRopeBase:                  c.Float("rope.freq_base", 500000),
+		fullRopeScale:                 fullScale,
+		fullRopeOriginalContextLength: int(c.Uint("rope.scaling.original_context_length", 4096)),
+		fullRopeAttentionFactor:       attentionFactor,
+		fullRopeBetaFast:              c.Float("rope.scaling.beta_fast", 64),
+		fullRopeBetaSlow:              c.Float("rope.scaling.beta_slow", 1),
+
+		swaRopeDim:   int(c.Uint("rope.swa.dimension_count", keyLength)),
+		swaRopeBase:  c.Float("rope.swa.freq_base", 10000),
+		swaRopeScale: swaScale,
+
+		numExperts:          int(c.Uint("expert_count")),
+		numExpertsUsed:      int(c.Uint("expert_used_count")),
+		normTopKProb:        c.Bool("expert_weights_norm", true),
+		routedScalingFactor: c.Float("expert_weights_scale", 1),
+		decoderSparseStep:   int(c.Uint("decoder_sparse_step", 1)),
+		denseLayers:         denseSet,
+	}
+}
+
+// lagunaAttentionFactor picks the RoPE attention factor: an explicitly
+// configured non-zero attentionFactor wins; otherwise "yarn" scaling with a
+// scale factor above 1 yields 0.1*ln(scaleFactor)+1; every other case is 1.
+func lagunaAttentionFactor(ropeType string, scaleFactor, attentionFactor float32) float32 {
+	switch {
+	case attentionFactor != 0:
+		return attentionFactor
+	case ropeType == "yarn" && scaleFactor > 1:
+		return float32(0.1*math.Log(float64(scaleFactor)) + 1)
+	default:
+		return 1
+	}
+}
+
+// slidingWindowPattern returns one flag per layer telling whether the layer
+// uses sliding-window attention. It prefers the boolean
+// "attention.sliding_window_pattern" array, then an "attention.layer_types"
+// array in which the value 1 marks a sliding layer; either is used only when
+// its length equals numLayers. Otherwise every layer is reported as
+// non-sliding.
+func slidingWindowPattern(c fs.Config, numLayers int) []bool {
+	if flags := c.Bools("attention.sliding_window_pattern"); len(flags) == numLayers {
+		return flags
+	}
+
+	flags := make([]bool, numLayers)
+	if kinds := configUints(c, "attention.layer_types"); len(kinds) == numLayers {
+		for i, kind := range kinds {
+			flags[i] = kind == 1
+		}
+	}
+	return flags
+}
+
+func configUints(c fs.Config, key string) []uint32 {
+	keyExists := c.Value(c.Architecture()+"."+key) != nil || c.Value(key) != nil
+	if cc, ok := c.(interface {
+		Uints(string, ...[]uint32) []uint32
+	}); ok {
+		if values := cc.Uints(key); len(values) > 0 && (keyExists || !(len(values) == 1 && values[0] == 0)) {
+			return values
+		}
+	}
+
+	ints := c.Ints(key)
+	if len(ints) > 0 && (keyExists || !(len(ints) == 1 && ints[0] == 0)) {
+		values := make([]uint32, len(ints))
+		for i, v := range ints {
+			values[i] = uint32(v)
+		}
+		return values
+	}
+
+	if scalar := c.Uint(key); scalar != 0 {
+		return []uint32{scalar}
+	}
+	return nil
+}
+
+// expandIntArray returns a slice of length n built from values.
+//
+// The first len(values) positions are copied from values (converted to int);
+// any remaining positions are filled with values[0]. When values is empty,
+// fallback is used as the only value, so every position equals fallback.
+// Entries of values beyond index n-1 are ignored.
+func expandIntArray(values []uint32, n int, fallback uint32) []int {
+	if len(values) == 0 {
+		values = []uint32{fallback}
+	}
+	// Positions past the end of values repeat the first entry.
+	pad := int(values[0])
+
+	out := make([]int, n)
+	for i := range out {
+		if i < len(values) {
+			out[i] = int(values[i])
+		} else {
+			out[i] = pad
+		}
+	}
+	return out
+}
+
+// Shift is the callback given to the KV caches in New. It applies the
+// layer's rotary embedding to key using shift as the positions and never
+// returns an error.
+func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	shifted := m.Options.applyRotaryPositionEmbeddings(ctx, layer, key, shift)
+	return shifted, nil
+}
+
+// Forward runs the decoder over a batch and returns the output projection of
+// the final hidden states.
+//
+// Before each layer the cache, when present, is pointed at that layer; a
+// wrapper cache is additionally told whether the layer is sliding-window or
+// causal. Only the last layer receives batch.Outputs, so earlier layers
+// process every position.
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
+	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
+
+	lastLayer := len(m.Layers) - 1
+	for i := range m.Layers {
+		if m.Cache != nil {
+			m.Cache.SetLayer(i)
+			if wrapper, ok := m.Cache.(*kvcache.WrapperCache); ok {
+				if m.Options.layerIsSliding(i) {
+					wrapper.SetLayerType(cacheTypeSWA)
+				} else {
+					wrapper.SetLayerType(cacheTypeCausal)
+				}
+			}
+		}
+
+		var outputs ml.Tensor
+		if i == lastLayer {
+			outputs = batch.Outputs
+		}
+
+		hiddenStates = m.Layers[i].Forward(ctx, i, hiddenStates, positions, outputs, m.Cache, m.Options)
+	}
+
+	normed := m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
+	return m.Output.Forward(ctx, normed), nil
+}
+
+// init registers the constructor New under the architecture name "laguna".
+func init() {
+	model.Register("laguna", New)
+}
+
+// Compile-time check that *Model satisfies model.Model.
+var _ model.Model = (*Model)(nil)