ollama source for Momentry Core verification

2026-05-22 17:19:10 +08:00
commit 0b31ff9135
2020 changed files with 1413145 additions and 0 deletions
--- a/model/models/gemma4/model_text.go
+++ b/model/models/gemma4/model_text.go
@@ -0,0 +1,475 @@
+package gemma4
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/rope"
+	"github.com/ollama/ollama/model/input"
+)
+
+// Cache type identifiers passed to WrapperCache.SetLayerType in
+// TextModel.Forward.
+// NOTE(review): presumably these index the sub-caches held by the wrapper
+// cache — confirm against where the cache is constructed.
+const (
+	// cacheTypeSWA is selected for layers where isLocal reports true.
+	cacheTypeSWA = iota
+	// cacheTypeCausal is selected for every other layer.
+	cacheTypeCausal
+)
+
+// TextOptions holds the hyperparameters of the text decoder. It is filled in
+// by newTextModel from the model configuration and passed by pointer to the
+// layer Forward methods.
+type TextOptions struct {
+	hiddenSize              int
+	numHeads, numKVHeads    int
+	numGlobalKVHeads        int
+	headDim, globalHeadDim  int
+	hiddenLayers            int
+	hiddenSizePerLayerInput int
+
+	// eps is the epsilon handed to every RMSNorm call.
+	// ropeBase/partialRotaryDims apply to non-local layers, ropeLocalBase to
+	// local (sliding-window) layers; see ropeForLayer.
+	eps               float32
+	ropeBase          float32
+	ropeLocalBase     float32
+	partialRotaryDims int // RoPE dims for full-attention (global) layers
+
+	// slidingWindowPattern[i] is true when layer i is a local layer. Layers
+	// beyond the end of the slice are treated as non-local (see isLocal).
+	slidingWindowPattern []bool
+	// kvDonorMap maps shared layer index -> donor layer index.
+	// Donor is the last non-shared layer of the same type (sliding/full).
+	kvDonorMap map[int]int
+
+	finalLogitSoftcap float32
+
+	// numExperts is the total expert count, numExpertsUsed the number
+	// selected per token by TextRouter (its TopK argument).
+	numExperts     int
+	numExpertsUsed int
+}
+
+// isLocal reports whether layer is marked true in slidingWindowPattern.
+// Layers at or beyond the end of the pattern are reported as non-local.
+func (o *TextOptions) isLocal(layer int) bool {
+	pattern := o.slidingWindowPattern
+	return layer < len(pattern) && pattern[layer]
+}
+
+// ropeForLayer returns the RoPE frequency base and rotary dimension count
+// for layer: (ropeLocalBase, headDim) for local layers and
+// (ropeBase, partialRotaryDims) for all others.
+func (o *TextOptions) ropeForLayer(layer int) (base float32, dims int) {
+	base, dims = o.ropeBase, o.partialRotaryDims
+	if o.isLocal(layer) {
+		base, dims = o.ropeLocalBase, o.headDim
+	}
+	return base, dims
+}
+
+// kvHeadsForLayer returns the number of KV heads for layer. Non-local layers
+// use numGlobalKVHeads when it is set (> 0); every other case uses
+// numKVHeads.
+func (o *TextOptions) kvHeadsForLayer(layer int) int {
+	if !o.isLocal(layer) && o.numGlobalKVHeads > 0 {
+		return o.numGlobalKVHeads
+	}
+	return o.numKVHeads
+}
+
+// headDimForLayer returns the attention head dimension for layer:
+// globalHeadDim for non-local layers, headDim for local ones.
+func (o *TextOptions) headDimForLayer(layer int) int {
+	if !o.isLocal(layer) {
+		return o.globalHeadDim
+	}
+	return o.headDim
+}
+
+// TextModel is the text decoder: a token embedding, an optional per-layer
+// projector (PLE), a stack of TextLayer blocks, and a final RMSNorm. The
+// embedded TextOptions carries the hyperparameters used by Forward.
+type TextModel struct {
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	// PerLayerProjector may be nil; Forward skips PLE in that case.
+	*PerLayerProjector
+	Layers     []TextLayer `gguf:"blk"`
+	OutputNorm *nn.RMSNorm `gguf:"output_norm"`
+	// Output is not referenced by Forward in this file.
+	// NOTE(review): presumably applied by the caller to produce logits — confirm.
+	Output     *nn.Linear  `gguf:"output,alt:token_embd"`
+	TextOptions
+}
+
+// newTextModel builds a TextModel from the configuration c. It allocates
+// block_count layers and fills TextOptions from the config keys read below,
+// applying the defaults passed to each lookup. It also precomputes
+// kvDonorMap, which maps each KV-sharing layer to the layer whose K/V it
+// reuses.
+func newTextModel(c fs.Config) *TextModel {
+	numLayers := int(c.Uint("block_count"))
+
+	// Head dimensions: key_length is global head dim, key_length_swa is local (SWA) head dim.
+	globalHeadDim := int(c.Uint("attention.key_length", 512))
+	headDim := int(c.Uint("attention.key_length_swa", 256))
+
+	// RoPE dimensions for global (full attention) layers with proportional RoPE.
+	// Taken from rope.dimension_count when present; otherwise derived as
+	// globalHeadDim * rope.partial_rotary_factor (factor defaults to 1.0, i.e.
+	// the full global head dim, with the freq_factors tensor handling partial
+	// rotation).
+	partialRotaryDims := int(c.Uint("rope.dimension_count", 0))
+	if partialRotaryDims == 0 {
+		partialFactor := c.Float("rope.partial_rotary_factor", 1.0)
+		partialRotaryDims = int(float32(globalHeadDim) * partialFactor)
+	}
+
+	// The local base is read from rope.freq_base_swa, falling back to
+	// rope.local.freq_base when that key is absent or zero.
+	ropeBase := c.Float("rope.freq_base", 1000000.0)
+	ropeLocalBase := c.Float("rope.freq_base_swa", 0)
+	if ropeLocalBase == 0 {
+		ropeLocalBase = c.Float("rope.local.freq_base", 10000.0)
+	}
+
+	numGlobalKVHeads := int(c.Uint("attention.global_head_count_kv", 0))
+	slidingPattern := c.Bools("attention.sliding_window_pattern")
+
+	// layerIsLocal mirrors TextOptions.isLocal: layers past the end of the
+	// pattern count as non-local instead of indexing out of range.
+	layerIsLocal := func(layer int) bool {
+		return layer < len(slidingPattern) && slidingPattern[layer]
+	}
+
+	// KV heads: try per-layer array first (MoE models), then fall back to scalar
+	numKVHeads := 0
+	kvHeadsArray := c.Ints("attention.head_count_kv")
+	if len(kvHeadsArray) > 0 {
+		numKVHeads = int(kvHeadsArray[0])
+		// Without an explicit global KV head count, take the entry of the
+		// first non-local layer from the per-layer array.
+		if numGlobalKVHeads == 0 && len(slidingPattern) > 0 {
+			for i, isLocal := range slidingPattern {
+				if !isLocal && i < len(kvHeadsArray) {
+					numGlobalKVHeads = int(kvHeadsArray[i])
+					break
+				}
+			}
+		}
+	}
+	if numKVHeads == 0 {
+		numKVHeads = int(c.Uint("attention.head_count_kv", 0))
+	}
+
+	// Compute KV sharing donor map (same logic as MLX)
+	sharedLayers := int(c.Uint("attention.shared_kv_layers", 0))
+	kvDonorMap := make(map[int]int)
+	if sharedLayers > 0 && len(slidingPattern) > 0 {
+		// Clamp so a shared count larger than the layer count cannot yield a
+		// negative starting index.
+		firstShared := max(numLayers-sharedLayers, 0)
+		for i := firstShared; i < numLayers; i++ {
+			isLocal := layerIsLocal(i)
+			// Find last non-shared layer of same type
+			for j := firstShared - 1; j >= 0; j-- {
+				if layerIsLocal(j) == isLocal {
+					kvDonorMap[i] = j
+					break
+				}
+			}
+		}
+	}
+
+	return &TextModel{
+		Layers: make([]TextLayer, numLayers),
+		TextOptions: TextOptions{
+			hiddenSize:              int(c.Uint("embedding_length")),
+			numHeads:                int(c.Uint("attention.head_count")),
+			numKVHeads:              numKVHeads,
+			numGlobalKVHeads:        numGlobalKVHeads,
+			headDim:                 headDim,
+			globalHeadDim:           globalHeadDim,
+			hiddenLayers:            numLayers,
+			hiddenSizePerLayerInput: int(c.Uint("embedding_length_per_layer_input", 0)),
+			eps:                     c.Float("attention.layer_norm_rms_epsilon", 1e-06),
+			ropeBase:                ropeBase,
+			ropeLocalBase:           ropeLocalBase,
+			partialRotaryDims:       partialRotaryDims,
+			slidingWindowPattern:    slidingPattern,
+			kvDonorMap:              kvDonorMap,
+			finalLogitSoftcap:       c.Float("final_logit_softcapping", 0.0),
+			numExperts:              int(c.Uint("expert_count", 0)),
+			numExpertsUsed:          int(c.Uint("expert_used_count", 0)),
+		},
+	}
+}
+
+// Forward runs the text decoder over batch and returns the hidden state of
+// the last layer after OutputNorm.
+//
+// Token embeddings are scaled by sqrt(hiddenSize). Vision embeddings from
+// batch.Multimodal are copied into the hidden state at each image's index,
+// and the positions they occupy are passed as CausalOptions.Except to the
+// underlying causal cache. When cache is non-nil it must be a
+// *kvcache.WrapperCache; the unchecked type assertion panics otherwise.
+func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cache) ml.Tensor {
+	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
+
+	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
+	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.hiddenSize)))
+
+	// Inject vision embeddings into the hidden state
+	var except []int
+	for _, image := range batch.Multimodal {
+		visionOutputs := image.Multimodal[0].Tensor
+		ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
+
+		// Record every position covered by this image.
+		for i := range visionOutputs.Dim(1) {
+			except = append(except, image.Index+i)
+		}
+	}
+
+	// PLE
+	var perLayerInputs ml.Tensor
+	if m.PerLayerProjector != nil {
+		perLayerInputs = m.PerLayerProjector.Forward(ctx, batch, hiddenState, &m.TextOptions)
+	}
+
+	for i := range len(m.Layers) {
+		layer := m.Layers[i]
+		if cache != nil {
+			cache.SetLayer(i)
+			cacheType := cacheTypeSWA
+			if !m.isLocal(i) {
+				cacheType = cacheTypeCausal
+			}
+			wc := cache.(*kvcache.WrapperCache)
+			wc.SetLayerType(cacheType)
+
+			if causal, ok := wc.UnderlyingCache().(*kvcache.Causal); ok {
+				causal.SetCausal(ctx, kvcache.CausalOptions{Except: except})
+			}
+		}
+
+		// Only the final layer receives batch.Outputs; TextLayer.Forward uses
+		// it to select rows.
+		var lastLayerOutputs ml.Tensor
+		if i == len(m.Layers)-1 {
+			lastLayerOutputs = batch.Outputs
+		}
+
+		// Slice out layer i's per-layer input from the combined PLE tensor.
+		var perLayerInput ml.Tensor
+		if perLayerInputs != nil {
+			perLayerInput = perLayerInputs.View(ctx, i*perLayerInputs.Stride(1), perLayerInputs.Dim(0), perLayerInputs.Stride(2), perLayerInputs.Dim(2))
+		}
+
+		// KV sharing: layers >= firstShared reuse K/V from donor layers.
+		// Sharing goes through the cache, so it is only enabled when a cache
+		// is present; without one the layer computes its own K/V rather than
+		// calling SetLayer on a nil cache.
+		isShared := false
+		if cache != nil {
+			if donorLayer, ok := m.kvDonorMap[i]; ok {
+				// Set cache layer to donor so Get() reads donor's K/V
+				cache.SetLayer(donorLayer)
+				isShared = true
+			}
+		}
+		hiddenState = layer.Forward(ctx, i, hiddenState, positions, perLayerInput, lastLayerOutputs, cache, isShared, &m.TextOptions)
+	}
+
+	return m.OutputNorm.Forward(ctx, hiddenState, m.eps)
+}
+
+// PerLayerProjector implements PLE. It combines a dedicated per-layer token
+// embedding with a normalized linear projection of the main hidden state
+// (see Forward).
+type PerLayerProjector struct {
+	TokenEmbedding *nn.Embedding `gguf:"per_layer_token_embd"`
+	Projector      *nn.Linear    `gguf:"per_layer_model_proj"`
+	Norm           *nn.RMSNorm   `gguf:"per_layer_proj_norm"`
+}
+
+// Forward builds the per-layer inputs for every layer at once. Both branches
+// are reshaped to (hiddenSizePerLayerInput, hiddenLayers, inputs.Dim(1)):
+// the per-layer token embedding scaled by sqrt(hiddenSizePerLayerInput), and
+// the projection of inputs scaled by 1/sqrt(hiddenSize) then RMS-normalized.
+// Their sum is scaled by 1/sqrt(2).
+func (p *PerLayerProjector) Forward(ctx ml.Context, batch input.Batch, inputs ml.Tensor, opts *TextOptions) ml.Tensor {
+	pleDim, numLayers, numTokens := opts.hiddenSizePerLayerInput, opts.hiddenLayers, inputs.Dim(1)
+
+	// Token branch, shaped [pleDim, numLayers, numTokens] to match the projection.
+	tokenPLE := p.TokenEmbedding.Forward(ctx, batch.Inputs).
+		Scale(ctx, math.Sqrt(float64(pleDim))).
+		Reshape(ctx, pleDim, numLayers, numTokens)
+
+	// Projection branch from the main hidden state.
+	projected := p.Projector.Forward(ctx, inputs).
+		Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize))).
+		Reshape(ctx, pleDim, numLayers, numTokens)
+	projected = p.Norm.Forward(ctx, projected, opts.eps)
+
+	if tokenPLE == nil {
+		return projected
+	}
+	return projected.Add(ctx, tokenPLE).Scale(ctx, 1/math.Sqrt(2))
+}
+
+// TextSelfAttention holds the attention weights of one layer. Value may be
+// nil, in which case Forward reuses the raw key projection as the value.
+// RopeFactors may be nil; when set it is only applied on non-local layers.
+type TextSelfAttention struct {
+	Query       *nn.Linear  `gguf:"attn_q"`
+	QueryNorm   *nn.RMSNorm `gguf:"attn_q_norm"`
+	Key         *nn.Linear  `gguf:"attn_k"`
+	KeyNorm     *nn.RMSNorm `gguf:"attn_k_norm"`
+	Value       *nn.Linear  `gguf:"attn_v"`
+	Output      *nn.Linear  `gguf:"attn_output"`
+	RopeFactors ml.Tensor   `gguf:"rope_freqs.weight"` // proportional RoPE freq_factors
+}
+
+// Forward computes self-attention for one layer and returns the output
+// projection of the attention result.
+//
+// Head dimension, KV head count and RoPE parameters are chosen per layer
+// from opts. When sharedKV is true no key/value is computed here and nil
+// k/v are passed to nn.Attention.
+// NOTE(review): presumably nn.Attention then reads K/V from cache (the caller
+// points the cache at the donor layer) — confirm against nn.Attention.
+func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positions ml.Tensor, cache kvcache.Cache, sharedKV bool, opts *TextOptions) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	hd := opts.headDimForLayer(layer)
+	kvHeads := opts.kvHeadsForLayer(layer)
+	ropeBase, ropeDims := opts.ropeForLayer(layer)
+
+	// Query: project, split into heads, then apply the learned Q norm.
+	q := sa.Query.Forward(ctx, hiddenState)
+	q = q.Reshape(ctx, hd, opts.numHeads, batchSize)
+	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
+
+	// k and v stay nil for KV-sharing layers.
+	var k, v ml.Tensor
+	if !sharedKV {
+		k = sa.Key.Forward(ctx, hiddenState)
+		k = k.Reshape(ctx, hd, kvHeads, batchSize)
+
+		if sa.Value != nil {
+			v = sa.Value.Forward(ctx, hiddenState)
+			v = v.Reshape(ctx, hd, kvHeads, batchSize)
+		} else {
+			// K=V: use raw K projection (before K norm) as V
+			v = k
+		}
+
+		k = sa.KeyNorm.Forward(ctx, k, opts.eps)
+		v = v.RMSNorm(ctx, nil, opts.eps) // V norm: unweighted RMSNorm
+	}
+
+	// RoPE with proportional freq_factors on global layers
+	ropeOpts := []func(*rope.Options){rope.WithTypeNeoX()}
+	if sa.RopeFactors != nil && !opts.isLocal(layer) {
+		ropeOpts = append(ropeOpts, rope.WithFactors(sa.RopeFactors))
+	}
+	q = nn.RoPE(ctx, q, positions, ropeDims, ropeBase, 1.0, ropeOpts...)
+	if k != nil {
+		k = nn.RoPE(ctx, k, positions, ropeDims, ropeBase, 1.0, ropeOpts...)
+	}
+
+	// Attention is invoked with a scale argument of 1.0.
+	attention := nn.Attention(ctx, q, k, v, 1.0, cache)
+
+	// Merge heads back into a single feature dimension before the output projection.
+	attention = attention.Reshape(ctx, hd*opts.numHeads, batchSize)
+	return sa.Output.Forward(ctx, attention)
+}
+
+// TextMLP is the dense feed-forward block of a layer: gate and up
+// projections combined through GELU, followed by a down projection.
+type TextMLP struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+// Forward applies the feed-forward block: the gate projection is combined
+// with the up projection via GELU, and the result goes through Down.
+func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
+	gate := mlp.Gate.Forward(ctx, hiddenState)
+	up := mlp.Up.Forward(ctx, hiddenState)
+	return mlp.Down.Forward(ctx, gate.GELU(ctx, up))
+}
+
+// TextRouter implements the Gemma 4 MoE router. Proj maps the normalized
+// hidden state to per-expert scores; Scale is a learned tensor multiplied
+// into the input before projection.
+type TextRouter struct {
+	Proj  *nn.Linear `gguf:"ffn_gate_inp"`
+	Scale ml.Tensor  `gguf:"ffn_gate_inp.scale"`
+}
+
+// Forward computes the routing distribution for hiddenState. It returns the
+// softmax over all expert scores (routingWeights) and the result of TopK
+// with opts.numExpertsUsed applied to those weights (selectedExperts).
+func (r *TextRouter) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) (routingWeights, selectedExperts ml.Tensor) {
+	// Weightless RMSNorm, then 1/sqrt(hidden_size), then the learned scale.
+	normed := hiddenState.RMSNorm(ctx, nil, opts.eps).
+		Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize))).
+		Mul(ctx, r.Scale)
+
+	// Expert logits -> probabilities over experts.
+	routingWeights = r.Proj.Forward(ctx, normed).Softmax(ctx)
+
+	// Pick the top numExpertsUsed experts.
+	selectedExperts = routingWeights.TopK(ctx, opts.numExpertsUsed)
+	return routingWeights, selectedExperts
+}
+
+// TextMoEBlock implements the Gemma 4 sparse MoE. Expert gate/up weights are
+// used either fused (GateUp) or split (Gate and Up); Forward prefers GateUp
+// when its weight is present. DownScale is optional and, when set, scales
+// each selected expert's output.
+type TextMoEBlock struct {
+	GateUp    *nn.LinearBatch `gguf:"ffn_gate_up_exps"`
+	Gate      *nn.LinearBatch `gguf:"ffn_gate_exps"`
+	Up        *nn.LinearBatch `gguf:"ffn_up_exps"`
+	Down      *nn.LinearBatch `gguf:"ffn_down_exps"`
+	DownScale ml.Tensor       `gguf:"ffn_down_exps.scale,alt:ffn_gate_inp.per_expert_scale"`
+}
+
+// Forward runs the selected experts on hiddenState and returns their
+// weighted sum.
+//
+// routingWeights holds the router's weights over all experts and
+// selectedExperts the chosen expert indices. The weights of the chosen
+// experts are gathered and renormalized to sum to one before being applied.
+func (moe *TextMoEBlock) Forward(ctx ml.Context, hiddenState, routingWeights, selectedExperts ml.Tensor, opts *TextOptions) ml.Tensor {
+	// Select routing weights for chosen experts and renormalize
+	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenState.Dim(1)).Rows(ctx, selectedExperts)
+	routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenState.Dim(1))
+	routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
+	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenState.Dim(1))
+
+	// Insert a singleton middle dimension before the batched expert matmuls.
+	hiddenState = hiddenState.Reshape(ctx, hiddenState.Dim(0), 1, hiddenState.Dim(1))
+
+	// Expert computation using LinearBatch (MulmatID selecting experts by index)
+	var gateOut, upOut ml.Tensor
+	if moe.GateUp != nil && moe.GateUp.Weight != nil {
+		// Fused weights: the first half of dim 0 is the gate, the second half the up projection.
+		gateUp := moe.GateUp.Forward(ctx, hiddenState, selectedExperts)
+		nFF := gateUp.Dim(0) / 2
+		gateOut = gateUp.Slice(ctx, 0, 0, nFF, 1)
+		upOut = gateUp.Slice(ctx, 0, nFF, gateUp.Dim(0), 1)
+	} else {
+		gateOut = moe.Gate.Forward(ctx, hiddenState, selectedExperts)
+		upOut = moe.Up.Forward(ctx, hiddenState, selectedExperts)
+	}
+	hiddenState = gateOut.GELU(ctx, upOut)
+	experts := moe.Down.Forward(ctx, hiddenState, selectedExperts)
+
+	// Apply per-expert down projection scale when present.
+	if moe.DownScale != nil {
+		// Repeat the per-expert scale along hiddenState.Dim(2), then gather
+		// the entries of the selected experts the same way as routingWeights.
+		expertScales := moe.DownScale.Reshape(ctx, opts.numExperts, 1)
+		expertScales = expertScales.Repeat(ctx, 1, hiddenState.Dim(2))
+		expertScales = expertScales.Reshape(ctx, 1, opts.numExperts, hiddenState.Dim(2)).Rows(ctx, selectedExperts)
+		expertScales = expertScales.Reshape(ctx, opts.numExpertsUsed, hiddenState.Dim(2))
+		expertScales = expertScales.Reshape(ctx, 1, opts.numExpertsUsed, hiddenState.Dim(2))
+		experts = experts.Mul(ctx, expertScales)
+	}
+
+	// Apply routing weights
+	experts = experts.Mul(ctx, routingWeights)
+
+	// Sum across experts: start from the view at offset 0 and add the view
+	// at each further multiple of Stride(1).
+	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
+	for i := 1; i < opts.numExpertsUsed; i++ {
+		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
+	}
+
+	return nextStates
+}
+
+// TextLayer is one decoder block: self-attention, a dense MLP, an optional
+// MoE branch run alongside the MLP, optional per-layer-input (PLE)
+// injection, and an optional output scalar. Forward checks the optional
+// parts for nil before using them.
+type TextLayer struct {
+	AttentionNorm     *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention     *TextSelfAttention
+	PostAttentionNorm *nn.RMSNorm `gguf:"post_attention_norm,alt:attn_post_norm"`
+	MLPNorm           *nn.RMSNorm `gguf:"ffn_norm,alt:ffn_pre_norm"`
+	MLP               *TextMLP
+	PostMLPNorm       *nn.RMSNorm `gguf:"post_ffw_norm,alt:ffn_post_norm"`
+
+	// MoE (present only for models with enable_moe_block=true)
+	Router       *TextRouter
+	MoE          *TextMoEBlock
+	MoENorm      *nn.RMSNorm `gguf:"pre_ffw_norm_2,alt:ffn_pre_norm_2"`
+	PostMoENorm  *nn.RMSNorm `gguf:"post_ffw_norm_2,alt:ffn_post_norm_2"`
+	PostMLPNorm1 *nn.RMSNorm `gguf:"post_ffw_norm_1,alt:ffn_post_norm_1"` // used instead of PostMLPNorm when MoE is present
+
+	// PLE injection weights and the per-layer output scalar.
+	PerLayerInputGate  *nn.Linear  `gguf:"inp_gate"`
+	PerLayerProjection *nn.Linear  `gguf:"proj"`
+	PostPerLayerNorm   *nn.RMSNorm `gguf:"post_norm"`
+	LayerScalar        ml.Tensor   `gguf:"layer_scalar,alt:layer_output_scale.weight"`
+}
+
+// Forward runs one decoder block and returns the new hidden state.
+//
+// The sequence is: attention with pre/post norms and a residual add; the
+// feed-forward stage (dense MLP, or MLP plus MoE when router and expert
+// weights are present) with a residual add; optional PLE injection when
+// perLayerInput and PerLayerInputGate are both set; and an optional multiply
+// by LayerScalar. When outputs is non-nil, the hidden state, residual and
+// perLayerInput are reduced to the rows selected by outputs right after
+// attention.
+func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positions, perLayerInput, outputs ml.Tensor, cache kvcache.Cache, sharedKV bool, opts *TextOptions) ml.Tensor {
+	residual := hiddenState
+
+	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.SelfAttention.Forward(ctx, layer, hiddenState, positions, cache, sharedKV, opts)
+	hiddenState = l.PostAttentionNorm.Forward(ctx, hiddenState, opts.eps)
+
+	// Row selection happens after attention so the attention call still sees
+	// every position of the batch.
+	if outputs != nil {
+		hiddenState = hiddenState.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+		if perLayerInput != nil {
+			perLayerInput = perLayerInput.Rows(ctx, outputs)
+		}
+	}
+
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+
+	// MLP (+ optional MoE in parallel)
+	// The MoE path needs a router, a down projection, and either split
+	// (Gate + Up) or fused (GateUp) expert weights.
+	hasSplitExperts := l.MoE != nil && l.MoE.Gate != nil && l.MoE.Up != nil && l.MoE.Gate.Weight != nil && l.MoE.Up.Weight != nil
+	hasFusedExperts := l.MoE != nil && l.MoE.GateUp != nil && l.MoE.GateUp.Weight != nil
+	if l.Router != nil && l.MoE != nil && l.MoE.Down != nil && l.MoE.Down.Weight != nil && (hasSplitExperts || hasFusedExperts) {
+		// MoE layers: run MLP and MoE in parallel, sum results
+		mlpState := l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+		mlpState = l.MLP.Forward(ctx, mlpState)
+		mlpState = l.PostMLPNorm1.Forward(ctx, mlpState, opts.eps)
+
+		// The router sees the un-normalized hidden state; the experts see
+		// the MoENorm output.
+		routingWeights, selectedExperts := l.Router.Forward(ctx, hiddenState, opts)
+		moeState := l.MoENorm.Forward(ctx, hiddenState, opts.eps)
+		moeState = l.MoE.Forward(ctx, moeState, routingWeights, selectedExperts, opts)
+		moeState = l.PostMoENorm.Forward(ctx, moeState, opts.eps)
+
+		// Combine MLP + MoE, apply outer post-FFN norm, then add residual
+		combined := mlpState.Add(ctx, moeState)
+		combined = l.PostMLPNorm.Forward(ctx, combined, opts.eps)
+		hiddenState = combined.Add(ctx, residual)
+	} else {
+		// Dense layers: MLP only
+		hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+		hiddenState = l.MLP.Forward(ctx, hiddenState)
+		hiddenState = l.PostMLPNorm.Forward(ctx, hiddenState, opts.eps)
+		hiddenState = hiddenState.Add(ctx, residual)
+	}
+
+	// PLE injection (after MLP residual)
+	if perLayerInput != nil && l.PerLayerInputGate != nil {
+		pleState := l.PerLayerInputGate.Forward(ctx, hiddenState)
+		pleState = pleState.GELU(ctx, perLayerInput)
+		pleState = l.PerLayerProjection.Forward(ctx, pleState)
+		pleState = l.PostPerLayerNorm.Forward(ctx, pleState, opts.eps)
+		hiddenState = hiddenState.Add(ctx, pleState)
+	}
+
+	// Layer scalar applied at end of layer (full-attention layers only)
+	// The code itself only checks that the tensor is present.
+	if l.LayerScalar != nil {
+		hiddenState = hiddenState.Mul(ctx, l.LayerScalar)
+	}
+
+	return hiddenState
+}