ollama source for Momentry Core verification

This commit is contained in:
Accusys
2026-05-22 17:19:10 +08:00
commit 0b31ff9135
2020 changed files with 1413145 additions and 0 deletions

84
kvcache/cache.go Normal file
View File

@@ -0,0 +1,84 @@
package kvcache
import (
"errors"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
var (
ErrKvCacheFull = errors.New("could not find a kv cache slot")
ErrNotSupported = errors.New("model does not support operation")
)
type Cache interface {
// ** used by model implementations **
// SetLayer sets the active layer of the cache
SetLayer(layer int)
// Get returns the history of key and value tensors plus a mask
//
// The shape of the tensors is documented in the specific
// cache implementation used.
Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor)
// Put stores a batch of key and value in the cache
//
// The shape of the tensors is documented in the specific
// cache implementation used.
Put(ctx ml.Context, key, value ml.Tensor)
// SetConfig controls optimizations (mostly backend-specific) that may transform
// the output of the cache to work better with specific kernels. If not called,
// the backend settings will be used. This works well when calling Attention.
//
// The config can be overridden by models, especially if they require vanilla
// output when implementing their own version of attention. To do this, pass
// an empty ml.CacheConfig.
//
// Most models will not need to use this.
SetConfig(ml.CacheConfig)
// ** cache management **
// Init sets up runtime parameters.
// backend: Used to allocate cache data storage and execute management operations (such as defrag)
// dtype: The data type for storing cache entries
// maxSequences: The maximum number of sequences stored in the cache - across all batches
// capacity: The number of cache entries to store, per sequence
// maxBatch: The maximum number of tokens that can occur in a single batch
Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int)
// Close closes the cache and frees resources associated with it
Close()
// StartForward is called before the start of the model's forward pass.
// For each token in the coming batch, there must be a corresponding
// entry in positions and seqs. reserve is to preallocate memory
// without actually storing data in the cache.
StartForward(ctx ml.Context, batch input.Batch, reserve bool) error
// CopyPrefix copies tokens in the range [0, len) from srcSeq to dstSeq
CopyPrefix(srcSeq, dstSeq int, len int32)
// CanResume returns true if the cache can continue with the next token at
// the given position and sequence. Assumes that the caller has already
// verified the contents of the cache.
CanResume(seq int, pos int32) bool
// Remove deletes tokens in the range [beginIndex, endIndex) from seq. Set
// endIndex to math.MaxInt32 to remove everything starting at beginIndex.
//
// If an error occurs, the entire context for the sequence should be
// removed by calling Remove(seq, 0, math.MaxInt32)
Remove(seq int, beginIndex, endIndex int32) error
}
// CheckpointCache optionally supports restoring recurrent state to a prior
// position to avoid full prompt reprocessing when a prefix mismatch occurs.
// The returned position is the number of tokens that can be kept (prefix length).
type CheckpointCache interface {
PrepareRestore(seq int, targetPos int32) (int32, bool)
}

666
kvcache/causal.go Normal file
View File

@@ -0,0 +1,666 @@
package kvcache
import (
"errors"
"fmt"
"math"
"slices"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)
// Causal cache stores K and V tensors according to their position in the
// sequence. Returns the history and a mask for attending to past tokens
//
// The tensors are of shape embed dim, kv heads, batch size
// The mask is of shape history size, batch size
type Causal struct {
DType ml.DType
// swaWindowSize is the number of tokens that will be included in the mask
// during attention operations. swaMemorySize is the number of tokens that
// will be retained in memory for partial prefix caching. Set to math.MaxInt32
// for unlimited or if sliding window attention is not being used.
swaWindowSize int32
swaMemorySize int32
chunkSize int32
opts CausalOptions
// maxBatch is the largest batch that we might receive
maxBatch int
// config controls mostly backend-specific optimizations
config *ml.CacheConfig
// ** current forward pass **
// size of the current batch
curBatchSize int
// locations for data storage for this batch
curLoc ml.Tensor
// mask of the cache as used by this batch
curMask ml.Tensor
// the active layer for Get and Put
curLayer int
// locations in the cache that are needed for this batch
curCellRange cellRange
// curSequences is the sequences corresponding to this pass's entries in the cache
curSequences []int
// curPositions is the positions corresponding to this pass's entries in the cache
curPositions []int32
// ** cache metadata **
// for each possible location in the cache, stores the position and set of sequences
// that reference the data there
cells []cacheCell
// maps from sequence to the range of locations where it is stored in the cache
cellRanges map[int]cellRange
// ** cache data storage **
shiftFn shiftFn
backend ml.Backend
ctxs map[int]ml.Context
keys, values map[int]ml.Tensor
}
type cacheCell struct {
pos int32
sequences []int
}
type cellRange struct {
min int
max int
}
func NewCausalCache(shift shiftFn) *Causal {
return &Causal{
shiftFn: shift,
ctxs: make(map[int]ml.Context),
keys: make(map[int]ml.Tensor),
values: make(map[int]ml.Tensor),
}
}
func NewSWACache(windowSize int32, shift shiftFn) *Causal {
return &Causal{
swaWindowSize: windowSize,
shiftFn: shift,
ctxs: make(map[int]ml.Context),
keys: make(map[int]ml.Tensor),
values: make(map[int]ml.Tensor),
}
}
func NewSWAMemCache(windowSize int32, memorySize int32, shift shiftFn) *Causal {
return &Causal{
swaWindowSize: windowSize,
swaMemorySize: memorySize,
shiftFn: shift,
ctxs: make(map[int]ml.Context),
keys: make(map[int]ml.Tensor),
values: make(map[int]ml.Tensor),
}
}
func NewChunkedAttentionCache(chunkSize int32, shift shiftFn) *Causal {
return &Causal{
chunkSize: chunkSize,
shiftFn: shift,
ctxs: make(map[int]ml.Context),
keys: make(map[int]ml.Tensor),
values: make(map[int]ml.Tensor),
}
}
func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
if c.config == nil {
var config ml.CacheConfig
if cc, ok := backend.(ml.BackendCacheConfig); ok {
config = cc.CacheConfig()
}
c.config = &config
}
if c.config.CachePadding == 0 {
c.config.CachePadding = 1
}
if c.config.MaskDType == ml.DTypeOther {
c.config.MaskDType = ml.DTypeF32
}
if c.swaWindowSize == 0 {
c.swaWindowSize = math.MaxInt32
}
if c.swaMemorySize == 0 {
c.swaMemorySize = c.swaWindowSize
}
// We will allocate space in the cache for the stop token, which won't be part of a follow on
// sequence, so allocate an extra token of storage to ensure that we can jump back without
// causing a cache break. As an optimization, only do this when we have parallel sequences
// because the extra token will live in the batch buffer and won't get overwritten if we
// only have a single sequence.
if c.swaMemorySize != math.MaxInt32 && maxSequences > 1 {
c.swaMemorySize = max(c.swaMemorySize, c.swaWindowSize+1)
}
if int(c.swaMemorySize) >= capacity {
c.swaMemorySize = math.MaxInt32
}
if c.swaMemorySize < c.swaWindowSize {
panic(fmt.Errorf("sliding window memory (%v) must be at least as large as the window (%v)", c.swaMemorySize, c.swaWindowSize))
}
var cacheSize int
if c.swaMemorySize == math.MaxInt32 {
cacheSize = maxSequences * capacity
} else {
cacheSize = (maxSequences * int(c.swaMemorySize)) + maxBatch
}
cacheSize = roundUp(cacheSize, c.config.CachePadding)
c.cells = make([]cacheCell, cacheSize)
c.DType = dtype
c.cellRanges = make(map[int]cellRange)
c.backend = backend
c.maxBatch = maxBatch
}
func (c *Causal) SetConfig(config ml.CacheConfig) {
if c.config != nil {
panic("config cannot be changed after being previously set, either by the model or backend")
}
c.config = &config
}
func (c *Causal) Close() {
for _, ctx := range c.ctxs {
ctx.Close()
}
}
func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
c.curBatchSize = len(batch.Positions)
c.curSequences = batch.Sequences
c.curPositions = batch.Positions
c.opts.Except = nil
var locs []int32
if !reserve {
c.updateSlidingWindow()
var err error
locs, err = c.findLocs()
if err != nil {
return err
}
for i, pos := range batch.Positions {
seq := batch.Sequences[i]
loc := int(locs[i])
c.cells[loc] = cacheCell{pos: pos, sequences: []int{seq}}
seqRange, ok := c.cellRanges[seq]
if !ok {
seqRange = newRange()
}
seqRange.min = min(seqRange.min, loc)
c.curCellRange.min = min(c.curCellRange.min, loc)
seqRange.max = max(seqRange.max, loc)
c.curCellRange.max = max(c.curCellRange.max, loc)
c.cellRanges[seq] = seqRange
}
} else {
// If we are reserving memory, don't update any of the cache metadata but set the size
// to the worst case.
locs = make([]int32, c.curBatchSize)
for i := range locs {
locs[i] = int32(i)
}
c.curCellRange.min = 0
c.curCellRange.max = len(c.cells) - 1
}
c.curLoc = ctx.Input().FromInts(locs, len(locs))
c.curMask = c.buildMask(ctx)
return nil
}
func newRange() cellRange {
return cellRange{
min: math.MaxInt,
max: 0,
}
}
// Returns a slice of locations where each token in the batch should be stored
func (c *Causal) findLocs() ([]int32, error) {
loc := make([]int32, 0, c.curBatchSize)
for i := range c.cells {
if len(c.cells[i].sequences) == 0 {
loc = append(loc, int32(i))
if len(loc) >= c.curBatchSize {
return loc, nil
}
}
}
return nil, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
}
func (c *Causal) updateSlidingWindow() {
c.curCellRange = newRange()
if c.swaMemorySize == math.MaxInt32 {
for _, seq := range c.curSequences {
if seqRange, ok := c.cellRanges[seq]; ok {
c.curCellRange.min = min(c.curCellRange.min, seqRange.min)
c.curCellRange.max = max(c.curCellRange.max, seqRange.max)
}
}
return
}
type lowestPosition struct {
pos int32
curBatch bool
}
// create a map of unique sequences to the lowest position in that sequence
lowestPos := make(map[int]lowestPosition)
for i := range c.curPositions {
seq := c.curSequences[i]
lowest, ok := lowestPos[seq]
if !ok {
lowest = lowestPosition{pos: c.curPositions[i], curBatch: true}
} else if c.curPositions[i] < lowest.pos {
lowest.pos = c.curPositions[i]
}
lowestPos[seq] = lowest
}
// for any sequences are not part of this batch, clean up any tokens
// that are no longer needed after the processing of the previous
// batch
for seq, seqRange := range c.cellRanges {
if _, ok := lowestPos[seq]; !ok {
var last int32
for i := seqRange.min; i <= seqRange.max; i++ {
if slices.Contains(c.cells[i].sequences, seq) {
last = max(last, c.cells[i].pos)
}
}
lowestPos[seq] = lowestPosition{pos: last + 1, curBatch: false}
}
}
// delete any entries that are beyond the window of the oldest position in the sequence
for seq, lowest := range lowestPos {
oldRange, ok := c.cellRanges[seq]
if !ok {
continue
}
newRange := newRange()
for i := oldRange.min; i <= oldRange.max; i++ {
if slices.Contains(c.cells[i].sequences, seq) {
if c.cells[i].pos < lowest.pos-c.swaMemorySize {
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
} else {
newRange.min = min(newRange.min, i)
newRange.max = max(newRange.max, i)
}
if lowest.curBatch && c.cells[i].pos >= lowest.pos-c.swaWindowSize {
c.curCellRange.min = min(c.curCellRange.min, i)
c.curCellRange.max = max(c.curCellRange.max, i)
}
}
}
c.cellRanges[seq] = newRange
}
}
func roundDown(length, pad int) int {
return (length / pad) * pad
}
func roundUp(length, pad int) int {
return ((length + pad - 1) / pad) * pad
}
// Builds a mask of history x batch indicating whether for each token in the batch the
// token in the history should apply. This is based on both the sequence and causality (the
// position of the history is not ahead of the token in the batch).
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
c.curCellRange.min = roundDown(c.curCellRange.min, c.config.CachePadding)
c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
length := c.curCellRange.max - c.curCellRange.min + 1
mask := make([]float32, c.curBatchSize*length)
for i := range c.curBatchSize {
enabled := !slices.Contains(c.opts.Except, i)
for j := c.curCellRange.min; j <= c.curCellRange.max; j++ {
if !slices.Contains(c.cells[j].sequences, c.curSequences[i]) ||
(enabled && c.cells[j].pos > c.curPositions[i]) ||
c.chunkSize > 0 && c.cells[j].pos < c.curPositions[i]-c.curPositions[i]%c.chunkSize ||
c.cells[j].pos < c.curPositions[i]-c.swaWindowSize {
mask[i*length+(j-c.curCellRange.min)] = float32(math.Inf(-1))
}
}
}
maskTensor := ctx.Input().FromFloats(mask, length, c.curBatchSize)
if c.config.MaskDType != ml.DTypeF32 {
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
}
return maskTensor
}
func (c *Causal) SetLayer(layer int) {
c.curLayer = layer
}
type CausalOptions struct {
// Enabled controls whether the causal mask is generated for a particular index in a batch
Except []int
}
// SetCausal disables causal mask generation for a particular range of indicies in
// the current batch for subsequent calls to Get. The state resets for the next forward pass.
func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
if !slices.Equal(c.opts.Except, opts.Except) {
c.opts = opts
if ctx != nil {
c.curMask = c.buildMask(ctx)
}
}
}
func (c *Causal) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
key := c.keys[c.curLayer]
value := c.values[c.curLayer]
kHeadDim := key.Dim(0)
numKVHeads := key.Dim(1)
rowSize := key.Stride(2)
cachedSize := c.curMask.Dim(0)
key = key.View(ctx, rowSize*c.curCellRange.min,
kHeadDim, key.Stride(1),
numKVHeads, key.Stride(2),
cachedSize,
)
if c.config.PermutedV {
vHeadDim := value.Dim(1)
elemSize := value.Stride(0)
value = value.View(ctx, elemSize*c.curCellRange.min,
cachedSize, value.Stride(1),
vHeadDim, value.Stride(2),
numKVHeads,
)
} else {
vHeadDim := value.Dim(0)
rowSize := value.Stride(2)
value = value.View(ctx, rowSize*c.curCellRange.min,
vHeadDim, value.Stride(1),
numKVHeads, value.Stride(2),
cachedSize,
)
}
return key, value, c.curMask
}
func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
kHeadDim := key.Dim(0)
vHeadDim := value.Dim(0)
numKVHeads := key.Dim(1)
batchSize := key.Dim(2)
if c.curBatchSize != batchSize {
panic(fmt.Errorf("inconsistent batch sizes (layer: %v, batch size: %v layer batch size: %v)", c.curLayer, c.curBatchSize, batchSize))
}
if _, ok := c.ctxs[c.curLayer]; !ok {
c.ctxs[c.curLayer] = c.backend.NewContextSize(2).Layer(c.curLayer)
}
if _, ok := c.keys[c.curLayer]; !ok {
c.keys[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, kHeadDim, numKVHeads, len(c.cells))
}
if _, ok := c.values[c.curLayer]; !ok {
if c.config.PermutedV {
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, len(c.cells), vHeadDim, numKVHeads)
} else {
c.values[c.curLayer] = c.ctxs[c.curLayer].Zeros(c.DType, vHeadDim, numKVHeads, len(c.cells))
}
}
key = key.Reshape(ctx, kHeadDim*numKVHeads, batchSize)
keyCache := c.keys[c.curLayer]
keyCache = keyCache.Reshape(ctx, kHeadDim*numKVHeads, len(c.cells))
ctx.Forward(keyCache.SetRows(ctx, key, c.curLoc))
if c.config.PermutedV {
value = value.Reshape(ctx, vHeadDim*numKVHeads, 1, batchSize)
value = value.Permute(ctx, 2, 0, 1, 3)
valueCache := c.values[c.curLayer]
valueCache = valueCache.Reshape(ctx, 1, len(c.cells), vHeadDim*numKVHeads)
ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
} else {
value = value.Reshape(ctx, vHeadDim*numKVHeads, batchSize)
valueCache := c.values[c.curLayer]
valueCache = valueCache.Reshape(ctx, vHeadDim*numKVHeads, len(c.cells))
ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
}
}
func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int32) {
seqRange := newRange()
for i := range c.cells {
// Remove the contents of dstSeq so that we only have the copied prefix, metadata will be reset at the end
if slices.Contains(c.cells[i].sequences, dstSeq) {
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == dstSeq })
}
if slices.Contains(c.cells[i].sequences, srcSeq) && c.cells[i].pos < len {
c.cells[i].sequences = append(c.cells[i].sequences, dstSeq)
if i < seqRange.min {
seqRange.min = i
}
if i > seqRange.max {
seqRange.max = i
}
}
}
c.cellRanges[dstSeq] = seqRange
}
func (c *Causal) CanResume(seq int, pos int32) bool {
if c.swaMemorySize == math.MaxInt32 {
return true
}
seqRange, ok := c.cellRanges[seq]
if !ok {
return false
}
// for sliding window, check that the window of the new sequence is contained in
// the window of what we are storing
var first int32 = math.MaxInt32
var last int32 = -1
for i := seqRange.min; i <= seqRange.max; i++ {
if slices.Contains(c.cells[i].sequences, seq) {
first = min(first, c.cells[i].pos)
last = max(last, c.cells[i].pos)
}
}
if last == -1 {
return false
}
posWindowStart := max(0, pos-c.swaWindowSize)
return posWindowStart >= first && pos <= last+1
}
func (c *Causal) shift(seq int, beginIndex, offset int32) error {
if c.shiftFn == nil {
return ErrNotSupported
}
seqRange := c.cellRanges[seq]
for start := seqRange.min; start <= seqRange.max; start += c.maxBatch {
size := min(seqRange.max-start+1, c.maxBatch)
offsets := make([]int32, size)
var batchFirst, batchLast int
batchFirst = -1
for i := range offsets {
cell := c.cells[start+i]
if slices.Contains(cell.sequences, seq) && cell.pos >= beginIndex {
offsets[i] = offset
if batchFirst < 0 {
batchFirst = i
}
batchLast = i
}
}
if batchFirst < 0 {
continue
}
offsets = offsets[batchFirst : batchLast+1]
ctx := c.backend.NewContext()
kShift := ctx.Input().FromInts(offsets, len(offsets))
for i, key := range c.keys {
if key == nil {
continue
}
kHeadDim := key.Dim(0)
numKVHeads := key.Dim(1)
rowSize := key.Stride(2)
key = key.View(ctx, rowSize*(start+batchFirst),
kHeadDim, key.Stride(1),
numKVHeads, key.Stride(2),
len(offsets),
)
roped, err := c.shiftFn(ctx, i, key, kShift)
if err != nil {
ctx.Close()
return err
}
ctx.Forward(roped.Copy(ctx, key))
}
ctx.Compute()
ctx.Close()
}
return nil
}
func (c *Causal) Remove(seq int, beginIndex, endIndex int32) error {
// TODO(jessegross): We should check to see if removing the middle of the sequence will
// cause the sliding window to encompass tokens that we no longer have. If so, then we
// should return an error, which will trigger the runner to evaluate the full history and
// rebuild the window. However, if we have multimodal inputs in our history, this reuse
// results in use after free, so we don't do it for now.
var offset int32
if endIndex != math.MaxInt32 {
offset = beginIndex - endIndex
}
seqRange := newRange()
for i := range c.cells {
if slices.Contains(c.cells[i].sequences, seq) {
if c.cells[i].pos >= beginIndex && c.cells[i].pos < endIndex {
c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s int) bool { return s == seq })
} else {
if c.cells[i].pos >= endIndex {
if slices.ContainsFunc(c.cells[i].sequences, func(s int) bool { return s != seq }) {
return errors.New("shifting cells shared by multiple sequences not supported")
}
c.cells[i].pos += offset
}
if i < seqRange.min {
seqRange.min = i
}
if i > seqRange.max {
seqRange.max = i
}
}
}
}
if seqRange == newRange() {
delete(c.cellRanges, seq)
return nil
}
c.cellRanges[seq] = seqRange
if endIndex != math.MaxInt32 {
err := c.shift(seq, endIndex+offset, offset)
if err != nil {
return err
}
}
return nil
}

973
kvcache/causal_test.go Normal file
View File

@@ -0,0 +1,973 @@
package kvcache
import (
"fmt"
"math"
"slices"
"testing"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
type testCase struct {
name string
in []float32
inShape []int
seqs []int
pos []int32
expected []float32
expectedShape []int
expectedMask []float32
}
func runPermutedVariants(t *testing.T, fn func(t *testing.T, backend *testBackend)) {
t.Helper()
for _, permuted := range []bool{false, true} {
t.Run(fmt.Sprintf("PermutedV=%t", permuted), func(t *testing.T) {
fn(t, &testBackend{permutedV: permuted})
})
}
}
func TestStore(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
cache := NewCausalCache(nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
tests := []testCase{
{
name: "FirstBatch",
in: []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234},
inShape: []int{2, 3, 4},
seqs: []int{0, 0, 0, 0},
pos: []int32{0, 1, 2, 3},
expected: []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234},
expectedShape: []int{2, 3, 4},
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0},
},
{
name: "SecondBatch",
in: []float32{115, 215, 125, 225, 135, 235},
inShape: []int{2, 3, 1},
seqs: []int{0},
pos: []int32{4},
expected: []float32{111, 211, 121, 221, 131, 231, 112, 212, 122, 222, 132, 232, 113, 213, 123, 223, 133, 233, 114, 214, 124, 224, 134, 234, 115, 215, 125, 225, 135, 235},
expectedShape: []int{2, 3, 5},
expectedMask: []float32{0, 0, 0, 0, 0},
},
}
testCache(t, backend, cache, tests)
})
}
func TestSWA(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
cache := NewSWACache(1, nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
x := float32(math.Inf(-1))
tests := []testCase{
{
name: "FirstBatch",
in: []float32{1, 2, 3, 4},
inShape: []int{1, 1, 4},
seqs: []int{0, 0, 0, 0},
pos: []int32{0, 1, 2, 3},
expected: []float32{1, 2, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{
0, x, x, x,
0, 0, x, x,
x, 0, 0, x,
x, x, 0, 0,
},
},
{
name: "SecondBatch",
in: []float32{5, 6},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{4, 5},
expected: []float32{5, 6, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{
0, x, x, 0,
0, 0, x, x,
},
},
}
testCache(t, backend, cache, tests)
})
}
func TestSWASeparateBatches(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
cache := NewSWACache(1, nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 2, 16, 2)
x := float32(math.Inf(-1))
tests := []testCase{
{
name: "First seq 0",
in: []float32{1, 2},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{0, 1},
expected: []float32{1, 2},
expectedShape: []int{1, 1, 2},
expectedMask: []float32{
0, x,
0, 0,
},
},
{
name: "Second seq 0",
in: []float32{3, 4},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{2, 3},
expected: []float32{2, 3, 4},
expectedShape: []int{1, 1, 3},
expectedMask: []float32{
0, 0, x,
x, 0, 0,
},
},
{
name: "First seq 1",
in: []float32{5, 6},
inShape: []int{1, 1, 2},
seqs: []int{1, 1},
pos: []int32{0, 1},
expected: []float32{5, 6},
expectedShape: []int{1, 1, 2},
expectedMask: []float32{
0, x,
0, 0,
},
},
{
name: "Second seq 1",
in: []float32{7, 8},
inShape: []int{1, 1, 2},
seqs: []int{1, 1},
pos: []int32{2, 3},
expected: []float32{6, 3, 4, 7, 8},
expectedShape: []int{1, 1, 5},
expectedMask: []float32{
0, x, x, 0, x,
x, x, x, 0, 0,
},
},
{
name: "Third seq 0",
in: []float32{9, 10},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{4, 5},
expected: []float32{9, 10, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{
0, x, x, 0,
0, 0, x, x,
},
},
}
testCache(t, backend, cache, tests)
})
}
func TestSWAMem(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
cache := NewSWAMemCache(1, 3, nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
x := float32(math.Inf(-1))
tests := []testCase{
{
name: "FirstBatch",
in: []float32{1, 2, 3, 4},
inShape: []int{1, 1, 4},
seqs: []int{0, 0, 0, 0},
pos: []int32{0, 1, 2, 3},
expected: []float32{1, 2, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{
0, x, x, x,
0, 0, x, x,
x, 0, 0, x,
x, x, 0, 0,
},
},
{
name: "SecondBatch",
in: []float32{5, 6},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{4, 5},
expected: []float32{5, 2, 3, 4, 6},
expectedShape: []int{1, 1, 5},
expectedMask: []float32{
0, x, x, 0, x,
0, x, x, x, 0,
},
},
}
testCache(t, backend, cache, tests)
})
}
func TestChunkedAttention(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
cache := NewChunkedAttentionCache(2, nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
x := float32(math.Inf(-1))
testCache(
t, backend, cache,
[]testCase{
{
name: "FirstBatch",
in: []float32{1, 2, 3, 4},
inShape: []int{1, 1, 4},
seqs: []int{0, 0, 0, 0},
pos: []int32{0, 1, 2, 3},
expected: []float32{1, 2, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{
0, x, x, x,
0, 0, x, x,
x, x, 0, x,
x, x, 0, 0,
},
},
{
name: "SecondBatch",
in: []float32{5, 6, 7},
inShape: []int{1, 1, 3},
seqs: []int{0, 0, 0},
pos: []int32{4, 5, 6},
expected: []float32{1, 2, 3, 4, 5, 6, 7},
expectedShape: []int{1, 1, 7},
expectedMask: []float32{
x, x, x, x, 0, x, x,
x, x, x, x, 0, 0, x,
x, x, x, x, x, x, 0,
},
},
{
name: "ThirdBatch",
in: []float32{8, 9},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{7, 8},
expected: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9},
expectedShape: []int{1, 1, 9},
expectedMask: []float32{
x, x, x, x, x, x, 0, 0, x,
x, x, x, x, x, x, x, x, 0,
},
},
},
)
})
}
func TestSequences(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
cache := NewCausalCache(nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
tests := []testCase{
{
name: "FirstBatch",
in: []float32{1, 2, 3, 4},
inShape: []int{1, 1, 4},
seqs: []int{0, 0, 1, 1},
pos: []int32{0, 1, 0, 1},
expected: []float32{1, 2, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
},
{
name: "SecondBatch",
in: []float32{5, 6},
inShape: []int{1, 1, 2},
seqs: []int{0, 1},
pos: []int32{2, 2},
expected: []float32{1, 2, 3, 4, 5, 6},
expectedShape: []int{1, 1, 6},
expectedMask: []float32{0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), 0},
},
}
testCache(t, backend, cache, tests)
})
}
func TestRemove(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.Add(ctx, shift), nil
})
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
x := float32(math.Inf(-1))
tests := []testCase{
{
name: "FirstBatch",
in: []float32{1, 2, 3, 4},
inShape: []int{1, 1, 4},
seqs: []int{0, 0, 1, 1},
pos: []int32{0, 1, 0, 1},
expected: []float32{1, 2, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{
0, x, x, x,
0, 0, x, x,
x, x, 0, x,
x, x, 0, 0,
},
},
}
testCache(t, backend, cache, tests)
err := cache.Remove(0, 1, math.MaxInt32)
if err != nil {
panic(err)
}
tests = []testCase{
{
name: "RemoveEnd",
in: []float32{5, 6},
inShape: []int{1, 1, 2},
seqs: []int{0, 1},
pos: []int32{1, 2},
expected: []float32{1, 5, 3, 4, 6},
expectedShape: []int{1, 1, 5},
expectedMask: []float32{
0, 0, x, x, x,
x, x, 0, 0, 0,
},
},
}
testCache(t, backend, cache, tests)
err = cache.Remove(0, 0, 1)
if err != nil {
panic(err)
}
tests = []testCase{
{
name: "RemoveMiddle",
in: []float32{7, 8},
inShape: []int{1, 1, 2},
seqs: []int{0, 0},
pos: []int32{1, 2},
expected: []float32{7, 4, 3, 4, 6, 8},
expectedShape: []int{1, 1, 6},
expectedMask: []float32{
0, 0, x, x, x, x,
0, 0, x, x, x, 0,
},
},
}
testCache(t, backend, cache, tests)
})
}
func TestCopy(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { return key, nil })
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
tests := []testCase{
{
name: "FirstBatch",
in: []float32{1, 2, 3, 4},
inShape: []int{1, 1, 4},
seqs: []int{0, 0, 0, 0},
pos: []int32{0, 1, 2, 3},
expected: []float32{1, 2, 3, 4},
expectedShape: []int{1, 1, 4},
expectedMask: []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0},
},
}
testCache(t, backend, cache, tests)
cache.CopyPrefix(0, 1, 2)
tests = []testCase{
{
name: "Copy",
in: []float32{5, 6},
inShape: []int{1, 1, 2},
seqs: []int{1, 1},
pos: []int32{3, 4},
expected: []float32{1, 2, 3, 4, 5, 6},
expectedShape: []int{1, 1, 6},
expectedMask: []float32{0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
},
}
testCache(t, backend, cache, tests)
})
}
func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
context := backend.NewContext()
defer context.Close()
err := cache.StartForward(context, input.Batch{Positions: test.pos, Sequences: test.seqs}, false)
if err != nil {
panic(err)
}
cache.SetLayer(0)
tensor := context.FromFloats(test.in, test.inShape...)
cache.Put(context, tensor, tensor)
out, _, mask := cache.Get(context)
context.Forward(out, mask).Compute(out, mask)
if !slices.Equal(out.Floats(), test.expected) {
t.Errorf("TestCache: have %v; want %v", out.Floats(), test.expected)
}
if !slices.Equal(out.Shape(), test.expectedShape) {
t.Errorf("TestCache: has shape %v; want %v", out.Shape(), test.expectedShape)
}
if !slices.Equal(mask.Floats(), test.expectedMask) {
t.Errorf("TestCache: have mask: have %v want %v", mask.Floats(), test.expectedMask)
}
})
}
}
func TestCanResume(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
windowSize := int32(4)
cache := NewSWACache(windowSize, nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
context := backend.NewContext()
defer context.Close()
err := cache.StartForward(context, input.Batch{
Positions: []int32{0, 1, 2, 3, 4},
Sequences: []int{0, 0, 0, 0, 0},
}, false)
if err != nil {
t.Fatalf("StartForward failed: %v", err)
}
cache.SetLayer(0)
tensor := context.FromFloats([]float32{1, 2, 3, 4, 5}, 1, 1, 5)
cache.Put(context, tensor, tensor)
// with window size 4, nothing has slid out of the window yet
if !cache.CanResume(0, 0) {
t.Errorf("CanResume(0, 0) = false, want true (within window)")
}
if !cache.CanResume(0, 1) {
t.Errorf("CanResume(0, 1) = false, want true (within window)")
}
if !cache.CanResume(0, 2) {
t.Errorf("CanResume(0, 2) = false, want true (within window)")
}
if !cache.CanResume(0, 3) {
t.Errorf("CanResume(0, 3) = false, want true (latest position)")
}
if !cache.CanResume(0, 4) {
t.Errorf("CanResume(0, 4) = false, want true (latest position)")
}
// shift window by adding position 5
err = cache.StartForward(context, input.Batch{
Positions: []int32{5},
Sequences: []int{0},
}, false)
if err != nil {
t.Fatalf("StartForward failed: %v", err)
}
cache.SetLayer(0)
tensor = context.FromFloats([]float32{6}, 1, 1, 1)
cache.Put(context, tensor, tensor)
// only the latest position has overlapping windows
if cache.CanResume(0, 0) {
t.Errorf("after shift: CanResume(0, 0) = true, want false (outside window)")
}
if cache.CanResume(0, 1) {
t.Errorf("after shift: CanResume(0, 1) = true, want false (outside window)")
}
if cache.CanResume(0, 2) {
t.Errorf("after shift: CanResume(0, 2) = true, want false (outside window)")
}
if cache.CanResume(0, 3) {
t.Errorf("after shift: CanResume(0, 3) = true, want false (outside window)")
}
if cache.CanResume(0, 4) {
t.Errorf("after shift: CanResume(0, 4) = true, want false (outside window)")
}
if !cache.CanResume(0, 5) {
t.Errorf("after shift: CanResume(0, 5) = false, want true (latest position)")
}
})
}
func TestCanResumeSWAMem(t *testing.T) {
runPermutedVariants(t, func(t *testing.T, backend *testBackend) {
windowSize := int32(4)
memSize := int32(5)
cache := NewSWAMemCache(windowSize, memSize, nil)
defer cache.Close()
cache.Init(backend, ml.DTypeF16, 1, 16, 16)
context := backend.NewContext()
defer context.Close()
err := cache.StartForward(context, input.Batch{
Positions: []int32{0, 1, 2, 3, 4, 5, 6},
Sequences: []int{0, 0, 0, 0, 0, 0, 0},
}, false)
if err != nil {
t.Fatalf("StartForward failed: %v", err)
}
cache.SetLayer(0)
tensor := context.FromFloats([]float32{1, 2, 3, 4, 5, 6, 7}, 1, 1, 7)
cache.Put(context, tensor, tensor)
// shift window by adding position 7
err = cache.StartForward(context, input.Batch{
Positions: []int32{7},
Sequences: []int{0},
}, false)
if err != nil {
t.Fatalf("StartForward failed: %v", err)
}
cache.SetLayer(0)
tensor = context.FromFloats([]float32{8}, 1, 1, 1)
cache.Put(context, tensor, tensor)
// only the latest position has overlapping windows
if cache.CanResume(0, 0) {
t.Errorf("after shift: CanResume(0, 0) = true, want false (outside window)")
}
if cache.CanResume(0, 1) {
t.Errorf("after shift: CanResume(0, 1) = true, want false (outside window)")
}
if cache.CanResume(0, 2) {
t.Errorf("after shift: CanResume(0, 2) = true, want false (outside window)")
}
if cache.CanResume(0, 3) {
t.Errorf("after shift: CanResume(0, 3) = true, want false (outside window)")
}
if cache.CanResume(0, 4) {
t.Errorf("after shift: CanResume(0, 4) = true, want false (outside window)")
}
if cache.CanResume(0, 5) {
t.Errorf("after shift: CanResume(0, 5) = true, want false (outside window)")
}
if !cache.CanResume(0, 6) {
t.Errorf("after shift: CanResume(0, 6) = false, want true (inside window)")
}
if !cache.CanResume(0, 7) {
t.Errorf("after shift: CanResume(0, 7) = false, want true (latest position)")
}
})
}
type testBackend struct {
ml.Backend
permutedV bool
}
func (b *testBackend) NewContext() ml.Context {
return &testContext{}
}
func (b *testBackend) NewContextSize(int) ml.Context {
return &testContext{}
}
func (b *testBackend) CacheConfig() ml.CacheConfig {
return ml.CacheConfig{PermutedV: b.permutedV}
}
type testContext struct {
ml.Context
}
func (c *testContext) Empty(dtype ml.DType, shape ...int) ml.Tensor {
total := 0
if len(shape) > 0 {
total = 1
for _, s := range shape {
total *= s
}
}
return &testTensor{dtype: dtype, elementSize: 4, data: make([]float32, total), shape: shape}
}
func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
return c.Empty(dtype, shape...)
}
func (c *testContext) FromFloats(s []float32, shape ...int) ml.Tensor {
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
copy(t.data, s)
return t
}
func (c *testContext) FromInts(s []int32, shape ...int) ml.Tensor {
f := make([]float32, len(s))
for i := range f {
f[i] = float32(s[i])
}
out := c.FromFloats(f, shape...)
out.(*testTensor).dtype = ml.DTypeI32
return out
}
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
s := make([]float32, 0, int((stop-start)/step))
for i := start; i < stop; i += step {
s = append(s, i)
}
out := c.FromFloats(s, len(s))
out.(*testTensor).dtype = dtype
return out
}
func (c *testContext) Input() ml.Context { return c }
func (c *testContext) Layer(int) ml.Context { return c }
func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
func (c *testContext) Compute(...ml.Tensor) {}
func (c *testContext) Reserve() {}
func (c *testContext) MaxGraphNodes() int {
return 10
}
func (c *testContext) Close() {}
type testTensor struct {
ml.Tensor
dtype ml.DType
elementSize int
data []float32
shape []int
}
func (t *testTensor) Dim(n int) int {
return t.shape[n]
}
func (t *testTensor) Stride(n int) int {
stride := t.elementSize
for i := range n {
stride *= t.shape[i]
}
return stride
}
func (t *testTensor) Shape() []int {
return t.shape
}
func (t *testTensor) DType() ml.DType {
return t.dtype
}
func (t *testTensor) Floats() []float32 {
out := make([]float32, len(t.data))
copy(out, t.data)
return out
}
func (t *testTensor) Neg(ctx ml.Context) ml.Tensor {
out := ctx.Empty(t.DType(), t.Shape()...).(*testTensor)
for i := range out.data {
out.data[i] = -t.data[i]
}
return out
}
func (t *testTensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
out := ctx.Empty(t.DType(), t.Shape()...).(*testTensor)
for i := range out.data {
out.data[i] = t.data[i] + t2.(*testTensor).data[i]
}
return out
}
func (t *testTensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
return &testTensor{
dtype: t.dtype,
elementSize: t.elementSize,
data: t.data,
shape: shape,
}
}
func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
offset /= t.elementSize
var s []int
switch len(shape) {
case 1:
s = []int{shape[0]}
case 3:
s = []int{shape[0], shape[2]}
case 5:
s = []int{shape[0], shape[2], shape[4]}
default:
panic("unsupported number of dimensions")
}
context := &testContext{}
view := context.Empty(t.dtype, s...).(*testTensor)
view.data = t.data[offset : offset+len(view.data)]
return view
}
func (t *testTensor) Permute(ctx ml.Context, order ...int) ml.Tensor {
if len(t.shape) > 4 || len(order) > 4 {
panic("permute only supports up to 4 dimensions")
}
if len(order) != len(t.shape) && len(order) != 4 {
panic("invalid number of dimensions for permute")
}
// ggml_permute expects 4 axes, so fill in any missing dimensions.
orderFull := append(make([]int, 0, 4), order...)
for len(orderFull) < 4 {
orderFull = append(orderFull, len(orderFull))
}
seen := [4]bool{}
shape4 := [4]int{1, 1, 1, 1}
for i := 0; i < len(t.shape) && i < 4; i++ {
shape4[i] = t.shape[i]
}
newShape4 := [4]int{1, 1, 1, 1}
for axis := range 4 {
dst := orderFull[axis]
if dst < 0 || dst >= 4 {
panic("invalid axis for permute")
}
if seen[dst] {
panic("duplicate axis for permute")
}
seen[dst] = true
newShape4[dst] = shape4[axis]
}
total := len(t.data)
newData := make([]float32, total)
if total > 0 {
oldDims := shape4
newDims := newShape4
oldStride := [4]int{1, 1, 1, 1}
newStride := [4]int{1, 1, 1, 1}
for i := 1; i < 4; i++ {
oldStride[i] = oldStride[i-1] * oldDims[i-1]
newStride[i] = newStride[i-1] * newDims[i-1]
}
var coords [4]int
var newCoords [4]int
for idx := range total {
remainder := idx
for axis := range 4 {
dim := oldDims[axis]
if dim == 0 {
coords[axis] = 0
continue
}
coords[axis] = remainder % dim
remainder /= dim
}
for axis := range 4 {
newCoords[orderFull[axis]] = coords[axis]
}
newIndex := 0
for axis := range 4 {
if newDims[axis] == 0 {
continue
}
newIndex += newCoords[axis] * newStride[axis]
}
newData[newIndex] = t.data[idx]
}
}
numDims := 4
for numDims > 1 && newShape4[numDims-1] <= 1 {
numDims--
}
newShape := make([]int, numDims)
copy(newShape, newShape4[:numDims])
return &testTensor{
dtype: t.dtype,
elementSize: t.elementSize,
data: newData,
shape: newShape,
}
}
func (t *testTensor) SetRows(ctx ml.Context, src ml.Tensor, idxs ml.Tensor) ml.Tensor {
dst := t
srcTensor := src.(*testTensor)
idxTensor := idxs.(*testTensor)
shapeTo4D := func(shape []int) [4]int {
out := [4]int{1, 1, 1, 1}
for i := 0; i < len(shape) && i < 4; i++ {
out[i] = shape[i]
}
return out
}
computeStrides := func(shape [4]int) [4]int {
out := [4]int{1, 1, 1, 1}
for i := 1; i < 4; i++ {
out[i] = out[i-1] * shape[i-1]
}
return out
}
dstShape4D := shapeTo4D(dst.shape)
srcShape4D := shapeTo4D(srcTensor.shape)
idxShape4D := shapeTo4D(idxTensor.shape)
if dstShape4D[0] != srcShape4D[0] || dstShape4D[2] != srcShape4D[2] || dstShape4D[3] != srcShape4D[3] {
panic("SetRows requires matching tensor shapes")
}
if srcShape4D[1] != idxShape4D[0] {
panic("SetRows rows/index mismatch")
}
if srcShape4D[2]%idxShape4D[1] != 0 || srcShape4D[3]%idxShape4D[2] != 0 {
panic("SetRows cannot broadcast indices")
}
if idxShape4D[3] != 1 {
panic("SetRows expects 1D or 2D index tensors")
}
dstStride := computeStrides(dstShape4D)
srcStride := computeStrides(srcShape4D)
idxStride := computeStrides(idxShape4D)
numColumns := srcShape4D[0]
numRows := srcShape4D[1]
for dim3Index := range dstShape4D[3] {
for dim2Index := range dstShape4D[2] {
idxDim2 := 0
idxDim3 := 0
if idxShape4D[1] > 0 {
idxDim2 = dim2Index % idxShape4D[1]
}
if idxShape4D[2] > 0 {
idxDim3 = dim3Index % idxShape4D[2]
}
idxBase := idxDim3*idxStride[2] + idxDim2*idxStride[1]
srcBase := dim3Index*srcStride[3] + dim2Index*srcStride[2]
dstBase := dim3Index*dstStride[3] + dim2Index*dstStride[2]
for row := range numRows {
idx := int(idxTensor.data[idxBase+row*idxStride[0]])
if idx < 0 || idx >= dstShape4D[1] {
panic("SetRows index out of range")
}
srcOffset := srcBase + row*srcStride[1]
dstOffset := dstBase + idx*dstStride[1]
copy(dst.data[dstOffset:dstOffset+numColumns], srcTensor.data[srcOffset:srcOffset+numColumns])
}
}
}
return dst
}
func (t *testTensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
copy(t2.(*testTensor).data, t.data)
return nil
}

156
kvcache/encoder.go Normal file
View File

@@ -0,0 +1,156 @@
package kvcache
import (
"fmt"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
// Encoder cache stores K and V tensors that are position independent
//
// The tensors can be of any shape and will be returned as they were stored
// The mask is currently always nil
//
// Not currently safe for multiple sequences
type EncoderCache struct {
// config controls mostly backend-specific optimizations
config *ml.CacheConfig
// ** current forward pass **
// the active layer for Get and Put
curLayer int
// if something is stored during this pass, this
// will be the position (but there is no guarantee
// anything will be stored)
curPos int32
// curReserve indicates that this forward pass is only for
// memory reservation and we should not update our metadata
// based on it.
curReserve bool
// ** cache metadata **
// was something stored in the cache?
encoderCached bool
// position of the cached data
encoderPos int32
// ** cache data storage **
backend ml.Backend
ctxs map[int]ml.Context
keys, values map[int]ml.Tensor
}
func NewEncoderCache() *EncoderCache {
return &EncoderCache{
ctxs: make(map[int]ml.Context),
keys: make(map[int]ml.Tensor),
values: make(map[int]ml.Tensor),
}
}
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
if c.config == nil {
var config ml.CacheConfig
if cc, ok := backend.(ml.BackendCacheConfig); ok {
config = cc.CacheConfig()
}
c.config = &config
}
if maxSequences > 1 {
panic(fmt.Errorf("encoder cache does not support multiple sequences; requested: %v", maxSequences))
}
if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
}
c.backend = backend
}
func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
if c.config != nil {
panic("config cannot be changed after being previously set, either by the model or backend")
}
c.config = &config
}
func (c *EncoderCache) Close() {
for _, ctx := range c.ctxs {
ctx.Close()
}
}
func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
// We work with the most recent image
if len(batch.Multimodal) > 0 {
c.curPos = batch.Positions[batch.Multimodal[len(batch.Multimodal)-1].Index]
}
c.curReserve = reserve
return nil
}
func (c *EncoderCache) SetLayer(layer int) {
c.curLayer = layer
}
func (c *EncoderCache) EncoderCached() bool {
return c.encoderCached
}
func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.keys[c.curLayer], c.values[c.curLayer], nil
}
func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
if !c.curReserve {
c.encoderPos = c.curPos
c.encoderCached = true
}
if c.config.PermutedV {
value = value.Permute(ctx, 1, 2, 0, 3)
}
if _, ok := c.ctxs[c.curLayer]; !ok {
c.ctxs[c.curLayer] = c.backend.NewContextSize(2).Layer(c.curLayer)
}
if _, ok := c.keys[c.curLayer]; !ok {
c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
}
if _, ok := c.values[c.curLayer]; !ok {
c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
}
ctx.Forward(
key.Copy(ctx, c.keys[c.curLayer]),
value.Copy(ctx, c.values[c.curLayer]),
)
}
func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
panic("encoder cache does not support multiple sequences")
}
func (c *EncoderCache) CanResume(seq int, pos int32) bool {
return true
}
func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
if c.encoderPos >= beginIndex && c.encoderPos < endIndex {
c.encoderCached = false
}
return nil
}

752
kvcache/recurrent.go Normal file
View File

@@ -0,0 +1,752 @@
package kvcache
import (
"errors"
"fmt"
"math"
"slices"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
const (
DefaultCheckpointCount = 24
DefaultCheckpointMinPos = int32(16)
DefaultCheckpointInterval = int32(1664)
)
var ErrInvalidRecurrentShape = errors.New("kvcache: invalid recurrent state shape")
// Config configures a shared hybrid recurrent cache.
type RecurrentConfig struct {
Shift func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)
ConvDim int
ConvChannels int
RecurrentStateSize int
CheckpointLogPrefix string
}
var (
_ Cache = (*Recurrent)(nil)
_ CheckpointCache = (*Recurrent)(nil)
)
// Cache stores:
// - a standard causal KV cache
// - per-sequence conv state for recurrent operators
// - per-sequence recurrent state for recurrent operators
//
// Conv state shape (per layer, per sequence): [convDim, convChannels]
// Recurrent state shape (per layer, per sequence): [recurrentStateSize]
type Recurrent struct {
kv *Causal
backend ml.Backend
dtype ml.DType
maxSequences int
// Conv state dimensions
convDim int
convChannels int
// Recurrent state dimensions
recurrentStateSize int
logPrefix string
// slot mapping for recurrent state (copy-on-write)
slotForSeq map[int]int
refCount []int
freeSlots []int
seqCounts map[int]int
slotScratch [1]int32
// per-layer conv state buffers (allocated lazily)
convCtxs map[int]ml.Context
convStates map[int]ml.Tensor // [convDim*convChannels, maxSlots]
// per-layer recurrent state buffers (allocated lazily)
recurrentCtxs map[int]ml.Context
recurrentStates map[int]ml.Tensor // [recurrentStateSize, maxSlots]
// recurrent checkpoints (per slot)
checkpointCount int
checkpointMinPos int32
checkpointInterval int32
checkpointCtxSize int
checkpoints map[int]*slotCheckpointStore
pendingRestore map[int]checkpointRestore
curCheckpointPos []int32
curCheckpointSlots map[int]int
reserveCheckpoints bool
checkpointConvCtxs map[int]ml.Context
checkpointRecurCtxs map[int]ml.Context
checkpointReserved map[int]struct{}
// current forward batch (derived in StartForward)
curSeqs []int
curSlots []int
curSlotsInput ml.Tensor
curSeqTokens int
// track if EnsureWritable has been called for this forward pass
writableEnsured bool
writableError error
}
func NewRecurrentCache(config RecurrentConfig) *Recurrent {
return &Recurrent{
kv: NewCausalCache(config.Shift),
convDim: config.ConvDim,
convChannels: config.ConvChannels,
recurrentStateSize: config.RecurrentStateSize,
logPrefix: config.CheckpointLogPrefix,
slotForSeq: make(map[int]int),
seqCounts: make(map[int]int),
convCtxs: make(map[int]ml.Context),
convStates: make(map[int]ml.Tensor),
recurrentCtxs: make(map[int]ml.Context),
recurrentStates: make(map[int]ml.Tensor),
checkpointCount: DefaultCheckpointCount,
checkpointMinPos: DefaultCheckpointMinPos,
checkpointInterval: DefaultCheckpointInterval,
checkpoints: make(map[int]*slotCheckpointStore),
pendingRestore: make(map[int]checkpointRestore),
curCheckpointSlots: make(map[int]int),
checkpointConvCtxs: make(map[int]ml.Context),
checkpointRecurCtxs: make(map[int]ml.Context),
checkpointReserved: make(map[int]struct{}),
}
}
func (c *Recurrent) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
c.backend = backend
c.dtype = dtype
c.maxSequences = maxSequences
c.checkpoints = make(map[int]*slotCheckpointStore)
c.pendingRestore = make(map[int]checkpointRestore)
c.curCheckpointPos = c.curCheckpointPos[:0]
c.curCheckpointSlots = make(map[int]int)
c.checkpointReserved = make(map[int]struct{})
c.checkpointCtxSize = c.checkpointCount * c.maxSequences
if c.checkpointCtxSize < 8 {
c.checkpointCtxSize = 8
}
// initialize slot allocator
c.refCount = make([]int, maxSequences)
c.freeSlots = c.freeSlots[:0]
for i := maxSequences - 1; i >= 0; i-- {
c.freeSlots = append(c.freeSlots, i)
}
c.kv.Init(backend, dtype, maxSequences, capacity, maxBatch)
}
func (c *Recurrent) Close() {
for _, ctx := range c.convCtxs {
ctx.Close()
}
for _, ctx := range c.recurrentCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointConvCtxs {
ctx.Close()
}
for _, ctx := range c.checkpointRecurCtxs {
ctx.Close()
}
c.kv.Close()
}
func (c *Recurrent) SetConfig(config ml.CacheConfig) {
c.kv.SetConfig(config)
}
func (c *Recurrent) SetLayer(layer int) {
c.kv.SetLayer(layer)
}
func (c *Recurrent) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.kv.Get(ctx)
}
func (c *Recurrent) Put(ctx ml.Context, key, value ml.Tensor) {
c.kv.Put(ctx, key, value)
}
func (c *Recurrent) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
if err := c.kv.StartForward(ctx, batch, reserve); err != nil {
return err
}
nTokens := len(batch.Sequences)
if nTokens == 0 {
c.curSeqs = c.curSeqs[:0]
c.curSlots = c.curSlots[:0]
c.curSlotsInput = nil
c.curSeqTokens = 0
c.reserveCheckpoints = false
c.writableEnsured = false
c.writableError = nil
return nil
}
// Fast path for single-sequence batches (common during decode and prefill).
firstSeq := batch.Sequences[0]
singleSeq := true
for _, s := range batch.Sequences[1:] {
if s != firstSeq {
singleSeq = false
break
}
}
if singleSeq {
return c.startForwardSingleSeq(ctx, firstSeq, nTokens, batch, reserve)
}
// Derive equal-length sequence layout for recurrent layers.
seqCounts := c.seqCounts
for s := range seqCounts {
delete(seqCounts, s)
}
c.curSeqs = c.curSeqs[:0]
for _, s := range batch.Sequences {
if seqCounts[s] == 0 {
c.curSeqs = append(c.curSeqs, s)
}
seqCounts[s]++
}
nSeqs := len(c.curSeqs)
want := nTokens / nSeqs
for _, s := range c.curSeqs {
if seqCounts[s] != want {
return ErrNotSupported
}
}
c.curSeqTokens = want
if reserve {
c.curSlots = c.curSlots[:0]
for i := range nSeqs {
c.curSlots = append(c.curSlots, i)
}
c.finalizeStartForward(ctx, batch, true)
return nil
}
// Ensure slots exist for sequences in this batch.
c.curSlots = c.curSlots[:0]
var newSlots []int
for _, s := range c.curSeqs {
slot, ok := c.slotForSeq[s]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[s] = slot
c.refCount[slot] = 1
newSlots = append(newSlots, slot)
}
c.curSlots = append(c.curSlots, slot)
}
if len(newSlots) > 0 {
c.zeroSlots(ctx, newSlots)
}
c.finalizeStartForward(ctx, batch, false)
return nil
}
func (c *Recurrent) startForwardSingleSeq(ctx ml.Context, seq, seqTokens int, batch input.Batch, reserve bool) error {
c.curSeqs = append(c.curSeqs[:0], seq)
c.curSeqTokens = seqTokens
if reserve {
c.curSlots = append(c.curSlots[:0], 0)
c.finalizeStartForward(ctx, batch, true)
return nil
}
slot, ok := c.slotForSeq[seq]
if !ok {
var err error
slot, err = c.allocSlot()
if err != nil {
return err
}
c.slotForSeq[seq] = slot
c.refCount[slot] = 1
slotList := [1]int{slot}
c.zeroSlots(ctx, slotList[:])
}
c.curSlots = append(c.curSlots[:0], slot)
c.finalizeStartForward(ctx, batch, false)
return nil
}
func (c *Recurrent) finalizeStartForward(ctx ml.Context, batch input.Batch, reserve bool) {
c.setCurSlotsInput(ctx)
c.writableEnsured = false
c.writableError = nil
c.reserveCheckpoints = reserve
c.planCheckpoints(batch)
}
func (c *Recurrent) setCurSlotsInput(ctx ml.Context) {
c.curSlotsInput = c.slotsInput(ctx, c.curSlots)
}
func (c *Recurrent) slotsInput(ctx ml.Context, slots []int) ml.Tensor {
switch len(slots) {
case 0:
return nil
case 1:
c.slotScratch[0] = int32(slots[0])
return ctx.Input().FromInts(c.slotScratch[:], 1)
default:
slotIndices := make([]int32, len(slots))
for i, v := range slots {
slotIndices[i] = int32(v)
}
return ctx.Input().FromInts(slotIndices, len(slotIndices))
}
}
func (c *Recurrent) allocSlot() (int, error) {
if len(c.freeSlots) == 0 {
return 0, ErrKvCacheFull
}
slot := c.freeSlots[len(c.freeSlots)-1]
c.freeSlots = c.freeSlots[:len(c.freeSlots)-1]
return slot, nil
}
func (c *Recurrent) freeSlot(slot int) {
if slot >= 0 && slot < c.maxSequences {
c.freeSlots = append(c.freeSlots, slot)
}
}
// zeroSlots zeros recurrent state for the given slots across all cached layers.
func (c *Recurrent) zeroSlots(ctx ml.Context, slots []int) {
if len(slots) == 0 {
return
}
inputCtx := ctx.Input()
slotsTensor := c.slotsInput(ctx, slots)
if len(c.convStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.convDim*c.convChannels, len(slots))
for _, buf := range c.convStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
if len(c.recurrentStates) > 0 {
zeros := inputCtx.Zeros(ml.DTypeF32, c.recurrentStateSize, len(slots))
for _, buf := range c.recurrentStates {
ctx.Forward(buf.SetRows(ctx, zeros, slotsTensor))
}
}
}
// EnsureWritable ensures sequences have private slots (copy-on-write).
func (c *Recurrent) EnsureWritable(ctx ml.Context) error {
for i, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot < 0 || slot >= len(c.refCount) {
continue
}
if c.refCount[slot] <= 1 {
continue
}
newSlot, err := c.allocSlot()
if err != nil {
return err
}
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
c.curSlots[i] = newSlot
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
}
c.setCurSlotsInput(ctx)
return nil
}
func (c *Recurrent) copyRecurrentState(ctx ml.Context, srcSlot, dstSlot int) {
src := ctx.Input().FromInts([]int32{int32(srcSlot)}, 1)
dst := ctx.Input().FromInts([]int32{int32(dstSlot)}, 1)
for _, buf := range c.convStates {
rows := buf.Rows(ctx, src)
if rows.DType() != ml.DTypeF32 {
rows = rows.Cast(ctx, ml.DTypeF32)
}
ctx.Forward(buf.SetRows(ctx, rows, dst))
}
for _, buf := range c.recurrentStates {
rows := buf.Rows(ctx, src)
if rows.DType() != ml.DTypeF32 {
rows = rows.Cast(ctx, ml.DTypeF32)
}
ctx.Forward(buf.SetRows(ctx, rows, dst))
}
}
func (c *Recurrent) CopyPrefix(srcSeq, dstSeq int, prefixLen int32) {
c.kv.CopyPrefix(srcSeq, dstSeq, prefixLen)
if dstSlot, ok := c.slotForSeq[dstSeq]; ok {
if c.validSlot(dstSlot) {
c.refCount[dstSlot]--
if c.refCount[dstSlot] <= 0 {
c.refCount[dstSlot] = 0
c.freeSlot(dstSlot)
}
}
delete(c.slotForSeq, dstSeq)
}
srcSlot, ok := c.slotForSeq[srcSeq]
if !ok {
return
}
if c.validSlot(srcSlot) {
c.slotForSeq[dstSeq] = srcSlot
c.refCount[srcSlot]++
}
}
func (c *Recurrent) CanResume(seq int, pos int32) bool {
if !c.kv.CanResume(seq, pos) {
return false
}
if pos == 0 {
return true
}
return c.hasCheckpoint(seq, pos)
}
func (c *Recurrent) Remove(seq int, beginIndex, endIndex int32) error {
if beginIndex > 0 && endIndex != math.MaxInt32 {
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
delete(c.pendingRestore, seq)
slot, ok := c.slotForSeq[seq]
if !ok || !c.validSlot(slot) {
return nil
}
// Detach shared recurrent state/checkpoints before mutating checkpoint positions.
if c.refCount[slot] > 1 {
newSlot, err := c.allocSlot()
if err != nil {
return err
}
ctx := c.backend.NewContext()
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
if len(c.convStates) > 0 || len(c.recurrentStates) > 0 {
ctx.Compute()
}
ctx.Close()
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
slot = newSlot
}
c.shiftCheckpoints(slot, beginIndex, endIndex)
return nil
}
if beginIndex > 0 {
restore, ok := c.pendingRestore[seq]
if !ok || restore.pos+1 != beginIndex {
return ErrNotSupported
}
if !c.restoreComplete(restore) {
return ErrNotSupported
}
if slot, ok := c.slotForSeq[seq]; ok && c.validSlot(slot) && c.refCount[slot] > 1 {
newSlot, err := c.allocSlot()
if err != nil {
return err
}
ctx := c.backend.NewContext()
c.copyRecurrentState(ctx, slot, newSlot)
c.copyCheckpoints(ctx, slot, newSlot)
if len(c.convStates) > 0 || len(c.recurrentStates) > 0 {
ctx.Compute()
}
ctx.Close()
c.refCount[slot]--
c.refCount[newSlot] = 1
c.slotForSeq[seq] = newSlot
restore.slot = newSlot
c.pendingRestore[seq] = restore
}
}
if err := c.kv.Remove(seq, beginIndex, endIndex); err != nil {
return err
}
if beginIndex > 0 {
restore := c.pendingRestore[seq]
delete(c.pendingRestore, seq)
return c.applyCheckpointRestore(restore)
}
slot, ok := c.slotForSeq[seq]
delete(c.pendingRestore, seq)
if !ok {
return nil
}
if !c.validSlot(slot) {
delete(c.slotForSeq, seq)
return nil
}
c.refCount[slot]--
if c.refCount[slot] <= 0 {
c.refCount[slot] = 0
c.clearCheckpoints(slot)
c.freeSlot(slot)
}
delete(c.slotForSeq, seq)
return nil
}
func (c *Recurrent) validSlot(slot int) bool {
return slot >= 0 && slot < len(c.refCount)
}
func (c *Recurrent) SlotsTensor() ml.Tensor {
return c.curSlotsInput
}
// contiguousSlots returns the starting slot if current slots are contiguous and ordered.
func (c *Recurrent) contiguousSlots() (int, bool) {
if len(c.curSlots) == 0 {
return 0, false
}
start := c.curSlots[0]
for i, s := range c.curSlots {
if s != start+i {
return 0, false
}
}
return start, true
}
func (c *Recurrent) SeqTokens() int {
return c.curSeqTokens
}
func (c *Recurrent) NumSeqs() int {
return len(c.curSeqs)
}
func (c *Recurrent) convBuffer(layer int) ml.Tensor {
if buf, ok := c.convStates[layer]; ok {
return buf
}
if _, ok := c.convCtxs[layer]; !ok {
c.convCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
buf := c.convCtxs[layer].Zeros(ml.DTypeF32, c.convDim*c.convChannels, c.maxSequences)
c.convStates[layer] = buf
return buf
}
func (c *Recurrent) recurrentBuffer(layer int) ml.Tensor {
if buf, ok := c.recurrentStates[layer]; ok {
return buf
}
if _, ok := c.recurrentCtxs[layer]; !ok {
c.recurrentCtxs[layer] = c.backend.NewContextSize(1).Layer(layer)
}
buf := c.recurrentCtxs[layer].Zeros(ml.DTypeF32, c.recurrentStateSize, c.maxSequences)
c.recurrentStates[layer] = buf
return buf
}
func (c *Recurrent) ensureWritable(ctx ml.Context) error {
c.ensureWritableOnce(ctx)
return c.writableError
}
func (c *Recurrent) currentSlotRows(ctx ml.Context, buf ml.Tensor, rowSize int) ml.Tensor {
if start, ok := c.contiguousSlots(); ok {
offset := start * buf.Stride(1)
return buf.View(ctx, offset, rowSize, buf.Stride(1), c.NumSeqs())
}
return buf.Rows(ctx, c.SlotsTensor())
}
func (c *Recurrent) writeCurrentSlotRows(ctx ml.Context, buf ml.Tensor, rowSize int, src ml.Tensor) {
if start, ok := c.contiguousSlots(); ok {
offset := start * buf.Stride(1)
view := buf.View(ctx, offset, rowSize, buf.Stride(1), c.NumSeqs())
ctx.Forward(src.Copy(ctx, view))
return
}
ctx.Forward(buf.SetRows(ctx, src, c.SlotsTensor()))
}
func (c *Recurrent) ensureWritableOnce(ctx ml.Context) {
if !c.writableEnsured {
needsWritable := false
for _, seq := range c.curSeqs {
slot, ok := c.slotForSeq[seq]
if !ok {
continue
}
if slot >= 0 && slot < len(c.refCount) && c.refCount[slot] > 1 {
needsWritable = true
break
}
}
if needsWritable {
if err := c.EnsureWritable(ctx); err != nil {
c.writableError = err
}
}
c.writableEnsured = true
}
}
// ConvState returns conv state for current batch sequences as [convDim, convChannels, nSeqs].
func (c *Recurrent) ConvState(ctx ml.Context, layer int) (ml.Tensor, error) {
if err := c.ensureWritable(ctx); err != nil {
return nil, err
}
buf := c.convBuffer(layer)
cur := c.currentSlotRows(ctx, buf, c.convDim*c.convChannels)
return cur.Reshape(ctx, c.convDim, c.convChannels, c.NumSeqs()), nil
}
// UpdateConvState writes new conv state for current batch sequences.
func (c *Recurrent) UpdateConvState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.convBuffer(layer)
src := newState.Reshape(ctx, c.convDim*c.convChannels, c.NumSeqs())
srcF32 := src
if src.DType() != ml.DTypeF32 {
srcF32 = src.Cast(ctx, ml.DTypeF32)
}
c.writeCurrentSlotRows(ctx, buf, c.convDim*c.convChannels, srcF32)
c.captureConvCheckpoint(ctx, layer, srcF32)
}
// RecurrentState returns recurrent state for current batch sequences with shape [dims..., nSeqs].
func (c *Recurrent) RecurrentState(ctx ml.Context, layer int, dims ...int) (ml.Tensor, error) {
if err := c.ensureWritable(ctx); err != nil {
return nil, err
}
if len(dims) == 0 {
return nil, ErrInvalidRecurrentShape
}
size := 1
for _, d := range dims {
if d <= 0 {
return nil, ErrInvalidRecurrentShape
}
size *= d
}
if size != c.recurrentStateSize {
return nil, fmt.Errorf("%w: got %v (size %d), want size %d", ErrInvalidRecurrentShape, dims, size, c.recurrentStateSize)
}
buf := c.recurrentBuffer(layer)
cur := c.currentSlotRows(ctx, buf, c.recurrentStateSize)
shape := make([]int, 0, len(dims)+1)
shape = append(shape, dims...)
shape = append(shape, c.NumSeqs())
return cur.Reshape(ctx, shape...), nil
}
// RecurrentState4D returns recurrent state as [dim0, dim1, dim2, nSeqs].
func (c *Recurrent) RecurrentState4D(ctx ml.Context, layer int, dim0, dim1, dim2 int) (ml.Tensor, error) {
if err := c.ensureWritable(ctx); err != nil {
return nil, err
}
if dim0 <= 0 || dim1 <= 0 || dim2 <= 0 {
return nil, ErrInvalidRecurrentShape
}
size := dim0 * dim1 * dim2
if size != c.recurrentStateSize {
return nil, fmt.Errorf("%w: got [%d %d %d] (size %d), want size %d", ErrInvalidRecurrentShape, dim0, dim1, dim2, size, c.recurrentStateSize)
}
buf := c.recurrentBuffer(layer)
cur := c.currentSlotRows(ctx, buf, c.recurrentStateSize)
return cur.Reshape(ctx, dim0, dim1, dim2, c.NumSeqs()), nil
}
// UpdateRecurrentState writes new recurrent state for current batch sequences.
func (c *Recurrent) UpdateRecurrentState(ctx ml.Context, layer int, newState ml.Tensor) {
buf := c.recurrentBuffer(layer)
src := newState.Reshape(ctx, c.recurrentStateSize, c.NumSeqs())
srcF32 := src
if src.DType() != ml.DTypeF32 {
srcF32 = src.Cast(ctx, ml.DTypeF32)
}
c.writeCurrentSlotRows(ctx, buf, c.recurrentStateSize, srcF32)
c.captureRecurrentCheckpoint(ctx, layer, srcF32)
}
// IsSupportedForBatch returns true if the current batch layout supports recurrent layers.
func (c *Recurrent) IsSupportedForBatch() bool {
return c.curSeqTokens > 0 && len(c.curSeqs) > 0
}
// Seqs returns the ordered unique sequences for the current forward pass.
func (c *Recurrent) Seqs() []int {
return slices.Clone(c.curSeqs)
}

View File

@@ -0,0 +1,561 @@
package kvcache
import (
"log/slog"
"math"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
// TODO(jmorganca): Add byte-serialized host-RAM checkpoints to reduce GPU
// memory usage while preserving prefix reuse for recurrent state.
type checkpointEntry struct {
pos int32
conv map[int]ml.Tensor
recurrent map[int]ml.Tensor
}
type slotCheckpointStore struct {
entries []checkpointEntry
size int
next int
lastPos int32
}
type checkpointRestore struct {
slot int
idx int
pos int32
}
func newSlotCheckpointStore(n int) *slotCheckpointStore {
entries := make([]checkpointEntry, n)
for i := range entries {
entries[i].pos = -1
}
return &slotCheckpointStore{
entries: entries,
lastPos: -1,
}
}
func (s *slotCheckpointStore) reset() {
s.size = 0
s.next = 0
s.lastPos = -1
for i := range s.entries {
s.entries[i].pos = -1
}
}
func (s *slotCheckpointStore) record(pos int32) int {
if len(s.entries) == 0 {
return -1
}
idx := s.next
s.next = (s.next + 1) % len(s.entries)
if s.size < len(s.entries) {
s.size++
}
s.entries[idx].pos = pos
s.lastPos = pos
return idx
}
func (s *slotCheckpointStore) bestIndex(targetPos int32) (int, int32, bool) {
bestIdx := -1
bestPos := int32(-1)
for i := range s.entries {
pos := s.entries[i].pos
if pos < 0 || pos >= targetPos {
continue
}
if pos > bestPos {
bestPos = pos
bestIdx = i
}
}
if bestIdx < 0 {
return -1, -1, false
}
return bestIdx, bestPos, true
}
func (s *slotCheckpointStore) pruneAfter(pos int32) {
if len(s.entries) == 0 {
s.size = 0
s.next = 0
s.lastPos = -1
return
}
size := 0
next := -1
minPos := int32(math.MaxInt32)
minIdx := 0
for i := range s.entries {
if s.entries[i].pos > pos {
s.entries[i].pos = -1
}
if s.entries[i].pos >= 0 {
size++
if s.entries[i].pos < minPos {
minPos = s.entries[i].pos
minIdx = i
}
} else if next == -1 {
next = i
}
}
s.size = size
if size == 0 {
s.next = 0
s.lastPos = -1
return
}
if next != -1 {
s.next = next
} else {
// Full ring: overwrite the oldest checkpoint next.
s.next = minIdx
}
s.lastPos = pos
}
func (s *slotCheckpointStore) shiftRange(beginIndex, endIndex int32) {
if len(s.entries) == 0 {
s.size = 0
s.next = 0
s.lastPos = -1
return
}
offset := beginIndex - endIndex
size := 0
next := -1
minPos := int32(math.MaxInt32)
maxPos := int32(-1)
minIdx := 0
for i := range s.entries {
pos := s.entries[i].pos
if pos >= 0 {
if pos >= beginIndex && pos < endIndex {
s.entries[i].pos = -1
} else if pos >= endIndex {
s.entries[i].pos = pos + offset
}
}
pos = s.entries[i].pos
if pos >= 0 {
size++
if pos < minPos {
minPos = pos
minIdx = i
}
if pos > maxPos {
maxPos = pos
}
} else if next == -1 {
next = i
}
}
s.size = size
if size == 0 {
s.next = 0
s.lastPos = -1
return
}
if next != -1 {
s.next = next
} else {
// Full ring: overwrite the oldest checkpoint next.
s.next = minIdx
}
s.lastPos = maxPos
}
func (s *slotCheckpointStore) window() (size int, minPos, maxPos, lastPos int32) {
minPos = int32(math.MaxInt32)
maxPos = int32(-1)
for i := range s.entries {
pos := s.entries[i].pos
if pos < 0 {
continue
}
size++
if pos < minPos {
minPos = pos
}
if pos > maxPos {
maxPos = pos
}
}
if size == 0 {
minPos = -1
maxPos = -1
}
return size, minPos, maxPos, s.lastPos
}
func (c *Recurrent) checkpointTag() string {
if c.logPrefix == "" {
return "kvcache.recurrent"
}
return c.logPrefix
}
func (c *Recurrent) planCheckpoints(batch input.Batch) {
if c.checkpointCount == 0 || len(c.curSeqs) == 0 {
c.curCheckpointPos = c.curCheckpointPos[:0]
for k := range c.curCheckpointSlots {
delete(c.curCheckpointSlots, k)
}
return
}
if cap(c.curCheckpointPos) < len(c.curSeqs) {
c.curCheckpointPos = make([]int32, len(c.curSeqs))
} else {
c.curCheckpointPos = c.curCheckpointPos[:len(c.curSeqs)]
}
for i := range c.curCheckpointPos {
c.curCheckpointPos[i] = -1
}
for k := range c.curCheckpointSlots {
delete(c.curCheckpointSlots, k)
}
posMax := make(map[int]int32, len(c.curSeqs))
for i, seq := range batch.Sequences {
pos := batch.Positions[i]
if cur, ok := posMax[seq]; !ok || pos > cur {
posMax[seq] = pos
}
}
for i, seq := range c.curSeqs {
pos, ok := posMax[seq]
if !ok {
continue
}
if pos < c.checkpointMinPos {
continue
}
slot := c.curSlots[i]
store := c.checkpointStore(slot)
lastPos := store.lastPos
if lastPos < 0 || pos-lastPos >= c.checkpointInterval {
c.curCheckpointPos[i] = pos
}
}
}
func (c *Recurrent) checkpointStore(slot int) *slotCheckpointStore {
store, ok := c.checkpoints[slot]
if ok {
return store
}
store = newSlotCheckpointStore(c.checkpointCount)
c.checkpoints[slot] = store
return store
}
func (c *Recurrent) checkpointIndexForSlot(slot int, pos int32) int {
if c.checkpointCount == 0 {
return -1
}
if idx, ok := c.curCheckpointSlots[slot]; ok {
return idx
}
store := c.checkpointStore(slot)
idx := store.record(pos)
if idx >= 0 {
c.curCheckpointSlots[slot] = idx
}
return idx
}
func (c *Recurrent) hasCheckpoint(seq int, pos int32) bool {
if pos <= 0 {
return false
}
slot, ok := c.slotForSeq[seq]
if !ok {
return false
}
store, ok := c.checkpoints[slot]
if !ok {
return false
}
_, _, ok = store.bestIndex(pos)
return ok
}
func (c *Recurrent) PrepareRestore(seq int, targetPos int32) (int32, bool) {
if targetPos <= 0 {
return 0, false
}
slot, ok := c.slotForSeq[seq]
if !ok {
return 0, false
}
store, ok := c.checkpoints[slot]
if !ok {
slog.Debug(c.checkpointTag()+": checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", 0)
return 0, false
}
idx, pos, ok := store.bestIndex(targetPos)
if !ok {
size, minPos, maxPos, lastPos := store.window()
slog.Debug(c.checkpointTag()+": checkpoint miss", "seq", seq, "slot", slot, "target", targetPos, "size", size,
"min", minPos, "max", maxPos, "last", lastPos)
return 0, false
}
c.pendingRestore[seq] = checkpointRestore{
slot: slot,
idx: idx,
pos: pos,
}
return pos + 1, true
}
func (c *Recurrent) applyCheckpointRestore(restore checkpointRestore) error {
entry, ok := c.restoreEntry(restore)
if !ok {
return ErrNotSupported
}
ctx := c.backend.NewContext()
defer ctx.Close()
slotIdx := ctx.Input().FromInts([]int32{int32(restore.slot)}, 1)
for layer, src := range entry.conv {
buf := c.convBuffer(layer)
ctx.Forward(buf.SetRows(ctx, src, slotIdx))
}
for layer, src := range entry.recurrent {
buf := c.recurrentBuffer(layer)
ctx.Forward(buf.SetRows(ctx, src, slotIdx))
}
if len(entry.conv) > 0 || len(entry.recurrent) > 0 {
ctx.Compute()
}
store := c.checkpoints[restore.slot]
store.pruneAfter(restore.pos)
return nil
}
func (c *Recurrent) restoreComplete(restore checkpointRestore) bool {
_, ok := c.restoreEntry(restore)
return ok
}
func (c *Recurrent) restoreEntry(restore checkpointRestore) (*checkpointEntry, bool) {
store, ok := c.checkpoints[restore.slot]
if !ok || restore.idx < 0 || restore.idx >= len(store.entries) {
return nil, false
}
entry := &store.entries[restore.idx]
if entry.pos < 0 {
return nil, false
}
if !c.entryComplete(entry) {
return nil, false
}
return entry, true
}
func (c *Recurrent) entryComplete(entry *checkpointEntry) bool {
for layer := range c.convStates {
if entry.conv == nil || entry.conv[layer] == nil {
return false
}
}
for layer := range c.recurrentStates {
if entry.recurrent == nil || entry.recurrent[layer] == nil {
return false
}
}
return true
}
func (c *Recurrent) clearCheckpoints(slot int) {
if store, ok := c.checkpoints[slot]; ok {
store.reset()
}
}
func (c *Recurrent) shiftCheckpoints(slot int, beginIndex, endIndex int32) {
if store, ok := c.checkpoints[slot]; ok {
store.shiftRange(beginIndex, endIndex)
}
}
func (c *Recurrent) copyCheckpoints(ctx ml.Context, srcSlot, dstSlot int) {
if c.checkpointCount == 0 {
return
}
srcStore, ok := c.checkpoints[srcSlot]
if !ok || srcStore.size == 0 {
return
}
dstStore := c.checkpointStore(dstSlot)
dstStore.size = srcStore.size
dstStore.next = srcStore.next
dstStore.lastPos = srcStore.lastPos
for i := range srcStore.entries {
srcEntry := &srcStore.entries[i]
dstEntry := &dstStore.entries[i]
dstEntry.pos = srcEntry.pos
if srcEntry.conv != nil {
if dstEntry.conv == nil {
dstEntry.conv = make(map[int]ml.Tensor)
}
for layer, src := range srcEntry.conv {
dst := c.ensureCheckpointConv(layer, dstEntry)
ctx.Forward(src.Copy(ctx, dst))
}
}
if srcEntry.recurrent != nil {
if dstEntry.recurrent == nil {
dstEntry.recurrent = make(map[int]ml.Tensor)
}
for layer, src := range srcEntry.recurrent {
dst := c.ensureCheckpointRecurrent(layer, dstEntry)
ctx.Forward(src.Copy(ctx, dst))
}
}
}
}
func (c *Recurrent) captureConvCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
if c.checkpointCount == 0 {
return
}
if c.reserveCheckpoints {
c.reserveCheckpointConv(layer)
return
}
if len(c.curCheckpointPos) == 0 {
return
}
for i, pos := range c.curCheckpointPos {
if pos < 0 {
continue
}
slot := c.curSlots[i]
idx := c.checkpointIndexForSlot(slot, pos)
if idx < 0 {
continue
}
entry := &c.checkpoints[slot].entries[idx]
dst := c.ensureCheckpointConv(layer, entry)
seqSlice := src.Slice(ctx, 1, i, i+1, 1)
ctx.Forward(seqSlice.Copy(ctx, dst))
}
}
func (c *Recurrent) captureRecurrentCheckpoint(ctx ml.Context, layer int, src ml.Tensor) {
if c.checkpointCount == 0 {
return
}
if c.reserveCheckpoints {
c.reserveCheckpointRecurrent(layer)
return
}
if len(c.curCheckpointPos) == 0 {
return
}
for i, pos := range c.curCheckpointPos {
if pos < 0 {
continue
}
slot := c.curSlots[i]
idx := c.checkpointIndexForSlot(slot, pos)
if idx < 0 {
continue
}
entry := &c.checkpoints[slot].entries[idx]
dst := c.ensureCheckpointRecurrent(layer, entry)
seqSlice := src.Slice(ctx, 1, i, i+1, 1)
ctx.Forward(seqSlice.Copy(ctx, dst))
}
}
func (c *Recurrent) ensureCheckpointConv(layer int, entry *checkpointEntry) ml.Tensor {
if entry.conv == nil {
entry.conv = make(map[int]ml.Tensor)
}
if t, ok := entry.conv[layer]; ok {
return t
}
ctx, ok := c.checkpointConvCtxs[layer]
if !ok {
ctx = c.backend.NewContextSize(c.checkpointCtxSize).Layer(layer)
c.checkpointConvCtxs[layer] = ctx
}
t := ctx.Zeros(ml.DTypeF32, c.convDim*c.convChannels, 1)
entry.conv[layer] = t
return t
}
func (c *Recurrent) ensureCheckpointRecurrent(layer int, entry *checkpointEntry) ml.Tensor {
if entry.recurrent == nil {
entry.recurrent = make(map[int]ml.Tensor)
}
if t, ok := entry.recurrent[layer]; ok {
return t
}
ctx, ok := c.checkpointRecurCtxs[layer]
if !ok {
ctx = c.backend.NewContextSize(c.checkpointCtxSize).Layer(layer)
c.checkpointRecurCtxs[layer] = ctx
}
t := ctx.Zeros(ml.DTypeF32, c.recurrentStateSize, 1)
entry.recurrent[layer] = t
return t
}
func (c *Recurrent) reserveCheckpointConv(layer int) {
key := checkpointReserveKey(layer, 0)
if _, ok := c.checkpointReserved[key]; ok {
return
}
for slot := range c.maxSequences {
store := c.checkpointStore(slot)
for i := range store.entries {
entry := &store.entries[i]
_ = c.ensureCheckpointConv(layer, entry)
}
}
c.checkpointReserved[key] = struct{}{}
}
func (c *Recurrent) reserveCheckpointRecurrent(layer int) {
key := checkpointReserveKey(layer, 1)
if _, ok := c.checkpointReserved[key]; ok {
return
}
for slot := range c.maxSequences {
store := c.checkpointStore(slot)
for i := range store.entries {
entry := &store.entries[i]
_ = c.ensureCheckpointRecurrent(layer, entry)
}
}
c.checkpointReserved[key] = struct{}{}
}
func checkpointReserveKey(layer int, kind int) int {
return layer*2 + kind
}

View File

@@ -0,0 +1,288 @@
package kvcache
import (
"errors"
"math"
"slices"
"testing"
"github.com/ollama/ollama/ml"
)
func newTestCache() *Recurrent {
return NewRecurrentCache(RecurrentConfig{ConvDim: 1, ConvChannels: 2, RecurrentStateSize: 2})
}
func TestSlotCheckpointStoreBestIndex(t *testing.T) {
store := newSlotCheckpointStore(2)
store.record(10)
store.record(20)
_, pos, ok := store.bestIndex(15)
if !ok || pos != 10 {
t.Fatalf("expected best pos 10, got pos=%d ok=%v", pos, ok)
}
store.record(30) // overwrite oldest (10)
if _, _, ok := store.bestIndex(15); ok {
t.Fatalf("expected no checkpoint for targetPos=15 after overwrite")
}
_, pos, ok = store.bestIndex(40)
if !ok || pos != 30 {
t.Fatalf("expected best pos 30, got pos=%d ok=%v", pos, ok)
}
}
func TestCachePrepareRestore(t *testing.T) {
cache := newTestCache()
cache.checkpointCount = 3
cache.checkpoints = make(map[int]*slotCheckpointStore)
cache.pendingRestore = make(map[int]checkpointRestore)
cache.slotForSeq[1] = 0
store := cache.checkpointStore(0)
store.record(5)
store.record(9)
store.record(15)
restorePos, ok := cache.PrepareRestore(1, 12)
if !ok {
t.Fatalf("expected restore ok")
}
if restorePos != 10 {
t.Fatalf("expected restorePos 10, got %d", restorePos)
}
rest, ok := cache.pendingRestore[1]
if !ok {
t.Fatalf("expected pending restore entry")
}
if rest.pos != 9 {
t.Fatalf("expected pending restore pos 9, got %d", rest.pos)
}
}
func TestSlotCheckpointStorePruneAfter(t *testing.T) {
store := newSlotCheckpointStore(3)
store.record(10)
store.record(20)
store.record(30)
store.pruneAfter(20)
if store.lastPos != 20 {
t.Fatalf("expected lastPos 20, got %d", store.lastPos)
}
_, pos, ok := store.bestIndex(25)
if !ok || pos != 20 {
t.Fatalf("expected best pos 20 after prune, got pos=%d ok=%v", pos, ok)
}
_, pos, ok = store.bestIndex(35)
if !ok || pos != 20 {
t.Fatalf("expected pruned best pos 20 for targetPos=35, got pos=%d ok=%v", pos, ok)
}
}
func TestCacheRestoreRejectsIncompleteCheckpoint(t *testing.T) {
cache := newTestCache()
cache.checkpointCount = 3
cache.checkpoints = make(map[int]*slotCheckpointStore)
cache.pendingRestore = make(map[int]checkpointRestore)
cache.slotForSeq[1] = 0
cache.refCount = []int{1}
cache.freeSlots = nil
// Simulate layer 0 requires both conv and recurrent checkpoints.
cache.convStates[0] = nil
cache.recurrentStates[0] = nil
store := cache.checkpointStore(0)
idx := store.record(9)
entry := &store.entries[idx]
entry.conv = map[int]ml.Tensor{0: nil}
// entry.recurrent intentionally missing
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
err := cache.Remove(1, 10, math.MaxInt32)
if !errors.Is(err, ErrNotSupported) {
t.Fatalf("expected ErrNotSupported for incomplete checkpoint, got %v", err)
}
}
func TestCacheRestoreAcceptsCompleteCheckpoint(t *testing.T) {
cache := newTestCache()
cache.checkpointCount = 3
cache.checkpoints = make(map[int]*slotCheckpointStore)
cache.pendingRestore = make(map[int]checkpointRestore)
cache.slotForSeq[1] = 0
cache.refCount = []int{1}
cache.freeSlots = nil
store := cache.checkpointStore(0)
idx := store.record(9)
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: idx, pos: 9}
restore := cache.pendingRestore[1]
if !cache.restoreComplete(restore) {
t.Fatalf("expected restoreComplete to return true for complete checkpoint")
}
}
func TestCacheRecurrentStateShapeValidation(t *testing.T) {
cache := newTestCache()
_, err := cache.RecurrentState(nil, 0, 3)
if !errors.Is(err, ErrInvalidRecurrentShape) {
t.Fatalf("expected ErrInvalidRecurrentShape, got %v", err)
}
}
func TestSlotCheckpointStoreShiftRange(t *testing.T) {
store := newSlotCheckpointStore(5)
store.record(1)
store.record(4)
store.record(7)
store.record(10)
store.shiftRange(2, 6)
var positions []int32
for i := range store.entries {
if store.entries[i].pos >= 0 {
positions = append(positions, store.entries[i].pos)
}
}
slices.Sort(positions)
want := []int32{1, 3, 6}
if !slices.Equal(positions, want) {
t.Fatalf("unexpected shifted positions: got=%v want=%v", positions, want)
}
if store.lastPos != 6 {
t.Fatalf("expected lastPos 6, got %d", store.lastPos)
}
}
func TestCacheRemoveMiddleShiftsCheckpoints(t *testing.T) {
cache := newTestCache()
cache.slotForSeq[1] = 0
cache.refCount = []int{1}
cache.pendingRestore[1] = checkpointRestore{slot: 0, idx: 0, pos: 1}
store := cache.checkpointStore(0)
store.record(1)
store.record(4)
store.record(7)
store.record(10)
if err := cache.Remove(1, 2, 6); err != nil {
t.Fatalf("expected middle remove to succeed, got %v", err)
}
if _, ok := cache.pendingRestore[1]; ok {
t.Fatalf("expected pending restore to be cleared after middle remove")
}
var positions []int32
for i := range store.entries {
if store.entries[i].pos >= 0 {
positions = append(positions, store.entries[i].pos)
}
}
slices.Sort(positions)
want := []int32{1, 3, 6}
if !slices.Equal(positions, want) {
t.Fatalf("unexpected checkpoint positions after remove: got=%v want=%v", positions, want)
}
}
func TestSlotCheckpointStoreRingBufferWrapAround(t *testing.T) {
store := newSlotCheckpointStore(3)
store.record(10)
store.record(20)
store.record(30)
store.entries[0].conv = make(map[int]ml.Tensor)
store.entries[0].conv[0] = nil
store.entries[0].recurrent = make(map[int]ml.Tensor)
store.entries[0].recurrent[0] = nil
store.record(40)
if store.entries[0].conv == nil {
t.Fatalf("expected conv map to be preserved on reuse")
}
if store.entries[0].recurrent == nil {
t.Fatalf("expected recurrent map to be preserved on reuse")
}
if store.entries[0].pos != 40 {
t.Fatalf("expected entry 0 pos to be 40, got %d", store.entries[0].pos)
}
}
func TestSlotCheckpointStoreFullCapacity(t *testing.T) {
store := newSlotCheckpointStore(2)
idx1 := store.record(10)
idx2 := store.record(20)
if idx1 != 0 || idx2 != 1 {
t.Fatalf("expected indices 0, 1, got %d, %d", idx1, idx2)
}
if store.size != 2 {
t.Fatalf("expected size 2, got %d", store.size)
}
_, pos1, ok1 := store.bestIndex(15)
_, pos2, ok2 := store.bestIndex(25)
if !ok1 || pos1 != 10 {
t.Fatalf("expected best pos 10 for target 15, got pos=%d ok=%v", pos1, ok1)
}
if !ok2 || pos2 != 20 {
t.Fatalf("expected best pos 20 for target 25, got pos=%d ok=%v", pos2, ok2)
}
}
func TestSlotCheckpointStoreEmptyBuffer(t *testing.T) {
store := newSlotCheckpointStore(0)
idx := store.record(10)
if idx != -1 {
t.Fatalf("expected record to return -1 for empty buffer, got %d", idx)
}
_, _, ok := store.bestIndex(15)
if ok {
t.Fatalf("expected no checkpoint for empty buffer")
}
}
func TestSlotCheckpointStorePruneAfterAll(t *testing.T) {
store := newSlotCheckpointStore(3)
store.record(10)
store.record(20)
store.record(30)
store.pruneAfter(5)
if store.size != 0 {
t.Fatalf("expected size 0 after pruning all, got %d", store.size)
}
if store.lastPos != -1 {
t.Fatalf("expected lastPos -1 after pruning all, got %d", store.lastPos)
}
_, _, ok := store.bestIndex(100)
if ok {
t.Fatalf("expected no checkpoint after pruning all")
}
}

110
kvcache/wrapper.go Normal file
View File

@@ -0,0 +1,110 @@
package kvcache
import (
"math"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
// Wrapper cache is a container for multiple types of caches,
// such as for the encoding and decoding portions of a model.
type WrapperCache struct {
// caches we are wrapping
caches []Cache
// cache to be used for this layer
curType int
}
func NewWrapperCache(caches ...Cache) *WrapperCache {
return &WrapperCache{
caches: caches,
}
}
func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
for _, cache := range c.caches {
cache.Init(backend, dtype, maxSequences, capacity, maxBatch)
}
}
func (c *WrapperCache) SetConfig(config ml.CacheConfig) {
for _, cache := range c.caches {
cache.SetConfig(config)
}
}
func (c *WrapperCache) Close() {
for _, cache := range c.caches {
cache.Close()
}
}
func (c *WrapperCache) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
for i, cache := range c.caches {
err := cache.StartForward(ctx, batch, reserve)
if err != nil {
// unwind on error - Remove with endIndex set to math.MaxInt32 does not fail
for j := i - 1; j >= 0; j-- {
for k := range batch.Positions {
_ = c.caches[j].Remove(batch.Sequences[k], batch.Positions[k], math.MaxInt32)
}
}
return err
}
}
c.curType = 0
return nil
}
func (c *WrapperCache) SetLayer(layer int) {
for _, cache := range c.caches {
cache.SetLayer(layer)
}
}
func (c *WrapperCache) SetLayerType(layerType int) {
c.curType = layerType
}
func (c *WrapperCache) UnderlyingCache() Cache {
return c.caches[c.curType]
}
func (c *WrapperCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.caches[c.curType].Get(ctx)
}
func (c *WrapperCache) Put(ctx ml.Context, key, value ml.Tensor) {
c.caches[c.curType].Put(ctx, key, value)
}
func (c *WrapperCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
for _, cache := range c.caches {
cache.CopyPrefix(srcSeq, dstSeq, len)
}
}
func (c *WrapperCache) CanResume(seq int, pos int32) bool {
for _, cache := range c.caches {
if !cache.CanResume(seq, pos) {
return false
}
}
return true
}
func (c *WrapperCache) Remove(seq int, beginIndex, endIndex int32) error {
// If the one of these fails, the caller is supposed to retry with endIndex set to math.MaxInt32, which should not fail
for _, cache := range c.caches {
err := cache.Remove(seq, beginIndex, endIndex)
if err != nil {
return err
}
}
return nil
}