ollama source for Momentry Core verification

2026-05-22 17:19:10 +08:00
commit 0b31ff9135
2020 changed files with 1413145 additions and 0 deletions
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -0,0 +1,419 @@
+package ml
+
+import (
+	"bytes"
+	"context"
+	"encoding/binary"
+	"fmt"
+	"math"
+	"slices"
+	"strconv"
+	"strings"
+
+	"github.com/ollama/ollama/fs"
+)
+
+type Backend interface {
+	// Close frees all memory associated with this backend
+	Close()
+
+	Load(ctx context.Context, progress func(float32)) error
+
+	// BackendMemory returns the memory allocations that were made for this model
+	BackendMemory() BackendMemory
+
+	Config() fs.Config
+	Get(name string) Tensor
+	NewContext() Context
+	NewContextSize(size int) Context
+
+	// Enumerate the devices available for inference via this backend
+	BackendDevices() []DeviceInfo
+}
+
+// BackendCacheConfig should be implemented by backends that need special output
+// from the cache to meet specific requirements. It is frequently implemented in
+// conjunction with ScaledDotProductAttention.
+type BackendCacheConfig interface {
+	CacheConfig() CacheConfig
+}
+
+// CacheConfig controls optimizations (mostly backend-specific) that may transform
+// the output of the cache to work better with specific kernels.
+type CacheConfig struct {
+	// CachePadding specifies the multiple for the number of tokens of cache history
+	// that will be returned from cache Get for k, v and mask. The capacity of the
+	// cache itself will also be increased to a multiple of this size if needed.
+	CachePadding int
+
+	// PermutedV performs Permute(ctx, 1, 2, 0, 3) on v tensors stored via Put
+	// and returns the permuted version via Get. This uses the cache copy operation
+	// to avoid a Contiguous call on the permuted tensor.
+	PermutedV bool
+
+	// MaskDType specifies the data type for generating the mask. If unset it will
+	// default to DTypeF32.
+	MaskDType DType
+}
+
+// BackendParams controls how the backend loads and executes models
+type BackendParams struct {
+	// AllocMemory causes the backend to allocate memory for the model. If
+	// false, this is only being used for discovering the required amount of
+	// memory and cannot load the model for running.
+	AllocMemory bool
+
+	// NumThreads sets the number of threads to use if running on the CPU
+	NumThreads int
+
+	// GPULayers is the set of layers to offload to GPUs
+	GPULayers GPULayersList
+
+	// FlashAttention indicates that we should use a fused flash attention kernel
+	FlashAttention FlashAttentionType
+}
+
+var backends = make(map[string]func(string, BackendParams) (Backend, error))
+
+func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
+	if _, ok := backends[name]; ok {
+		panic("backend: backend already registered")
+	}
+
+	backends[name] = f
+}
+
+func NewBackend(modelPath string, params BackendParams) (Backend, error) {
+	if backend, ok := backends["ggml"]; ok {
+		return backend(modelPath, params)
+	}
+
+	return nil, fmt.Errorf("unsupported backend")
+}
+
+type Context interface {
+	Empty(dtype DType, shape ...int) Tensor
+	Zeros(dtype DType, shape ...int) Tensor
+	FromBytes(dtype DType, s []byte, shape ...int) Tensor
+	FromFloats(s []float32, shape ...int) Tensor
+	FromInts(s []int32, shape ...int) Tensor
+
+	// Arange creates a 1D tensor with values within the half-open interval [start, stop) increased by step.
+	Arange(start, stop, step float32, dtype DType) Tensor
+
+	Forward(...Tensor) Context
+
+	// SetBatchSize provides a hint on the batch size to optimize processing
+	// Uses heuristics if not set
+	SetBatchSize(int)
+
+	Compute(...Tensor)
+	ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun
+
+	// Reserve is analogous to Compute but rather than executing a
+	// graph, simply preallocates memory. Typically called with a
+	// worst case graph to ensure all resources are available for
+	// future inference.
+	Reserve()
+
+	MaxGraphNodes() int
+	Close()
+
+	// Input returns a context appropriate for creating tensors that are
+	// inputs to the model (which includes things like output locations)
+	Input() Context
+
+	// Layer returns a context appropriate for creating intermediate tensors
+	Layer(int) Context
+}
+
+type Tensor interface {
+	Dim(n int) int
+	Stride(n int) int
+
+	Shape() []int
+	DType() DType
+	Cast(ctx Context, dtype DType) Tensor
+
+	Bytes() []byte
+	Floats() []float32
+	BackendGet() []float32
+
+	FromBytes([]byte)
+	FromFloats([]float32)
+	FromInts([]int32)
+
+	Add(ctx Context, t2 Tensor) Tensor
+	Sub(ctx Context, t2 Tensor) Tensor
+	Mul(ctx Context, t2 Tensor) Tensor
+	Div(ctx Context, t2 Tensor) Tensor
+
+	Mulmat(ctx Context, t2 Tensor) Tensor
+	MulmatFullPrec(ctx Context, t2 Tensor) Tensor
+	MulmatID(ctx Context, t2, ids Tensor) Tensor
+	AddID(ctx Context, t2, ids Tensor) Tensor
+
+	Softmax(ctx Context) Tensor
+	L2Norm(ctx Context, eps float32) Tensor
+	LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
+	RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
+	Scale(ctx Context, s float64) Tensor
+	SumRows(ctx Context) Tensor
+
+	AvgPool2D(ctx Context, k, s int, p float32) Tensor
+	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
+	Conv3D(ctx Context, weight Tensor, c, s0, s1, s2, p0, p1, p2, d0, d1, d2 int) Tensor
+	Conv1DDW(ctx Context, weight Tensor, s, p, d int) Tensor
+	SSMConv(ctx Context, kernel Tensor) Tensor
+	SSMScan(ctx Context, x, dt, A, B, C, ids Tensor) Tensor
+
+	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
+
+	Sin(ctx Context) Tensor
+	Cos(ctx Context) Tensor
+	Tanh(ctx Context) Tensor
+	GELU(ctx Context, up ...Tensor) Tensor
+	GELU_ERF(ctx Context) Tensor
+	QuickGELU(ctx Context, up ...Tensor) Tensor
+	SILU(ctx Context, up ...Tensor) Tensor
+	RELU(ctx Context, up ...Tensor) Tensor
+	Sigmoid(ctx Context) Tensor
+	SigmoidOut(ctx Context) Tensor
+
+	// SILUAlphaLimit is a variant of SILU that clamps the input to the range [-limit, limit]
+	SILUAlphaLimit(ctx Context, up Tensor, alpha, limit float32) Tensor
+
+	Reshape(ctx Context, shape ...int) Tensor
+	View(ctx Context, offset int, shape ...int) Tensor
+	Permute(ctx Context, shape ...int) Tensor
+	Contiguous(ctx Context, shape ...int) Tensor
+
+	Pad(ctx Context, shape ...int) Tensor
+	// PadExt pads with independent left/right amounts per dimension.
+	// Arguments: lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3 for dims 0-3.
+	PadExt(ctx Context, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3 int) Tensor
+
+	Stack(ctx Context, dim int, s ...Tensor) Tensor
+
+	// Repeat repeats the tensor n times along dimension dim
+	Repeat(ctx Context, dim, n int) Tensor
+	Concat(ctx Context, t2 Tensor, dim int) Tensor
+	Rows(ctx Context, t2 Tensor) Tensor
+	SetRows(ctx Context, src Tensor, idxs Tensor) Tensor
+	SetInplace(ctx Context, src Tensor, nb1, nb2, nb3, offset int) Tensor
+	Copy(ctx Context, t2 Tensor) Tensor
+	Duplicate(ctx Context) Tensor
+
+	Slice(ctx Context, dim, low, high, step int) Tensor
+	Chunk(ctx Context, dim int, size int) []Tensor
+	ChunkSections(ctx Context, dim int, sections ...int) []Tensor
+
+	TopK(ctx Context, k int) Tensor
+	Argsort(ctx Context) Tensor
+	Mean(ctx Context) Tensor
+	Variance(ctx Context) Tensor
+	Stddev(ctx Context) Tensor
+	Sqr(ctx Context) Tensor
+	Sqrt(ctx Context) Tensor
+	Exp(ctx Context) Tensor
+	Neg(ctx Context) Tensor
+
+	// Clamp clamps values to [min, max] range
+	Clamp(ctx Context, min, max float32) Tensor
+
+	// Softplus computes ln(1 + exp(x))
+	Softplus(ctx Context) Tensor
+
+	// CumSum computes cumulative sum along dimension 0
+	CumSum(ctx Context) Tensor
+
+	// Diag creates a diagonal matrix from a 1D tensor
+	Diag(ctx Context) Tensor
+
+	// Tri converts a matrix to triangular form (0=upper+diag, 1=upper, 2=lower+diag, 3=lower)
+	Tri(ctx Context, triType int) Tensor
+
+	// Fill fills a tensor with a constant value (in-place)
+	Fill(ctx Context, value float32) Tensor
+
+	// Repeat4D repeats tensor to match target shape
+	Repeat4D(ctx Context, dim0, dim1, dim2, dim3 int) Tensor
+
+	// SolveTri solves a triangular system Ax = B
+	SolveTri(ctx Context, b Tensor, lower, left, unitDiag bool) Tensor
+
+	Interpolate(ctx Context, dims [4]int, samplingMode SamplingMode) Tensor
+}
+
+// ScaledDotProductAttention implements a fused attention
+// operation equivalent to the following code on a tensor named
+// query:
+//
+// query = query.Permute(ctx, 0, 2, 1, 3)
+// key = key.Permute(ctx, 0, 2, 1, 3)
+// value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+//
+// kq := key.MulmatFullPrec(ctx, query)
+//
+// kq = kq.Scale(ctx, scale)
+//
+//	if mask != nil {
+//		kq = kq.Add(ctx, mask)
+//	}
+//
+// kq = kq.Softmax(ctx)
+//
+// kqv := value.Mulmat(ctx, kq)
+// return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+//
+// cacheConfigApplied indicates whether the optimizations requested through CacheConfig have been performed
+type ScaledDotProductAttention interface {
+	ScaledDotProductAttention(ctx Context, key, value, mask, sinks Tensor, vmla Tensor, scale float64, cacheConfigApplied bool) Tensor
+}
+
+type number interface {
+	~int | ~int8 | ~int16 | ~int32 | ~int64 |
+		~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 |
+		~float32 | ~float64 |
+		~complex64 | ~complex128
+}
+
+func mul[T number](s ...T) T {
+	p := T(1)
+	for _, v := range s {
+		p *= v
+	}
+
+	return p
+}
+
+type DumpOptions func(*dumpOptions)
+
+// DumpWithPrecision sets the number of decimal places to print. Applies to float32 and float64.
+func DumpWithPrecision(n int) DumpOptions {
+	return func(opts *dumpOptions) {
+		opts.Precision = n
+	}
+}
+
+// DumpWithThreshold sets the threshold for printing the entire tensor. If the number of elements
+// is less than or equal to this value, the entire tensor will be printed. Otherwise, only the
+// beginning and end of each dimension will be printed.
+func DumpWithThreshold(n int) DumpOptions {
+	return func(opts *dumpOptions) {
+		opts.Threshold = n
+	}
+}
+
+// DumpWithEdgeItems sets the number of elements to print at the beginning and end of each dimension.
+func DumpWithEdgeItems(n int) DumpOptions {
+	return func(opts *dumpOptions) {
+		opts.EdgeItems = n
+	}
+}
+
+type dumpOptions struct {
+	Precision, Threshold, EdgeItems int
+}
+
+func Dump(ctx Context, t Tensor, optsFuncs ...DumpOptions) string {
+	opts := dumpOptions{Precision: 4, Threshold: 1000, EdgeItems: 3}
+	for _, optsFunc := range optsFuncs {
+		optsFunc(&opts)
+	}
+
+	if mul(t.Shape()...) <= opts.Threshold {
+		opts.EdgeItems = math.MaxInt
+	}
+
+	switch t.DType() {
+	case DTypeF32:
+		return dump[[]float32](ctx, t, opts.EdgeItems, func(f float32) string {
+			return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
+		})
+	case DTypeF16, DTypeQ80, DTypeQ40:
+		f32 := ctx.Input().Empty(DTypeF32, t.Shape()...)
+		f32 = t.Copy(ctx, f32)
+		return dump[[]float32](ctx, f32, opts.EdgeItems, func(f float32) string {
+			return strconv.FormatFloat(float64(f), 'f', opts.Precision, 32)
+		})
+	case DTypeI32:
+		return dump[[]int32](ctx, t, opts.EdgeItems, func(i int32) string {
+			return strconv.FormatInt(int64(i), 10)
+		})
+	default:
+		return "<unsupported>"
+	}
+}
+
+func dump[S ~[]E, E number](ctx Context, t Tensor, items int, fn func(E) string) string {
+	if t.Bytes() == nil {
+		ctx.Forward(t).Compute(t)
+	}
+
+	s := make(S, mul(t.Shape()...))
+	if err := binary.Read(bytes.NewBuffer(t.Bytes()), binary.LittleEndian, &s); err != nil {
+		panic(err)
+	}
+
+	shape := t.Shape()
+	slices.Reverse(shape)
+
+	var sb strings.Builder
+	var f func([]int, int)
+	f = func(dims []int, stride int) {
+		prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
+		sb.WriteString("[")
+		defer func() { sb.WriteString("]") }()
+		for i := 0; i < dims[0]; i++ {
+			if i >= items && i < dims[0]-items {
+				sb.WriteString("..., ")
+				// skip to next printable element
+				skip := dims[0] - 2*items
+				if len(dims) > 1 {
+					stride += mul(append(dims[1:], skip)...)
+					fmt.Fprint(&sb, strings.Repeat("\n", len(dims)-1), prefix)
+				}
+				i += skip - 1
+			} else if len(dims) > 1 {
+				f(dims[1:], stride)
+				stride += mul(dims[1:]...)
+				if i < dims[0]-1 {
+					fmt.Fprint(&sb, ",", strings.Repeat("\n", len(dims)-1), prefix)
+				}
+			} else {
+				text := fn(s[stride+i])
+				if len(text) > 0 && text[0] != '-' {
+					sb.WriteString(" ")
+				}
+
+				sb.WriteString(text)
+				if i < dims[0]-1 {
+					sb.WriteString(", ")
+				}
+			}
+		}
+	}
+	f(shape, 0)
+
+	return sb.String()
+}
+
+type DType int
+
+const (
+	DTypeOther DType = iota
+	DTypeF32
+	DTypeF16
+	DTypeQ80
+	DTypeQ40
+	DTypeI32
+	DTypeMXFP4
+)
+
+type SamplingMode int
+
+const (
+	SamplingModeNearest SamplingMode = iota
+	SamplingModeBilinear
+)
--- a/ml/backend/backend.go
+++ b/ml/backend/backend.go
@@ -0,0 +1,5 @@
+package backend
+
+import (
+	_ "github.com/ollama/ollama/ml/backend/ggml"
+)
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
--- a/ml/backend/ggml/ggml/.rsync-filter
+++ b/ml/backend/ggml/ggml/.rsync-filter
@@ -0,0 +1,33 @@
+protect .rsync-filter
+protect *.go
+protect *-embed.*
+protect ollama-*
+hide /CMakeLists.txt
+hide /include/ggml-webgpu.h
+include /cmake/
+include /cmake/common.cmake
+include /include/
+include /src/
+include /src/ggml-blas/
+include /src/ggml-cpu/
+include /src/ggml-cpu/amx/
+include /src/ggml-cpu/arch/
+include /src/ggml-cpu/arch/arm/
+include /src/ggml-cpu/arch/x86/
+include /src/ggml-cpu/llamafile/
+include /src/ggml-cuda/
+include /src/ggml-cuda/vendors/
+include /src/ggml-cuda/template-instances/
+include /src/ggml-hip/
+include /src/ggml-metal/
+include src/ggml-vulkan/
+include src/ggml-vulkan/vulkan-shaders
+include CMakeLists.txt
+include *.[chm]
+include *.cpp
+include *.cu
+include *.cuh
+include *.metal
+include *.comp
+include *.glsl
+hide *
--- a/ml/backend/ggml/ggml/LICENSE
+++ b/ml/backend/ggml/ggml/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023-2024 The ggml authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/ml/backend/ggml/ggml/cmake/common.cmake
+++ b/ml/backend/ggml/ggml/cmake/common.cmake
@@ -0,0 +1,50 @@
+function(ggml_get_flags CCID CCVER)
+    set(C_FLAGS "")
+    set(CXX_FLAGS "")
+
+    if (CCID MATCHES "Clang")
+        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+        if (
+            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+        )
+            list(APPEND C_FLAGS -Wdouble-promotion)
+        endif()
+    elseif (CCID STREQUAL "GNU")
+        set(C_FLAGS   -Wdouble-promotion)
+        set(CXX_FLAGS -Wno-array-bounds)
+
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+            list(APPEND CXX_FLAGS -Wextra-semi)
+        endif()
+    endif()
+
+    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()
+
+function(ggml_get_system_arch)
+    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
+        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+        set(GGML_SYSTEM_ARCH "ARM" PARENT_SCOPE)
+    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR
+            CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
+        set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
+        set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+        set(GGML_SYSTEM_ARCH "loongarch64"  PARENT_SCOPE)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
+        set(GGML_SYSTEM_ARCH "riscv64" PARENT_SCOPE)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+        set(GGML_SYSTEM_ARCH "s390x" PARENT_SCOPE)
+    else()
+        set(GGML_SYSTEM_ARCH "UNKNOWN" PARENT_SCOPE)
+    endif()
+endfunction()
--- a/ml/backend/ggml/ggml/include/ggml-alloc.h
+++ b/ml/backend/ggml/ggml/include/ggml-alloc.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct             ggml_backend * ggml_backend_t;
+
+// Tensor allocator
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
+
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API enum ggml_status    ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
+
+// Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+
+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten
+
+typedef struct ggml_gallocr * ggml_gallocr_t;
+
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
+
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
+
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t                       ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -0,0 +1,386 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend_event * ggml_backend_event_t;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+    typedef struct ggml_backend_reg * ggml_backend_reg_t;
+    typedef struct ggml_backend_device * ggml_backend_dev_t;
+
+
+    //
+    // Backend buffer type
+    //
+
+    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
+    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
+
+    //
+    // Backend buffer
+    //
+
+    enum ggml_backend_buffer_usage {
+        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
+        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+        GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
+    };
+
+    GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API enum ggml_status               ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
+    GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
+    GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    //
+    // Backend (stream)
+    //
+
+    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
+    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+    GGML_API void         ggml_backend_free(ggml_backend_t backend);
+
+    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    // "offset" refers to the offset in tensor->data for setting/getting data
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
+
+    // NOTE: will be removed, use device version instead
+    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // asynchronous copy
+    // the copy is performed after all the currently queued operations in backend_src
+    // backend_dst will wait for the copy to complete before performing other operations
+    // automatic fallback to sync copy if async is not supported
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
+
+    //
+    // Events
+    //
+
+    GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+    GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
+    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
+
+    //
+    // Backend device
+    //
+
+    enum ggml_backend_dev_type {
+        // CPU device using system memory
+        GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
+        GGML_BACKEND_DEVICE_TYPE_GPU,
+        // integrated GPU device using host memory
+        GGML_BACKEND_DEVICE_TYPE_IGPU,
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
+    };
+
+    // functionality supported by the device
+    struct ggml_backend_dev_caps {
+        // asynchronous operations
+        bool async;
+        // pinned host buffer
+        bool host_buffer;
+        // creating buffers from host ptr
+        bool buffer_from_host_ptr;
+        // event synchronization
+        bool events;
+    };
+
+    // all the device properties
+    struct ggml_backend_dev_props {
+        // device name
+        const char * name;
+        // device description
+        const char * description;
+        // device free memory in bytes
+        size_t memory_free;
+        const char * id;
+        // device total memory in bytes
+        size_t memory_total;
+        // device type
+        enum ggml_backend_dev_type type;
+        // device id
+        //   for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        //   if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
+        struct ggml_backend_dev_caps caps;
+        int driver_major;
+        int driver_minor;
+        int compute_major;
+        int compute_minor;
+        int integrated;
+        const char *library;
+    };
+
+    GGML_API const char *                  ggml_backend_dev_name(ggml_backend_dev_t device);
+    GGML_API const char *                  ggml_backend_dev_description(ggml_backend_dev_t device);
+    GGML_API void                          ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
+    GGML_API enum ggml_backend_dev_type    ggml_backend_dev_type(ggml_backend_dev_t device);
+    GGML_API void                          ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+    GGML_API ggml_backend_reg_t            ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+    GGML_API ggml_backend_t                ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API void                          ggml_backend_dev_reset(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_t         ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+
+    GGML_API bool                          ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+    GGML_API bool                          ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+    GGML_API bool                          ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+
+    //
+    // Backend (reg)
+    //
+
+    GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
+    GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+    GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+    GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+
+    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
+
+    // Split buffer type for tensor parallelism
+    typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    // Set the number of threads for the backend
+    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+    // Get additional buffer types provided by the device (returns a NULL-terminated array)
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+    // Set the abort callback for the backend
+    typedef void                         (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+    struct ggml_backend_feature {
+        const char * name; // name of the feature flag
+        const char * value; // NOTE(review): presumably the flag's value rendered as a string - confirm
+    };
+    typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
+
+    //
+    // Backend registry
+    //
+
+    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
+
+    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
+
+    // Backend (reg) enumeration
+    GGML_API size_t             ggml_backend_reg_count(void);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
+
+    // Device enumeration
+    GGML_API size_t             ggml_backend_dev_count(void);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
+
+    // Direct backend (stream) initialization
+    // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
+    GGML_API ggml_backend_t ggml_backend_init_best(void);
+
+    // Load a backend from a dynamic library and register it
+    GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+    // Unload a backend if loaded dynamically and unregister it
+    GGML_API void               ggml_backend_unload(ggml_backend_reg_t reg);
+    // Load all known backends from dynamic libraries
+    GGML_API void               ggml_backend_load_all(void);
+    GGML_API void               ggml_backend_load_all_from_path(const char * dir_path);
+
+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows for multiple backend devices to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
+        // preferably to run on the same backend as the buffer
+        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
+
+        // initialize buffers from a max size graph (optional)
+        reserve_graph = build_graph(sched, max_batch_size);
+
+        // manually assign nodes to a backend (optional, should not be needed in most cases)
+        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
+
+        ggml_backend_sched_reserve(sched, reserve_graph);
+
+        // compute
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+        for (int i = 0; i < 10; ++i) {
+            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+        }
+
+        // if there are graph inputs:
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
+        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+
+        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+        // allocate them statically via ggml_backend_alloc_ctx_tensors
+
+    */
+
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
+    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Provide a hint on the batch size to optimize processing (uses heuristics if unset)
+    GGML_API void                 ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API void                 ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
+    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
+
+    GGML_API int                  ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
+    // Get the number of splits of the last graph
+    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
+
+    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+
+    // Split graph without allocating it
+    GGML_API void                 ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
+    // Allocate and compute graph on the backend scheduler
+    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
+
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
+    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
+
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
+    //
+    // Utils
+    //
+
+    struct ggml_backend_graph_copy { // returned by ggml_backend_graph_copy(); hand it to ggml_backend_graph_copy_free() when done
+        ggml_backend_buffer_t buffer;
+        struct ggml_context * ctx_allocated;
+        struct ggml_context * ctx_unallocated;
+        struct ggml_cgraph * graph; // NOTE(review): presumably the copied graph - confirm
+    };
+
+    // Copy a graph to a different backend
+    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+    // Compare the output of two backends
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
+
+    // Tensor initialization
+    GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+    GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor);
+
+    // CPU buffer types are always available
+    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-blas.h
+++ b/ml/backend/ggml/ggml/include/ggml-blas.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
+
+// number of threads used for conversion to float
+// for openblas and blis, this will also set the number of threads used for blas operations
+GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
+
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-cann.h
+++ b/ml/backend/ggml/ggml/include/ggml-cann.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Maximum number of CANN devices supported.
+ */
+#define GGML_CANN_MAX_DEVICES 16
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
+
+/**
+ * @brief Initializes the CANN backend for a specified device.
+ *
+ * This function initializes the CANN backend for the given device.
+ * It verifies the device index, allocates a context, and creates a backend
+ * instance.
+ *
+ * @param device The index of the device to initialize.
+ * @return A pointer to the initialized backend instance, or nullptr on failure.
+ */
+GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
+
+/**
+ * @brief Checks if a given backend is a CANN backend.
+ *
+ * This function verifies if the provided backend is a CANN backend by comparing
+ * its GUID with the CANN backend's GUID.
+ *
+ * @param backend The backend instance to check.
+ * @return True if the backend is a CANN backend, false otherwise.
+ */
+GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
+
+/**
+ * @brief Retrieves the CANN buffer type for a specified device.
+ *
+ * This function initializes and returns the buffer type interface associated
+ * with the given device. It ensures thread-safe access using a mutex.
+ *
+ * @param device The device index for which to retrieve the buffer type.
+ * @return A pointer to the buffer type interface for the specified device, or
+ * nullptr if the device index is out of range.
+ */
+GGML_BACKEND_API ggml_backend_buffer_type_t
+ggml_backend_cann_buffer_type(int32_t device);
+
+/**
+ * @brief Retrieves the number of CANN devices available.
+ *
+ * This function returns the number of CANN devices available based on
+ * information obtained from `ggml_cann_info()`.
+ *
+ * @return The number of CANN devices available.
+ */
+GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
+
+/**
+ * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
+ *
+ * @return A pointer to the host buffer type interface.
+ */
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+
+/**
+ * @brief Retrieves the description of a specific CANN device.
+ *
+ * This function sets the specified device, retrieves the SoC name,
+ * and writes it into the provided description buffer.
+ *
+ * @param device The device index to retrieve the description for.
+ * @param description Pointer to a buffer where the description will be written.
+ * @param description_size Size of the description buffer.
+ */
+GGML_BACKEND_API void ggml_backend_cann_get_device_description(
+    int32_t device, char* description, size_t description_size);
+
+/**
+ * @brief Retrieves the memory information of a specific CANN device.
+ *
+ * This function sets the specified device, retrieves the free and total
+ * memory information of the specified type (ACL_HBM_MEM), and stores them
+ * in the provided pointers.
+ *
+ * @param device The device index to retrieve memory information for.
+ * @param free Pointer to a variable where the free memory size will be stored.
+ * @param total Pointer to a variable where the total memory size will be
+ * stored.
+ */
+GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
+                                                  size_t* free,
+                                                  size_t* total);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-cpp.h
+++ b/ml/backend/ggml/ggml/include/ggml-cpp.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "gguf.h"
+#include <memory>
+
+// Smart pointers for ggml types
+
+// ggml: owning handles for ggml/gguf contexts (released with ggml_free / gguf_free)
+
+struct ggml_context_deleter { void operator()(ggml_context * p) { ggml_free(p); } };
+struct gguf_context_deleter { void operator()(gguf_context * p) { gguf_free(p); } };
+
+using ggml_context_ptr = std::unique_ptr<ggml_context, ggml_context_deleter>;
+using gguf_context_ptr = std::unique_ptr<gguf_context, gguf_context_deleter>;
+
+// ggml-alloc: owning handle for a graph allocator (released with ggml_gallocr_free)
+
+struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t p) { ggml_gallocr_free(p); } };
+
+using ggml_gallocr_ptr = std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter>;
+
+// ggml-backend: owning handles for backends, buffers, events and schedulers
+
+struct ggml_backend_deleter        { void operator()(ggml_backend_t p)        { ggml_backend_free(p); } };
+struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t p) { ggml_backend_buffer_free(p); } };
+struct ggml_backend_event_deleter  { void operator()(ggml_backend_event_t p)  { ggml_backend_event_free(p); } };
+struct ggml_backend_sched_deleter  { void operator()(ggml_backend_sched_t p)  { ggml_backend_sched_free(p); } };
+
+using ggml_backend_ptr        = std::unique_ptr<ggml_backend,        ggml_backend_deleter>;
+using ggml_backend_buffer_ptr = std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter>;
+using ggml_backend_event_ptr  = std::unique_ptr<ggml_backend_event,  ggml_backend_event_deleter>;
+using ggml_backend_sched_ptr  = std::unique_ptr<ggml_backend_sched,  ggml_backend_sched_deleter>;
--- a/ml/backend/ggml/ggml/include/ggml-cpu.h
+++ b/ml/backend/ggml/ggml/include/ggml-cpu.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggml-org/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by the caller before calling `ggml_graph_compute()`
+
+        int n_threads; // NOTE(review): presumably the n_threads given to `ggml_graph_plan()` - confirm
+        struct ggml_threadpool * threadpool; // NOTE(review): presumably the threadpool given to `ggml_graph_plan()` - confirm
+
+        // abort ggml_graph_compute when true
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data; // NOTE(review): presumably handed back to abort_callback - confirm
+    };
+
+    // numa strategies
+    enum ggml_numa_strategy { // selects the strategy passed to ggml_numa_init()
+        GGML_NUMA_STRATEGY_DISABLED   = 0,
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+        GGML_NUMA_STRATEGY_ISOLATE    = 2,
+        GGML_NUMA_STRATEGY_NUMACTL    = 3,
+        GGML_NUMA_STRATEGY_MIRROR     = 4,
+        GGML_NUMA_STRATEGY_COUNT // not a strategy: one past the last value (5), i.e. the number of strategies
+    };
+
+    GGML_BACKEND_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
+    GGML_BACKEND_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+
+    GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+
+    GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
+    GGML_BACKEND_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+
+    GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
+    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params  * params);
+    GGML_BACKEND_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
+
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
+                  const struct ggml_cgraph * cgraph,
+                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+                    struct ggml_threadpool * threadpool /* = NULL */ );
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+
+    //
+    // system info
+    //
+
+    // x86
+    GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
+    GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
+    GGML_BACKEND_API int ggml_cpu_has_bmi2       (void);
+    GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
+    GGML_BACKEND_API int ggml_cpu_has_fma        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
+    GGML_BACKEND_API int ggml_cpu_has_amx_int8   (void);
+    // ARM
+    GGML_BACKEND_API int ggml_cpu_has_neon       (void);
+    GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
+    GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
+    GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
+    GGML_BACKEND_API int ggml_cpu_has_sve        (void);
+    GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
+    GGML_BACKEND_API int ggml_cpu_has_sme        (void);
+    // other
+    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen   (void);  // risc-v vector length in bytes
+    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
+    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
+    GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
+
+    // Internal types and functions exposed for tests and benchmarks
+
+    typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                       const void * GGML_RESTRICT y, size_t by, int nrc);
+
+    struct ggml_type_traits_cpu { // per-type entry returned by ggml_get_type_traits_cpu()
+        ggml_from_float_t        from_float;
+        ggml_vec_dot_t           vec_dot; // see the ggml_vec_dot_t signature above
+        enum ggml_type           vec_dot_type; // NOTE(review): presumably the type the other vec_dot operand must be in - confirm
+        int64_t                  nrows; // number of rows to process simultaneously
+    };
+
+    GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
+
+    GGML_BACKEND_API void ggml_cpu_init(void);
+
+    //
+    // CPU backend
+    //
+
+    GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
+
+    GGML_BACKEND_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_BACKEND_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+    GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *,       float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *,     int32_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-cuda.h
+++ b/ml/backend/ggml/ggml/include/ggml-cuda.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#ifdef GGML_USE_HIP
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#elif defined(GGML_USE_MUSA)
+#define GGML_CUDA_NAME "MUSA"
+#define GGML_CUBLAS_NAME "muBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+#define GGML_CUDA_MAX_DEVICES       16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+
+GGML_BACKEND_API int  ggml_backend_cuda_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+
+GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-hexagon.h
+++ b/ml/backend/ggml/ggml/include/ggml-hexagon.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-metal.h
+++ b/ml/backend/ggml/ggml/include/ggml-metal.h
@@ -0,0 +1,61 @@
+// Note: this description is outdated
+//
+// An interface allowing to compute ggml_cgraph with Metal
+//
+// This is a fully functional interface that extends ggml with GPU support for Apple devices.
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
+//
+// How does it work?
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
+// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
+// used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
+//
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stddef.h>
+#include <stdbool.h>
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// backend API
+// user-code should use only these functions
+//
+
+// TODO: remove in the future
+GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
+GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+
+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
+
+// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
+GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-opencl.h
+++ b/ml/backend/ggml/ggml/include/ggml-opencl.h
@@ -0,0 +1,26 @@
+#ifndef GGML_OPENCL_H
+#define GGML_OPENCL_H
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+//
+// backend API
+//
+GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
+GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif // GGML_OPENCL_H
--- a/ml/backend/ggml/ggml/include/ggml-opt.h
+++ b/ml/backend/ggml/ggml/include/ggml-opt.h
@@ -0,0 +1,256 @@
+// This file contains functionality for training models using GGML.
+// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
+// At the bottom of this file especially there are relatively high-level functions that are suitable for use or adaptation in user code.
+//
+// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stdint.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    struct ggml_opt_dataset;
+    struct ggml_opt_context;
+    struct ggml_opt_result;
+
+    typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
+    typedef struct ggml_opt_context * ggml_opt_context_t;
+    typedef struct ggml_opt_result  * ggml_opt_result_t;
+
+    // ====== Loss ======
+
+    // built-in loss types, i.e. the built-in quantities minimized by the optimizer
+    // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
+    enum ggml_opt_loss_type {
+        GGML_OPT_LOSS_TYPE_MEAN,
+        GGML_OPT_LOSS_TYPE_SUM,
+        GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
+        GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
+    };
+
+    // ====== Dataset ======
+
+    GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
+            enum ggml_type type_data,    // the type for the internal data tensor
+            enum ggml_type type_label,   // the type for the internal labels tensor
+            int64_t        ne_datapoint, // number of elements per datapoint
+            int64_t        ne_label,     // number of elements per label
+            int64_t        ndata,        // total number of datapoints/labels
+            int64_t        ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
+    GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
+
+    // get underlying tensors that store the data
+    GGML_API int64_t              ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
+    GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
+    GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label,     ndata]
+
+    // shuffle the first idata datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
+    GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
+
+    // get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
+    GGML_API void ggml_opt_dataset_get_batch(
+            ggml_opt_dataset_t   dataset,
+            struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
+            struct ggml_tensor * labels_batch, // shape = [ne_label,     ndata_batch]
+            int64_t              ibatch);
+    GGML_API void ggml_opt_dataset_get_batch_host(
+            ggml_opt_dataset_t   dataset,
+            void               * data_batch,
+            size_t               nb_data_batch,
+            void               * labels_batch,
+            int64_t              ibatch);
+
+    // ====== Model / Context ======
+
+    enum ggml_opt_build_type {
+        GGML_OPT_BUILD_TYPE_FORWARD = 10,
+        GGML_OPT_BUILD_TYPE_GRAD    = 20,
+        GGML_OPT_BUILD_TYPE_OPT     = 30,
+    };
+
+    enum ggml_opt_optimizer_type {
+        GGML_OPT_OPTIMIZER_TYPE_ADAMW,
+        GGML_OPT_OPTIMIZER_TYPE_SGD,
+
+        GGML_OPT_OPTIMIZER_TYPE_COUNT
+    };
+
+    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
+    struct ggml_opt_optimizer_params {
+        struct {
+            float alpha; // learning rate
+            float beta1; // first AdamW momentum
+            float beta2; // second AdamW momentum
+            float eps;   // epsilon for numerical stability
+            float wd;    // weight decay - 0.0f to disable
+        } adamw;
+        struct {
+            float alpha; // learning rate
+            float wd;    // weight decay
+        } sgd;
+    };
+
+    // callback to calculate optimizer parameters prior to a backward pass
+    // userdata can be used to pass arbitrary data
+    typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
+
+    // returns the default optimizer params (constant, hard-coded values)
+    // userdata is not used
+    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
+
+    // casts userdata to ggml_opt_optimizer_params and returns it
+    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
+
+    // parameters for initializing a new optimization context
+    struct ggml_opt_params {
+        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
+
+        // by default the forward graph needs to be reconstructed for each eval
+        // if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
+        struct ggml_context * ctx_compute;
+        struct ggml_tensor  * inputs;
+        struct ggml_tensor  * outputs;
+
+        enum ggml_opt_loss_type  loss_type;
+        enum ggml_opt_build_type build_type;
+
+        int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
+
+        ggml_opt_get_optimizer_params get_opt_pars;    // callback for calculating optimizer parameters
+        void *                        get_opt_pars_ud; // userdata for calculating optimizer parameters
+
+        // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
+        enum ggml_opt_optimizer_type optimizer;
+    };
+
+    // get parameters for an optimization context with defaults set where possible
+    // parameters for which no sensible defaults exist are supplied as arguments to this function
+    GGML_API struct ggml_opt_params ggml_opt_default_params(
+            ggml_backend_sched_t    backend_sched,
+            enum ggml_opt_loss_type loss_type);
+
+    GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
+    GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
+
+    // set gradients to zero, initialize loss, and optionally reset the optimizer
+    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
+
+    GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated statically
+
+    // get underlying tensors that store data
+    // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
+    GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
+    GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
+    GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
+    GGML_API struct ggml_tensor * ggml_opt_loss(    ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
+    GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
+    GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
+
+    // get the gradient accumulator for a node from the forward graph
+    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
+
+    GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
+
+    GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
+
+    // ====== Optimization Result ======
+
+    GGML_API ggml_opt_result_t ggml_opt_result_init(void);
+    GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
+    GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
+
+    // get data from result, uncertainties are optional and can be ignored by passing NULL
+    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                  // writes 1 value, number of datapoints
+    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double  * loss,     double * unc); // writes 1 value
+    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                   // writes ndata values
+    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double  * accuracy, double * unc); // writes 1 value
+
+    // ====== Computation ======
+
+    // if not using static graphs, this function must be called prior to ggml_opt_alloc
+    GGML_API void ggml_opt_prepare_alloc(
+        ggml_opt_context_t    opt_ctx,
+        struct ggml_context * ctx_compute,
+        struct ggml_cgraph  * gf,
+        struct ggml_tensor  * inputs,
+        struct ggml_tensor  * outputs);
+
+    // allocate the next graph for evaluation, either forward or forward + backward
+    // must be called exactly once prior to calling ggml_opt_eval
+    GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
+
+    // do forward pass, increment result if not NULL, do backward pass if allocated
+    GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+
+    // ############################################################################
+    // ## The high-level functions start here. They do not depend on any private ##
+    // ## functions or structs and can be copied to and adapted for user code.   ##
+    // ############################################################################
+
+    // ====== Intended Usage ======
+    //
+    // 1. Select the appropriate loss for your problem.
+    // 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
+    //    Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
+    // 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
+    //    The first context should contain the model parameters and inputs and be allocated statically in user code.
+    //    The second context should contain all other tensors and will be (re)allocated automatically.
+    //    Due to this automated allocation the data of the second context is not defined when accessed in user code.
+    //    Note that the second dimension of the inputs/outputs is interpreted as the number of datapoints in those tensors.
+    // 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
+
+    // signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
+    typedef void (*ggml_opt_epoch_callback)(
+            bool               train,       // true after training evaluation, false after validation evaluation
+            ggml_opt_context_t opt_ctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t  result,      // result associated with the dataset subsection
+            int64_t            ibatch,      // number of batches that have been evaluated so far
+            int64_t            ibatch_max,  // total number of batches in this dataset subsection
+            int64_t            t_start_us); // time at which the evaluation on the dataset subsection was started
+
+    // do training on front of dataset, do evaluation only on back of dataset
+    GGML_API void ggml_opt_epoch(
+            ggml_opt_context_t      opt_ctx,
+            ggml_opt_dataset_t      dataset,
+            ggml_opt_result_t       result_train,   // result to increment during training, ignored if NULL
+            ggml_opt_result_t       result_eval,    // result to increment during evaluation, ignored if NULL
+            int64_t                 idata_split,    // data index at which to split training and evaluation
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
+    // callback that prints a progress bar on stderr
+    GGML_API void ggml_opt_epoch_callback_progress_bar(
+            bool               train,
+            ggml_opt_context_t opt_ctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t  result,
+            int64_t            ibatch,
+            int64_t            ibatch_max,
+            int64_t            t_start_us);
+
+    // fit model defined by inputs and outputs to dataset
+    GGML_API void ggml_opt_fit(
+            ggml_backend_sched_t            backend_sched,  // backend scheduler for constructing the compute graphs
+            struct ggml_context           * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
+            struct ggml_tensor            * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
+            struct ggml_tensor            * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
+            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
+            enum ggml_opt_loss_type         loss_type,      // loss to minimize
+            enum ggml_opt_optimizer_type    optimizer,      // sgd or adamw
+            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
+            int64_t                         nepoch,         // how many times the dataset should be iterated over
+            int64_t                         nbatch_logical, // number of datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
+            float                           val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
+            bool                            silent);        // whether or not info prints to stderr should be suppressed
+
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-rpc.h
+++ b/ml/backend/ggml/ggml/include/ggml-rpc.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define RPC_PROTO_MAJOR_VERSION    3
+#define RPC_PROTO_MINOR_VERSION    6
+#define RPC_PROTO_PATCH_VERSION    0
+#define GGML_RPC_MAX_SERVERS       16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device);
+GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device);
+
+GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
+
+GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
+                                                    size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-sycl.h
+++ b/ml/backend/ggml/ggml/include/ggml-sycl.h
@@ -0,0 +1,49 @@
+//
+//  MIT license
+//  Copyright (C) 2024 Intel Corporation
+//  SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#define GGML_SYCL_NAME "SYCL"
+#define GGML_SYCL_MAX_DEVICES 48
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+
+GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
+                                                       char *description,
+                                                       size_t description_size);
+GGML_BACKEND_API int  ggml_backend_sycl_get_device_count(void); // (void), not (): a real prototype when this header is included from C
+GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+
+// SYCL doesn't support registering host memory, keep here for reference
+// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-vulkan.h
+++ b/ml/backend/ggml/ggml/include/ggml-vulkan.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_VK_NAME "Vulkan"
+#define GGML_VK_MAX_DEVICES 16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+
+GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_BACKEND_API int  ggml_backend_vk_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-zdnn.h
+++ b/ml/backend/ggml/ggml/include/ggml-zdnn.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml-zendnn.h
+++ b/ml/backend/ggml/ggml/include/ggml-zendnn.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_zendnn_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_zendnn(ggml_backend_t backend);
+
+// number of threads used for zendnn operations
+GGML_BACKEND_API void ggml_backend_zendnn_set_n_threads(ggml_backend_t backend_zendnn, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zendnn_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ggml.h
+++ b/ml/backend/ggml/ggml/include/ggml.h
--- a/ml/backend/ggml/ggml/include/gguf.h
+++ b/ml/backend/ggml/ggml/include/gguf.h
@@ -0,0 +1,203 @@
+// This file contains functionality related to "GGUF" files, the binary file format used by ggml.
+// GGUF files have the following structure:
+//
+// 1. File magic "GGUF" (4 bytes).
+// 2. File version (uint32_t).
+// 3. Number of ggml tensors in file (int64_t).
+// 4. Number of key-value-pairs in file (int64_t).
+// 5. For each KV pair:
+//   1. The key (string).
+//   2. The value type (gguf_type).
+//   3a. If the value type is GGUF_TYPE_ARRAY:
+//     1. The type of the array (gguf_type).
+//     2. The number of elements in the array (uint64_t).
+//     3. The binary representation of each element in the array.
+//   3b. Otherwise:
+//     1. The binary representation of the value.
+// 6. For each ggml tensor:
+//   1. The tensor name (string).
+//   2. The number of dimensions of the tensor (uint32_t).
+//   3. For each dimension:
+//     1. The size of the tensor in the dimension (int64_t).
+//   4. The tensor data type (ggml_type).
+//   5. The tensor data offset in the tensor data binary blob (uint64_t).
+// 7. The tensor data binary blob (optional, aligned).
+//
+// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
+// All enums are stored as int32_t.
+// All bool values are stored as int8_t.
+// If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
+//   otherwise GGUF_DEFAULT_ALIGNMENT is used.
+//
+// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
+
+#pragma once
+
+#include "ggml.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#define GGUF_MAGIC   "GGUF"
+#define GGUF_VERSION 3
+
+#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    // types that can be stored as GGUF KV data
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_UINT64  = 10,
+        GGUF_TYPE_INT64   = 11,
+        GGUF_TYPE_FLOAT64 = 12,
+        GGUF_TYPE_COUNT,       // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API uint32_t gguf_get_version    (const struct gguf_context * ctx);
+    GGML_API size_t   gguf_get_alignment  (const struct gguf_context * ctx);
+    GGML_API size_t   gguf_get_data_offset(const struct gguf_context * ctx);
+
+    GGML_API int64_t      gguf_get_n_kv(const struct gguf_context * ctx);
+    GGML_API int64_t      gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
+    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id);
+
+    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id);
+
+    // will abort if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id);
+    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id);
+    GGML_API size_t       gguf_get_arr_n   (const struct gguf_context * ctx, int64_t key_id);
+
+    // get raw pointer to the first element of the array with the given key_id
+    // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
+    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);
+    GGML_API size_t       gguf_get_arr_data_n(const struct gguf_context * ctx, int64_t key_id);
+
+    // get ith C string from array with given key_id
+    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
+
+    GGML_API int64_t        gguf_get_n_tensors    (const struct gguf_context * ctx);
+    GGML_API int64_t        gguf_find_tensor      (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
+    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id);
+    GGML_API const char *   gguf_get_tensor_name  (const struct gguf_context * ctx, int64_t tensor_id);
+    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int64_t tensor_id);
+    GGML_API size_t         gguf_get_tensor_size  (const struct gguf_context * ctx, int64_t tensor_id);
+
+    // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
+    GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key);
+
+    // overrides an existing KV pair or adds a new one, the new KV pair is always at the back
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t      val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t       val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t     val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t      val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t     val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t      val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float        val);
+    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t     val);
+    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t      val);
+    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double       val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool         val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+
+    // creates a new array with n elements of the given type and copies the corresponding number of bytes from data
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n);
+
+    // creates a new array with n strings and copies the corresponding strings from data
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);
+
+    // add tensor to GGUF context, tensor name must be unique
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+
+    // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
+    //   in such a way that the tensor data remains as one contiguous block (except for padding)
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+
+    // assumes that at least gguf_get_tensor_size bytes can be read from data
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
+
+    // writing gguf files can be done in 3 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
+    //
+    // - write only the meta data to a file, then re-open the file and append the tensor data:
+    //
+    //   gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
+    //   FILE * f = fopen(fname, "ab");
+    //   fwrite(..., f); // write tensor data
+    //   fclose(f);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   const size_t size_meta = gguf_get_meta_size(ctx);
+    //   fseek(f, size_meta, SEEK_SET);
+    //   fwrite(..., f); // write tensor data
+    //   void * data = malloc(size_meta);
+    //   gguf_get_meta_data(ctx, data);
+    //   rewind(f);
+    //   fwrite(data, 1, size_meta, f);
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
+
+    // writes the meta data to pointer "data"
+    GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/include/ollama-debug.h
+++ b/ml/backend/ggml/ggml/include/ollama-debug.h
@@ -0,0 +1,11 @@
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ollama_debug(const struct ggml_tensor *tensor, bool verbose);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/CMakeLists.txt
@@ -0,0 +1,474 @@
+include(CheckCXXCompilerFlag)
+include("../cmake/common.cmake")
+
+add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
+
+# enable libstdc++ assertions for debug builds
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
+endif()
+
+if (NOT MSVC)
+    if (GGML_SANITIZE_THREAD)
+        add_compile_options(-fsanitize=thread)
+        link_libraries     (-fsanitize=thread)
+    endif()
+
+    if (GGML_SANITIZE_ADDRESS)
+        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+        link_libraries     (-fsanitize=address)
+    endif()
+
+    if (GGML_SANITIZE_UNDEFINED)
+        add_compile_options(-fsanitize=undefined)
+        link_libraries     (-fsanitize=undefined)
+    endif()
+endif()
+
+if (GGML_FATAL_WARNINGS)
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        list(APPEND C_FLAGS   -Werror)
+        list(APPEND CXX_FLAGS -Werror)
+    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+        add_compile_options(/WX)
+    endif()
+endif()
+
+if (GGML_ALL_WARNINGS)
+    if (NOT MSVC)
+        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        list(APPEND C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                                  -Werror=implicit-int -Werror=implicit-function-declaration)
+        list(APPEND CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
+
+        list(APPEND C_FLAGS   ${WARNING_FLAGS})
+        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+
+        ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+    else()
+        # todo : msvc
+        set(C_FLAGS   "")
+        set(CXX_FLAGS "")
+    endif()
+endif()
+
+if (GGML_LTO)
+    include(CheckIPOSupported)
+    check_ipo_supported(RESULT result OUTPUT output)
+    if (result)
+        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
+    else()
+        message(WARNING "IPO is not supported: ${output}")
+    endif()
+endif()
+
+if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
+    find_program(GGML_CCACHE_FOUND ccache)
+    find_program(GGML_SCCACHE_FOUND sccache)
+
+    if (GGML_CCACHE_FOUND OR GGML_SCCACHE_FOUND)
+        if(GGML_CCACHE_FOUND)
+            set(GGML_CCACHE_VARIANT ccache)
+        else()
+            set(GGML_CCACHE_VARIANT sccache)
+        endif()
+        # TODO: should not be set globally
+        if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
+        else ()
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        endif ()
+        set(ENV{CCACHE_SLOPPINESS} time_macros)
+        message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
+    else()
+        message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF")
+    endif ()
+endif()
+
+# this version of Apple ld64 is buggy
+# Run the C compiler driver with "-Wl,-v" and capture its stderr in `output`
+# (stdout is discarded) so the linker version string can be inspected below.
+execute_process(
+    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
+    ERROR_VARIABLE output
+    OUTPUT_QUIET
+)
+
+# "\\." reaches the regex engine as "\." (a literal dot). A single backslash would be
+# consumed by CMake's quoted-argument escaping, leaving "." to match any character.
+if (output MATCHES "dyld-1015\\.7")
+    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
+endif()
+
+# architecture specific
+# TODO: probably these flags need to be tweaked on some architectures
+#       feel free to update the Makefile for your architecture and send a pull request or issue
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (MSVC)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+else ()
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
+endif ()
+ggml_get_system_arch()
+message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
+
+if (NOT MSVC)
+    if (GGML_STATIC)
+        if (UNIX AND NOT APPLE)
+            set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
+        endif()
+        add_link_options(-static)
+        if (MINGW)
+            add_link_options(-static-libgcc -static-libstdc++)
+        endif()
+    endif()
+    if (GGML_GPROF)
+        add_compile_options(-pg)
+    endif()
+endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_XOPEN_SOURCE=700)
+elseif (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # Don't define _XOPEN_SOURCE.  We need _ALL_SOURCE, which is the default,
+    # in order to define _SC_PHYS_PAGES.
+else()
+    add_compile_definitions(_XOPEN_SOURCE=600)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (
+    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
+    CMAKE_SYSTEM_NAME MATCHES "iOS"    OR
+    CMAKE_SYSTEM_NAME MATCHES "tvOS"   OR
+    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
+)
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()
+
+if (WIN32)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+
+# ggml
+
+if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
+    message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
+endif()
+
+add_library(ggml-base
+            ../include/ggml.h
+            ../include/ggml-alloc.h
+            ../include/ggml-backend.h
+            ../include/ggml-cpp.h
+            ../include/ggml-opt.h
+            ../include/gguf.h
+            ggml.c
+            ggml.cpp
+            ggml-alloc.c
+            ggml-backend.cpp
+            ggml-opt.cpp
+            ggml-threading.cpp
+            ggml-threading.h
+            ggml-quants.c
+            ggml-quants.h
+            mem_hip.cpp
+            mem_nvml.cpp
+            mem_dxgi_pdh.cpp
+            gguf.cpp)
+
+set_target_properties(ggml-base PROPERTIES
+    VERSION ${GGML_VERSION}
+    SOVERSION ${GGML_VERSION_MAJOR}
+)
+
+target_include_directories(ggml-base PRIVATE .)
+if (GGML_BACKEND_DL)
+    target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
+endif()
+
+if (GGML_SCHED_NO_REALLOC)
+    target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
+endif()
+
+add_library(ggml
+            ggml-backend-reg.cpp)
+add_library(ggml::ggml ALIAS ggml)
+
+set_target_properties(ggml PROPERTIES
+    VERSION ${GGML_VERSION}
+    SOVERSION ${GGML_VERSION_MAJOR}
+)
+
+if (GGML_BACKEND_DIR)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
+    endif()
+    target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
+endif()
+
+target_link_libraries(ggml PUBLIC ggml-base)
+
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    target_link_libraries(ggml PRIVATE dl)
+endif()
+
+# ggml_add_backend_library(<backend> <sources>...)
+#
+# Creates the library target <backend> from the remaining arguments (ARGN) and wires
+# it into the ggml build:
+#   - GGML_BACKEND_DL on : the backend is a MODULE library written to
+#     CMAKE_RUNTIME_OUTPUT_DIRECTORY; `ggml` only gets a build-order dependency on it.
+#   - GGML_BACKEND_DL off: the backend is a regular library that `ggml` links PUBLICly.
+# In both cases the backend links ggml-base privately, is (conditionally) versioned,
+# and is recorded once in the GGML_AVAILABLE_BACKENDS cache list.
+function(ggml_add_backend_library backend)
+    if (GGML_BACKEND_DL)
+        add_library(${backend} MODULE ${ARGN})
+        # write the shared library to the output directory
+        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
+        # build-order dependency only: the module is not linked into ggml
+        add_dependencies(ggml ${backend})
+        # install into the explicit backend directory when one is configured,
+        # otherwise into the binary install directory
+        if (GGML_BACKEND_DIR)
+            install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
+        else()
+            install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+        endif()
+    else()
+        add_library(${backend} ${ARGN})
+        target_link_libraries(ggml PUBLIC ${backend})
+        install(TARGETS ${backend} LIBRARY)
+    endif()
+
+    # every backend links ggml-base and can include headers from the parent directory
+    target_link_libraries(${backend} PRIVATE ggml-base)
+    target_include_directories(${backend} PRIVATE ..)
+
+    # GGML_BACKEND_BUILD is private to the backend's own sources, while
+    # GGML_BACKEND_SHARED is PUBLIC and so propagates to targets linking the backend
+    if (${BUILD_SHARED_LIBS})
+        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
+        target_compile_definitions(${backend} PUBLIC  GGML_BACKEND_SHARED)
+    endif()
+
+    # Set versioning properties for all backend libraries
+    # Building a MODULE library with a version is not supported on macOS (https://gitlab.kitware.com/cmake/cmake/-/issues/20782)
+    if (NOT (APPLE AND GGML_BACKEND_DL))
+        set_target_properties(${backend} PROPERTIES
+            VERSION ${GGML_VERSION}
+            SOVERSION ${GGML_VERSION_MAJOR}
+        )
+    endif()
+
+    # append the backend to the cached list exactly once (list(FIND) yields -1 when absent)
+    if(NOT GGML_AVAILABLE_BACKENDS)
+        set(GGML_AVAILABLE_BACKENDS "${backend}"
+            CACHE INTERNAL "List of backends for cmake package")
+    else()
+        list(FIND GGML_AVAILABLE_BACKENDS "${backend}" has_backend)
+        if(has_backend EQUAL -1)
+            set(GGML_AVAILABLE_BACKENDS "${GGML_AVAILABLE_BACKENDS};${backend}"
+                CACHE INTERNAL "List of backends for cmake package")
+        endif()
+    endif()
+endfunction()
+
+# ggml_add_backend(<backend>)
+#
+# Conditionally pulls one backend into the build. The variable GGML_<BACKEND>
+# (name upper-cased) gates everything; when it is true the subdirectory
+# ggml-<backend> (name lower-cased) is added. When GGML_BACKEND_DL is off, the
+# `ggml` target additionally gets the PUBLIC definition GGML_USE_<BACKEND>.
+function(ggml_add_backend backend)
+    string(TOUPPER "GGML_${backend}" backend_id)
+    # ${backend_id} expands to the option name, so this tests the option's value
+    if (${backend_id})
+        string(TOLOWER "ggml-${backend}" backend_target)
+        add_subdirectory(${backend_target})
+        message(STATUS "Including ${backend} backend")
+        if (NOT GGML_BACKEND_DL)
+            string(TOUPPER "GGML_USE_${backend}" backend_use)
+            target_compile_definitions(ggml PUBLIC ${backend_use})
+        endif()
+    endif()
+endfunction()
+
+# ggml_add_cpu_backend_variant(<tag_name> <feature>...)
+#
+# Builds one CPU backend variant named after <tag_name> with the listed features
+# (ARGN) switched on, then makes the aggregate `ggml-cpu` target depend on
+# `ggml-cpu-<tag_name>`. Feature variables are set inside this function's scope:
+#   - x86   : GGML_<feat>, after first forcing every known x86 feature OFF
+#   - others: GGML_INTERNAL_<feat> (s390x and riscv64 first force their known
+#             features OFF; ARM and PowerPC only switch the requested ones ON)
+# The actual target is created by ggml_add_cpu_backend_variant_impl, which is
+# defined outside this file section.
+function(ggml_add_cpu_backend_variant tag_name)
+    set(GGML_CPU_TAG_NAME ${tag_name})
+    # other: OPENMP LLAMAFILE CPU_HBM
+    if (GGML_SYSTEM_ARCH STREQUAL "x86")
+        # start from a clean slate so only the requested features are enabled
+        foreach (feat NATIVE
+                      SSE42
+                      AVX AVX2 BMI2 AVX_VNNI FMA F16C
+                      AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
+                      AMX_TILE AMX_INT8 AMX_BF16)
+            set(GGML_${feat} OFF)
+        endforeach()
+
+        foreach (feat ${ARGN})
+            set(GGML_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        foreach (feat VXE2 NNPA)
+            set(GGML_INTERNAL_${feat} OFF)
+        endforeach()
+
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        foreach (feat RVV)
+            set(GGML_INTERNAL_${feat} OFF)
+        endforeach()
+
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    endif()
+
+    # create the variant target, then hook it under the aggregate ggml-cpu target
+    ggml_add_cpu_backend_variant_impl(${tag_name})
+    add_dependencies(ggml-cpu ggml-cpu-${tag_name})
+endfunction()
+
+ggml_add_backend(CPU)
+
+if (GGML_CPU_ALL_VARIANTS)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+    elseif (GGML_CPU_ARM_ARCH)
+        message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
+    endif()
+    add_custom_target(ggml-cpu)
+    if (GGML_SYSTEM_ARCH STREQUAL "x86")
+        ggml_add_cpu_backend_variant(x64)
+        ggml_add_cpu_backend_variant(sse42        SSE42)
+        ggml_add_cpu_backend_variant(sandybridge  SSE42 AVX)
+        ggml_add_cpu_backend_variant(haswell      SSE42 AVX F16C AVX2 BMI2 FMA)
+        ggml_add_cpu_backend_variant(skylakex     SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
+        ggml_add_cpu_backend_variant(icelake      SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+        ggml_add_cpu_backend_variant(alderlake    SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+    elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            # Many of these features are optional so we build versions with popular
+            # combinations and name the backends based on the version they were
+            # first released with
+            ggml_add_cpu_backend_variant(armv8.0_1)
+            ggml_add_cpu_backend_variant(armv8.2_1    DOTPROD)
+            ggml_add_cpu_backend_variant(armv8.2_2    DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(armv8.2_3    DOTPROD FP16_VECTOR_ARITHMETIC SVE)
+            ggml_add_cpu_backend_variant(armv8.6_1    DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
+            ggml_add_cpu_backend_variant(armv8.6_2    DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
+            ggml_add_cpu_backend_variant(armv9.2_1    DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
+            ggml_add_cpu_backend_variant(armv9.2_2    DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
+        elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
+            # Android-specific backends with SoC-compatible feature sets
+            ggml_add_cpu_backend_variant(android_armv8.0_1)
+            ggml_add_cpu_backend_variant(android_armv8.2_1    DOTPROD)
+            ggml_add_cpu_backend_variant(android_armv8.2_2    DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(android_armv8.6_1    DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+        elseif (APPLE)
+            ggml_add_cpu_backend_variant(apple_m1             DOTPROD)
+            ggml_add_cpu_backend_variant(apple_m2_m3          DOTPROD MATMUL_INT8)
+            ggml_add_cpu_backend_variant(apple_m4             DOTPROD MATMUL_INT8 NOSVE SME)
+        else()
+            message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(power0)
+            ggml_add_cpu_backend_variant(power7_1       POWER7)
+            ggml_add_cpu_backend_variant(power7_2       POWER7  VSX)
+            ggml_add_cpu_backend_variant(power8_1       POWER8)
+            ggml_add_cpu_backend_variant(power8_2       POWER8  VSX)
+            ggml_add_cpu_backend_variant(power9         POWER9  VSX)
+            ggml_add_cpu_backend_variant(power10        POWER10 VSX)
+            ggml_add_cpu_backend_variant(power11        POWER11 VSX)
+        else()
+            message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(z15    Z15 VXE2)
+            ggml_add_cpu_backend_variant(z16    Z16 VXE2 NNPA)
+        else()
+            message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(riscv64_0)
+            ggml_add_cpu_backend_variant(riscv64_v   RVV)
+        else()
+            message(FATAL_ERROR "Unsupported RISC-V target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
+    else()
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
+    endif()
+elseif (GGML_CPU)
+    ggml_add_cpu_backend_variant_impl("")
+endif()
+
+ggml_add_backend(BLAS)
+ggml_add_backend(CANN)
+ggml_add_backend(CUDA)
+ggml_add_backend(HIP)
+ggml_add_backend(METAL)
+ggml_add_backend(MUSA)
+ggml_add_backend(RPC)
+ggml_add_backend(SYCL)
+ggml_add_backend(Vulkan)
+ggml_add_backend(WebGPU)
+ggml_add_backend(zDNN)
+ggml_add_backend(OpenCL)
+ggml_add_backend(Hexagon)
+ggml_add_backend(ZenDNN)
+
+foreach (target ggml-base ggml)
+    target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
+    target_compile_features   (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
+endforeach()
+
+target_link_libraries(ggml-base PRIVATE Threads::Threads)
+
+find_library(MATH_LIBRARY m)
+if (MATH_LIBRARY)
+    if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
+        target_link_libraries(ggml-base PRIVATE m)
+    endif()
+endif()
+
+if (CMAKE_SYSTEM_NAME MATCHES "Android")
+    target_link_libraries(ggml-base PRIVATE dl)
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
+    target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
+endif()
+
+if (BUILD_SHARED_LIBS)
+    foreach (target ggml-base ggml)
+        set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        target_compile_definitions(${target} PRIVATE GGML_BUILD)
+        target_compile_definitions(${target} PUBLIC  GGML_SHARED)
+    endforeach()
+endif()
--- a/ml/backend/ggml/ggml/src/ggml-alloc.c
+++ b/ml/backend/ggml/ggml/src/ggml-alloc.c
--- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h
@@ -0,0 +1,275 @@
+#pragma once
+
+// ggml-backend internal header
+
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    #define GGML_BACKEND_API_VERSION 2
+
+    //
+    // Backend buffer type
+    //
+
+    struct ggml_backend_buffer_type_i {
+        const char *          (*get_name)      (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
+        ggml_backend_buffer_t (*alloc_buffer)  (ggml_backend_buffer_type_t buft, size_t size);
+        // tensor alignment
+        size_t                (*get_alignment) (ggml_backend_buffer_type_t buft);
+        // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
+        size_t                (*get_max_size)  (ggml_backend_buffer_type_t buft);
+        // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
+        size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
+        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
+        bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
+
+        // (optional) returns a dummy buffer that is equivalent to one created by alloc_buffer but without actually being backed
+        // by memory
+        ggml_backend_buffer_t (*noalloc_buffer)(ggml_backend_buffer_type_t buft, size_t size);
+    };
+
+    // A buffer type: the function table above plus the data it operates on.
+    struct ggml_backend_buffer_type {
+        struct ggml_backend_buffer_type_i  iface;   // function table for this buffer type
+        ggml_backend_dev_t device;                  // device handle associated with this buffer type
+        void * context;                             // opaque pointer; presumably backend-private state - confirm
+        // NOTE(review): presumably marks buffer types that hand out dummy buffers not backed
+        // by memory (cf. iface.noalloc_buffer) - confirm against the code that sets it
+        bool no_alloc;
+    };
+
+    //
+    // Backend buffer
+    //
+
+    struct ggml_backend_buffer_i {
+        // (optional) free the buffer
+        void         (*free_buffer)  (ggml_backend_buffer_t buffer);
+        // base address of the buffer
+        void *       (*get_base)     (ggml_backend_buffer_t buffer);
+        // (optional) initialize a tensor in the buffer (eg. add tensor extras)
+        enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        // tensor data access
+        void         (*memset_tensor)(ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
+        void         (*set_tensor)   (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*get_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
+        bool         (*cpy_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        // clear the entire buffer
+        void         (*clear)        (ggml_backend_buffer_t buffer, uint8_t value);
+        // (optional) reset any internal state due to tensor initialization, such as tensor extras
+        void         (*reset)        (ggml_backend_buffer_t buffer);
+    };
+
+    // A buffer instance: function table, the buffer type it belongs to, and its data.
+    struct ggml_backend_buffer {
+        struct ggml_backend_buffer_i  iface;    // function table for this buffer
+        ggml_backend_buffer_type_t    buft;     // buffer type handle for this buffer
+        void * context;                         // opaque pointer; presumably backend-private state - confirm
+        size_t size;                            // buffer size (presumably in bytes - confirm)
+        enum ggml_backend_buffer_usage usage;   // usage tag; enum values are declared outside this header
+        // NOTE(review): presumably true for dummy buffers that are not backed by memory - confirm
+        bool no_alloc;
+    };
+
+    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+                   ggml_backend_buffer_type_t buft,
+            struct ggml_backend_buffer_i      iface,
+                   void *                     context,
+                   size_t                     size);
+
+    // do not use directly, use ggml_backend_tensor_copy instead
+    GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // multi-buffer
+    // buffer that contains a collection of buffers
+    GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_API bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_API void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
+    //
+    // Backend (stream)
+    //
+
+    struct ggml_backend_i {
+        const char * (*get_name)(ggml_backend_t backend);
+
+        void (*free)(ggml_backend_t backend);
+
+        // (optional) asynchronous tensor data access
+        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        // (optional) complete all pending operations (required if the backend supports async operations)
+        void (*synchronize)(ggml_backend_t backend);
+
+        // (optional) graph plans (not used currently)
+        // compute graph with a plan
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void                      (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status          (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+        // compute graph (always async if supported by the backend). batch_size may be -1 if unknown
+        enum ggml_status          (*graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size);
+
+        // (optional) event synchronization
+        // record an event on this stream
+        void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
+        // wait for an event on a different stream
+        void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
+
+        // (optional) sort/optimize the nodes in the graph
+        void                      (*graph_optimize)    (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+        // (optional) reserves intermediate buffers needed for the computation
+        // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
+        enum ggml_status          (*graph_reserve)     (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc);
+
+        // (optional) returns the memory needed after calling graph_reserve
+        size_t                    (*buffer_size)       (ggml_backend_t backend);
+
+        // (optional) frees memory from intermediate buffers that was allocated either by graph_compute or graph_reserve
+        void                      (*reset)             (ggml_backend_t backend);
+    };
+
+    // A backend (stream) instance: identity, function table, owning device and data.
+    struct ggml_backend {
+        ggml_guid_t guid;               // GUID value; presumably identifies the backend implementation - confirm
+        struct ggml_backend_i iface;    // function table for this backend
+        ggml_backend_dev_t device;      // device handle associated with this backend
+        void * context;                 // opaque pointer; presumably backend-private state - confirm
+    };
+
+    // An event object used by the event_* entries of the backend and device interfaces.
+    struct ggml_backend_event {
+        struct ggml_backend_device * device;    // device this event is associated with
+        void * context;                         // opaque pointer; presumably the backend's native event - confirm
+    };
+
+    //
+    // Backend device
+    //
+
+    // Note: if additional properties are needed, we should add a struct with all of them
+    //       the current functions to obtain the properties can remain, since they are more convenient for often used properties
+    struct ggml_backend_device_i {
+        // device name: short identifier for this device, such as "CPU" or "CUDA0"
+        const char * (*get_name)(ggml_backend_dev_t dev);
+
+        // device description: short informative description of the device, could be the model name
+        const char * (*get_description)(ggml_backend_dev_t dev);
+
+        // device memory in bytes
+        void         (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
+
+        // device type
+        enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
+
+        // device properties
+        void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
+
+        // backend (stream) initialization
+        ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
+
+        // preferred buffer type
+        ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
+
+        // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
+        ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
+
+        // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
+        ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
+
+        // check if the backend can compute an operation
+        bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
+
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
+
+        // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
+        // these should be expensive operations that may benefit from running on this backend instead of the CPU backend
+        bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*event_new)         (ggml_backend_dev_t dev);
+        void                 (*event_free)        (ggml_backend_dev_t dev, ggml_backend_event_t event);
+        void                 (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+
+        // (optional) reset device, clearing existing allocations and context
+        // the caller must ensure that there are no outstanding buffers, as these will become invalid
+        void (*reset)(ggml_backend_dev_t dev);
+    };
+
+    // A device instance: function table, the registry it is attached to, and its data.
+    struct ggml_backend_device {
+        struct ggml_backend_device_i iface;     // function table for this device
+        ggml_backend_reg_t reg;                 // backend registry handle for this device
+        void * context;                         // opaque pointer; presumably backend-private state - confirm
+    };
+
+    //
+    // Backend (reg)
+    //
+
+    struct ggml_backend_reg_i {
+        const char * (*get_name)(ggml_backend_reg_t reg);
+
+        // enumerate available devices
+        size_t             (*get_device_count)(ggml_backend_reg_t reg);
+        ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
+
+        // (optional) get a pointer to a function in the backend
+        // backends can add custom functions that are not part of the standard ggml-backend interface
+        void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
+    };
+
+    // A backend registry entry: API version stamp, function table and data.
+    struct ggml_backend_reg {
+        int api_version; // initialize to GGML_BACKEND_API_VERSION
+        struct ggml_backend_reg_i iface;    // function table for this registry
+        void * context;                     // opaque pointer; presumably backend-private state - confirm
+    };
+
+    // Add backend dynamic loading support to the backend
+
+    // Initialize the backend
+    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+    // Optional: obtain a score for the backend based on the system configuration
+    // Higher scores are preferred, 0 means the backend is not supported in the current system
+    typedef int                (*ggml_backend_score_t)(void);
+
+// Entry-point generators for dynamically loadable backends.
+//
+// With GGML_BACKEND_DL defined:
+//   GGML_BACKEND_DL_IMPL(reg_fn)         declares and defines ggml_backend_init(),
+//                                        which simply returns reg_fn()
+//   GGML_BACKEND_DL_SCORE_IMPL(score_fn) declares and defines ggml_backend_score(),
+//                                        which simply returns score_fn()
+// The C++ variants wrap the declaration in extern "C" so the symbol has C linkage;
+// the C variants need no wrapper. Without GGML_BACKEND_DL both macros expand to nothing.
+// NOTE(review): GGML_BACKEND_API is defined outside this header; presumably it carries
+// the export/visibility attribute - confirm.
+#ifdef GGML_BACKEND_DL
+#    ifdef __cplusplus
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                             \
+            extern "C" {                                                 \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+            }                                                            \
+            ggml_backend_reg_t ggml_backend_init(void) {                 \
+                return reg_fn();                                         \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)       \
+            extern "C" {                                   \
+            GGML_BACKEND_API int ggml_backend_score(void); \
+            }                                              \
+            int ggml_backend_score(void) {                 \
+                return score_fn();                         \
+            }
+#    else
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                              \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);  \
+            ggml_backend_reg_t                  ggml_backend_init(void) { \
+                return reg_fn();                                          \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)        \
+            GGML_BACKEND_API int ggml_backend_score(void);  \
+            int                  ggml_backend_score(void) { \
+                return score_fn();                          \
+            }
+#    endif
+#else
+// no dynamic loading: the entry points are not generated
+#    define GGML_BACKEND_DL_IMPL(reg_fn)
+#    define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
+#endif
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -0,0 +1,649 @@
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include <algorithm>
+#include <cstring>
+#include <filesystem>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+#include <cctype>
+
+#ifdef _WIN32
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#elif defined(__APPLE__)
+#    include <mach-o/dyld.h>
+#    include <dlfcn.h>
+#else
+#    include <dlfcn.h>
+#    include <unistd.h>
+#endif
+
+// Backend registry
+#ifdef GGML_USE_CPU
+#include "ggml-cpu.h"
+#endif
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
+#ifdef GGML_USE_WEBGPU
+#include "ggml-webgpu.h"
+#endif
+
+#ifdef GGML_USE_ZDNN
+#include "ggml-zdnn.h"
+#endif
+
+#ifdef GGML_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
+#ifdef GGML_USE_HEXAGON
+#include "ggml-hexagon.h"
+#endif
+
+#ifdef GGML_USE_BLAS
+#include "ggml-blas.h"
+#endif
+
+#ifdef GGML_USE_RPC
+#include "ggml-rpc.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_ZENDNN
+#include "ggml-zendnn.h"
+#endif
+
+// disable C++17 deprecation warning for std::codecvt_utf8
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+namespace fs = std::filesystem;
+
+// Converts a filesystem path to a UTF-8 encoded std::string; yields "" if the conversion throws.
+static std::string path_str(const fs::path & path) {
+    try {
+#if defined(__cpp_lib_char8_t)
+        // C++20 and later: u8string() returns std::u8string, so its bytes are reinterpreted as char
+        const std::u8string utf8 = path.u8string();
+        return std::string(reinterpret_cast<const char*>(utf8.c_str()));
+#else
+        // C++17: u8string() returns std::string
+        return path.u8string();
+#endif
+    } catch (...) {
+        return std::string(); // best effort: callers only use the result for log messages
+    }
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
+#endif
+
+#ifdef _WIN32
+
+using dl_handle = std::remove_pointer_t<HMODULE>; // pointee type of HMODULE, so that dl_handle * is an HMODULE
+
+struct dl_handle_deleter { // deleter type used by dl_handle_ptr
+    void operator()(HMODULE handle) {
+        FreeLibrary(handle); // hands the module handle back to the Windows loader
+    }
+};
+
+static dl_handle * dl_load_library(const fs::path & path) {
+    // suppress error dialogs for missing DLLs
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    HMODULE handle = LoadLibraryW(path.wstring().c_str());
+    if (!handle) {
+        DWORD error_code = GetLastError();
+        std::string msg;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (bufLen) {
+            msg = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+            GGML_LOG_INFO("%s unable to load library %s: %s\n", __func__, path_str(path).c_str(), msg.c_str());
+        }
+    }
+
+    SetErrorMode(old_mode);
+
+    return handle;
+}
+
+// Looks up the exported symbol `name` in a loaded DLL and returns the GetProcAddress result.
+static void * dl_get_sym(dl_handle * handle, const char * name) {
+    // keep critical-error dialogs suppressed for the duration of the lookup
+    const DWORD saved_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(saved_mode | SEM_FAILCRITICALERRORS);
+
+    void * const sym = (void *) GetProcAddress(handle, name);
+    SetErrorMode(saved_mode); // restore the error mode that was active on entry
+    return sym;
+}
+
+static const char * dl_error() { // Windows variant: always the empty string (dl_load_library logs the failure text itself)
+    return "";
+}
+
+#else
+
+using dl_handle = void; // dlopen handles are plain void pointers, so dl_handle * is void *
+
+struct dl_handle_deleter { // deleter type used by dl_handle_ptr
+    void operator()(void * handle) {
+        dlclose(handle); // drops the reference obtained from dlopen
+    }
+};
+
+// Opens the shared library at `path` with RTLD_NOW | RTLD_LOCAL; returns the dlopen result (NULL on failure).
+static void * dl_load_library(const fs::path & path) {
+    const std::string native_path = path.string();
+    return dlopen(native_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+}
+
+static void * dl_get_sym(dl_handle * handle, const char * name) { // symbol lookup in a library opened by dl_load_library
+    return dlsym(handle, name); // result is passed through unchanged
+}
+
+static const char * dl_error() { // text of the pending dlerror() message, or "" when there is none
+    const char * msg = dlerror();
+    return msg ? msg : "";
+}
+
+#endif
+
+using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>; // owning library handle, released via dl_handle_deleter
+
+struct ggml_backend_reg_entry { // one registered backend
+    ggml_backend_reg_t reg; // the backend registration itself
+    dl_handle_ptr handle; // library the backend was loaded from; null when registered without a handle
+};
+
+struct ggml_backend_registry { // list of registered backends plus a score-ordered list of their devices
+    std::vector<ggml_backend_reg_entry> backends; // in registration order
+    std::vector<std::pair<ggml_backend_dev_t, int>> devices; // (device, score), kept sorted by descending score
+
+    ggml_backend_registry() { // registers every backend compiled in via a GGML_USE_* macro, in the order listed
+#ifdef GGML_USE_CUDA
+        register_backend(ggml_backend_cuda_reg());
+#endif
+#ifdef GGML_USE_METAL
+        register_backend(ggml_backend_metal_reg());
+#endif
+#ifdef GGML_USE_SYCL
+        register_backend(ggml_backend_sycl_reg());
+#endif
+#ifdef GGML_USE_VULKAN
+        register_backend(ggml_backend_vk_reg());
+#endif
+#ifdef GGML_USE_WEBGPU
+        register_backend(ggml_backend_webgpu_reg());
+#endif
+#ifdef GGML_USE_ZDNN
+        register_backend(ggml_backend_zdnn_reg());
+#endif
+#ifdef GGML_USE_OPENCL
+        register_backend(ggml_backend_opencl_reg());
+#endif
+#ifdef GGML_USE_ZENDNN
+        register_backend(ggml_backend_zendnn_reg());
+#endif
+#ifdef GGML_USE_HEXAGON
+        register_backend(ggml_backend_hexagon_reg());
+#endif
+#ifdef GGML_USE_CANN
+        register_backend(ggml_backend_cann_reg());
+#endif
+#ifdef GGML_USE_BLAS
+        register_backend(ggml_backend_blas_reg());
+#endif
+#ifdef GGML_USE_RPC
+        register_backend(ggml_backend_rpc_reg());
+#endif
+#ifdef GGML_USE_CPU
+        register_backend(ggml_backend_cpu_reg());
+#endif
+    }
+
+    ~ggml_backend_registry() {
+        // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
+        // since backend threads may still be running and accessing resources from the dynamic library
+        for (auto & entry : backends) {
+            if (entry.handle) {
+                entry.handle.release(); // NOLINT - gives up ownership without running the deleter, so the library stays loaded
+            }
+        }
+    }
+
+    void register_backend(ggml_backend_reg_t reg, int score = -1, dl_handle_ptr handle = nullptr) { // null `reg` is ignored
+        if (!reg) {
+            return;
+        }
+
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
+            __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
+#endif
+        backends.push_back({ reg, std::move(handle) }); // the entry takes ownership of the library handle
+        for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
+            register_device(ggml_backend_reg_dev_get(reg, i), score); // every device inherits the backend's score
+        }
+    }
+
+    void register_device(ggml_backend_dev_t device, int score = -1) { // appends the device, then re-sorts the list
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+#endif
+        devices.push_back({device, score});
+        std::stable_sort(devices.begin(), devices.end(), // stable: equal scores keep their registration order
+            [](const auto & a, const auto & b) {
+                return a.second > b.second; // higher score first
+            }
+        );
+    }
+
+    ggml_backend_reg_t load_backend(const fs::path & path, bool silent) { // returns nullptr on any failure
+        dl_handle_ptr handle { dl_load_library(path) }; // every early return below destroys `handle`, which closes the library
+        if (!handle) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
+            }
+            return nullptr;
+        }
+
+        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); // optional export
+        if (score_fn && score_fn() == 0) { // a score of 0 marks the backend as unsupported on this system
+            if (!silent) {
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path_str(path).c_str());
+            }
+            return nullptr;
+        }
+
+        auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init"); // mandatory export
+        if (!backend_init_fn) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path_str(path).c_str());
+            }
+            return nullptr;
+        }
+
+        ggml_backend_reg_t reg = backend_init_fn();
+        if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { // reject a null result or a mismatching API version
+            if (!silent) {
+                if (!reg) {
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n",
+                        __func__, path_str(path).c_str());
+                } else {
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
+                        __func__, path_str(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
+                }
+            }
+            return nullptr;
+        }
+
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
+
+        register_backend(reg, score_fn ? score_fn() : -1, std::move(handle)); // score_fn is called a second time here; -1 when absent
+
+        return reg;
+    }
+
+    void unload_backend(ggml_backend_reg_t reg, bool silent) { // removes `reg` and all of its devices from the registry
+        auto it = std::find_if(backends.begin(), backends.end(),
+                               [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
+
+        if (it == backends.end()) {
+            if (!silent) {
+                GGML_LOG_ERROR("%s: backend not found\n", __func__);
+            }
+            return;
+        }
+
+        if (!silent) {
+            GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
+        }
+
+        // remove devices
+        devices.erase(
+            std::remove_if(devices.begin(), devices.end(),
+                            [reg](std::pair<ggml_backend_dev_t, int> dev) { return ggml_backend_dev_backend_reg(dev.first) == reg; }),
+            devices.end());
+
+        // remove backend
+        backends.erase(it); // destroying the entry runs dl_handle_deleter on its handle, if it holds one
+    }
+};
+
+static ggml_backend_registry & get_reg() { // accessor for the single registry instance
+    static ggml_backend_registry reg; // function-local static: constructed on the first call
+    return reg;
+}
+
+// Internal API
+void ggml_backend_register(ggml_backend_reg_t reg) { // adds `reg` and its devices to the registry
+    get_reg().register_backend(reg); // default arguments: score -1, no library handle
+}
+
+void ggml_backend_device_register(ggml_backend_dev_t device) { // adds a single device to the registry
+    get_reg().register_device(device); // default score of -1
+}
+
+// Backend (reg) enumeration
+static bool striequals(const char * a, const char * b) { // case-insensitive equality of two NUL-terminated strings
+    for (; *a && *b; a++, b++) {
+        // std::tolower has undefined behavior for negative char values, so convert through unsigned char first
+        if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) { return false; }
+    }
+    // at least one string has ended: they are equal only if both ended together
+    return *a == *b;
+}
+
+size_t ggml_backend_reg_count() { // number of backends currently in the registry
+    return get_reg().backends.size();
+}
+
+ggml_backend_reg_t ggml_backend_reg_get(size_t index) { // backend at position `index` in registration order
+    GGML_ASSERT(index < ggml_backend_reg_count()); // an out-of-range index is treated as a caller bug
+    return get_reg().backends[index].reg;
+}
+
+ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
+    // case-insensitive linear scan; the first match in registration order is returned
+    const size_t n_regs = ggml_backend_reg_count();
+    for (size_t idx = 0; idx != n_regs; ++idx) {
+        ggml_backend_reg_t candidate = ggml_backend_reg_get(idx);
+        if (striequals(ggml_backend_reg_name(candidate), name)) { return candidate; }
+    }
+    return nullptr; // no backend with that name
+}
+
+// Device enumeration
+size_t ggml_backend_dev_count() { // number of devices currently in the registry
+    return get_reg().devices.size();
+}
+
+ggml_backend_dev_t ggml_backend_dev_get(size_t index) { // device at position `index` of the score-ordered device list
+    GGML_ASSERT(index < ggml_backend_dev_count()); // an out-of-range index is treated as a caller bug
+    return get_reg().devices[index].first; // .first is the device, .second its score
+}
+
+ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
+    // case-insensitive linear scan over the device list; the first match is returned
+    const size_t n_devs = ggml_backend_dev_count();
+    for (size_t idx = 0; idx != n_devs; ++idx) {
+        ggml_backend_dev_t candidate = ggml_backend_dev_get(idx);
+        if (striequals(ggml_backend_dev_name(candidate), name)) { return candidate; }
+    }
+    return nullptr; // no device with that name
+}
+
+ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
+    // linear scan over the device list; the first device reporting `type` is returned
+    const size_t n_devs = ggml_backend_dev_count();
+    for (size_t idx = 0; idx != n_devs; ++idx) {
+        ggml_backend_dev_t candidate = ggml_backend_dev_get(idx);
+        if (ggml_backend_dev_type(candidate) == type) { return candidate; }
+    }
+    return nullptr; // no device of that type
+}
+
+// Convenience functions
+// Initializes a backend on the device called `name` (see ggml_backend_dev_by_name).
+// Returns nullptr when no registered device has that name.
+ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
+    ggml_backend_dev_t found = ggml_backend_dev_by_name(name);
+    // `params` is forwarded untouched to the device's init routine
+    return found ? ggml_backend_dev_init(found, params) : nullptr;
+}
+
+// Initializes a backend on the first registered device of the given type.
+// Returns nullptr when the registry holds no device of that type.
+ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
+    ggml_backend_dev_t found = ggml_backend_dev_by_type(type);
+    // `params` is forwarded untouched to the device's init routine
+    return found ? ggml_backend_dev_init(found, params) : nullptr;
+}
+
+// Initializes a backend (with null params) on the first device found when trying, in order,
+// GGML_BACKEND_DEVICE_TYPE_GPU, GGML_BACKEND_DEVICE_TYPE_IGPU and GGML_BACKEND_DEVICE_TYPE_CPU.
+// Returns nullptr when none of these device types is registered.
+ggml_backend_t ggml_backend_init_best(void) {
+    ggml_backend_dev_t chosen = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
+    if (!chosen) { chosen = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); }
+    if (!chosen) { chosen = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); }
+    return chosen ? ggml_backend_dev_init(chosen, nullptr) : nullptr;
+}
+
+// Dynamic loading
+ggml_backend_reg_t ggml_backend_load(const char * path) { // loads and registers the backend library at `path`
+    return get_reg().load_backend(path, false); // silent = false: failures are logged; nullptr is returned on failure
+}
+
+void ggml_backend_unload(ggml_backend_reg_t reg) { // removes `reg` and its devices from the registry
+    get_reg().unload_backend(reg, true); // silent = true: an unknown `reg` is ignored without logging
+}
+
+// Returns the directory of the running executable with a trailing separator ("./" or an empty path if unknown)
+static fs::path get_executable_path() {
+#if defined(__APPLE__)
+    std::vector<char> path;
+    uint32_t size;
+    while (true) {
+        size = path.size();
+        if (_NSGetExecutablePath(path.data(), &size) == 0) {
+            break;
+        }
+        path.resize(size); // the call failed: retry with the buffer size it reported back in `size`
+    }
+    std::string base_path(path.data(), size);
+    auto last_slash = base_path.find_last_of('/'); // remove executable name
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return base_path + "/";
+#elif defined(__linux__) || defined(__FreeBSD__)
+    std::string base_path = "."; // fallback when the link cannot be read
+    std::vector<char> path(1024);
+    while (true) {
+#    if defined(__linux__)
+        ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
+        if (len == -1) {
+            break;
+        }
+        if (len < (ssize_t) path.size()) {
+            base_path = std::string(path.data(), len);
+            auto last_slash = base_path.find_last_of('/'); // remove executable name
+            if (last_slash != std::string::npos) {
+                base_path = base_path.substr(0, last_slash);
+            }
+            break;
+        }
+        path.resize(path.size() * 2); // buffer completely filled: the result may be truncated, retry
+    }
+    return base_path + "/";
+#elif defined(_WIN32)
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), (DWORD) path.size());
+    while (len != 0 && len >= path.size()) {
+        // a result that fills the whole buffer means the path was truncated: retry with a larger buffer
+        path.resize(path.size() * 2);
+        len = GetModuleFileNameW(NULL, path.data(), (DWORD) path.size());
+    }
+    if (len == 0) {
+        return {};
+    }
+    std::wstring base_path(path.data(), len);
+    auto last_slash = base_path.find_last_of('\\'); // remove executable name
+    if (last_slash != std::wstring::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return base_path + L"\\";
+#else
+    return {};
+#endif
+}
+
+static fs::path backend_filename_prefix() { // "libggml-" everywhere except Windows, which uses "ggml-"
+#ifndef _WIN32
+    return fs::u8path("libggml-");
+#else
+    return fs::u8path("ggml-");
+#endif
+}
+
+static fs::path backend_filename_extension() { // ".so" everywhere except Windows, which uses ".dll"
+#ifndef _WIN32
+    return fs::u8path(".so");
+#else
+    return fs::u8path(".dll");
+#endif
+}
+
+static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
+    // Scores every [lib]ggml-name-*.[so|dll] variant in the search paths and loads the best one; if no variant
+    // scores above 0, the unsuffixed base library is tried instead. Returns nullptr when nothing was loaded.
+    const fs::path name_path = fs::u8path(name);
+    const fs::path file_prefix = backend_filename_prefix().native() + name_path.native() + fs::u8path("-").native();
+    const fs::path file_extension = backend_filename_extension();
+
+    std::vector<fs::path> search_paths;
+    if (user_search_path == nullptr) {
+#ifdef GGML_BACKEND_DIR
+        search_paths.push_back(fs::u8path(GGML_BACKEND_DIR));
+#endif
+        // default search paths: executable directory, current directory
+        search_paths.push_back(get_executable_path());
+        search_paths.push_back(fs::current_path());
+    } else {
+        search_paths.push_back(fs::u8path(user_search_path));
+    }
+    int best_score = 0;
+    fs::path best_path;
+    for (const auto & search_path : search_paths) {
+        std::error_code ec;
+        if (!fs::exists(search_path, ec)) {
+            if (ec) {
+                GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
+            } else {
+                GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
+            }
+            continue;
+        }
+        // non-throwing overloads: a path that cannot be enumerated (e.g. not a directory) is skipped, not thrown on
+        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied, ec);
+        if (ec) {
+            GGML_LOG_DEBUG("%s: failed to enumerate %s: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
+        }
+        for (; !ec && dir_it != fs::directory_iterator(); dir_it.increment(ec)) {
+            const fs::directory_entry & entry = *dir_it;
+            if (!entry.is_regular_file(ec)) { continue; }
+            const auto filename = entry.path().filename();
+            if (filename.native().find(file_prefix) != 0 || entry.path().extension() != file_extension) { continue; }
+            dl_handle_ptr handle { dl_load_library(entry) }; // probe only: closed again at the end of this iteration
+            if (!handle) {
+                if (!silent) {
+                    GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
+                }
+                continue;
+            }
+            auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+            if (!score_fn) {
+                if (!silent) {
+                    GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, path_str(entry.path()).c_str());
+                }
+                continue;
+            }
+            const int s = score_fn();
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, path_str(entry.path()).c_str(), s);
+#endif
+            if (s > best_score) { // strictly greater: the first variant seen wins a tie
+                best_score = s;
+                best_path = entry.path();
+            }
+        }
+    }
+
+    if (best_score == 0) {
+        // try to load the base backend
+        const fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
+        for (const auto & search_path : search_paths) {
+            const fs::path path = search_path / filename;
+            if (std::error_code ec; fs::exists(path, ec)) {
+                return get_reg().load_backend(path, silent);
+            } else if (ec) {
+                GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(path).c_str(), ec.message().c_str());
+            }
+        }
+        return nullptr;
+    }
+    return get_reg().load_backend(best_path, silent);
+}
+
+void ggml_backend_load_all() { // loads all known backends from the default search paths
+    ggml_backend_load_all_from_path(nullptr); // nullptr: no user-supplied directory
+}
+
+// Tries to load the best available variant of every known backend from `dir_path` (nullptr selects the
+// default search paths), then the out-of-tree backend named by GGML_BACKEND_PATH, if that variable is set.
+void ggml_backend_load_all_from_path(const char * dir_path) {
+    // builds with NDEBUG pass silent = true to the loader; other builds let it log failures
+#ifdef NDEBUG
+    const bool silent = true;
+#else
+    const bool silent = false;
+#endif
+
+    // backend names, attempted in exactly this order
+    static const char * const backend_names[] = {
+        "blas", "zendnn", "cann", "cuda", "hip", "metal", "rpc",
+        "sycl", "vulkan", "opencl", "hexagon", "musa", "cpu",
+    };
+    for (const char * backend_name : backend_names) {
+        // the returned registration is not needed here; a missing backend is simply skipped
+        ggml_backend_load_best(backend_name, silent, dir_path);
+    }
+
+    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
+    if (const char * backend_path = std::getenv("GGML_BACKEND_PATH")) {
+        // goes through ggml_backend_load, so failures of this one are always logged
+        ggml_backend_load(backend_path);
+    }
+}
--- a/ml/backend/ggml/ggml/src/ggml-backend.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp
--- a/ml/backend/ggml/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/ggml-blas/CMakeLists.txt
@@ -0,0 +1,87 @@
+if (GGML_STATIC) # static build: set BLA_STATIC before find_package(BLAS) runs
+    set(BLA_STATIC ON)
+endif()
+#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+#    set(BLA_SIZEOF_INTEGER 8)
+#endif()
+
+set(BLA_VENDOR ${GGML_BLAS_VENDOR}) # forward the project-level vendor choice to FindBLAS
+find_package(BLAS) # not REQUIRED: the failure case is reported by the else() branch at the bottom
+
+if (BLAS_FOUND)
+    message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
+
+    ggml_add_backend_library(ggml-blas
+                             ggml-blas.cpp
+                            )
+
+    if (${GGML_BLAS_VENDOR} MATCHES "Apple") # Accelerate: only compile definitions are added, no include lookup
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+        add_compile_definitions(GGML_BLAS_USE_ACCELERATE)
+    elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "") # include dirs not supplied: look them up via pkg-config per vendor
+        # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+        # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+        find_package(PkgConfig REQUIRED)
+        if (${GGML_BLAS_VENDOR} MATCHES "Generic")
+            pkg_check_modules(DepBLAS blas)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
+            # As of openblas v0.3.22, the 64-bit is named openblas64.pc
+            pkg_check_modules(DepBLAS openblas64)
+            if (NOT DepBLAS_FOUND) # fall back to the plain openblas module
+                pkg_check_modules(DepBLAS openblas)
+            endif()
+        elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
+            add_compile_definitions(GGML_BLAS_USE_BLIS)
+            pkg_check_modules(DepBLAS blis)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
+            pkg_check_modules(DepBLAS blas-atlas)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
+            pkg_check_modules(DepBLAS flexiblas_api)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
+            add_compile_definitions(GGML_BLAS_USE_MKL)
+            # all Intel* libraries share the same include path
+            pkg_check_modules(DepBLAS mkl-sdl)
+        elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
+            # this doesn't provide pkg-config
+            # suggest to assign BLAS_INCLUDE_DIRS on your own
+            if ("${NVHPC_VERSION}" STREQUAL "")
+                message(WARNING "Better to set NVHPC_VERSION")
+            else()
+                set(DepBLAS_FOUND ON) # include path is built from the SDK layout below instead of pkg-config
+                set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
+            endif()
+        endif()
+        if (DepBLAS_FOUND)
+            set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
+        else() # last resort: search a fixed list of directories for cblas.h
+            message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
+            " detected by pkgconfig, trying to find cblas.h from possible paths...")
+            find_path(BLAS_INCLUDE_DIRS
+                NAMES cblas.h
+                HINTS
+                    /usr/include
+                    /usr/local/include
+                    /usr/include/openblas
+                    /opt/homebrew/opt/openblas/include
+                    /usr/local/opt/openblas/include
+                    /usr/include/x86_64-linux-gnu/openblas/include
+            )
+        endif()
+    endif()
+
+    message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
+    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS}) # NOTE(review): linker flags passed as compile options - confirm intended
+
+    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+        add_compile_definitions(GGML_BLAS_USE_MKL) # an include path containing "mkl" also selects the MKL code path
+    endif()
+
+    target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
+    target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
+else() # BLAS was requested but not found: stop the configure step
+    message(FATAL_ERROR "BLAS not found, please refer to "
+                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+                        " to set correct GGML_BLAS_VENDOR")
+endif()
--- a/ml/backend/ggml/ggml/src/ggml-blas/blas.go
+++ b/ml/backend/ggml/ggml/src/ggml-blas/blas.go
@@ -0,0 +1,10 @@
+//go:build darwin && arm64
+
+package blas
+
+// #cgo CXXFLAGS: -std=c++11
+// #cgo CPPFLAGS: -DGGML_USE_BLAS
+// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../../include
+// #cgo darwin,arm64 CPPFLAGS: -DGGML_BLAS_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
+// #cgo darwin,arm64 LDFLAGS: -framework Accelerate
+import "C" // cgo-only package: the directives above set the build and link flags (Accelerate framework, GGML_USE_BLAS)
--- a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp
@@ -0,0 +1,519 @@
+#include "ggml-impl.h"
+#include "ggml-blas.h"
+#include "ggml-backend-impl.h"
+
+#include <future>
+#include <vector>
+#include <cstring>
+
+#if defined(GGML_BLAS_USE_ACCELERATE)
+#   include <Accelerate/Accelerate.h>
+#elif defined(GGML_BLAS_USE_MKL)
+#   include <mkl.h>
+#elif defined(GGML_BLAS_USE_BLIS)
+#   include <blis.h>
+#elif defined(GGML_BLAS_USE_NVPL)
+#   include <nvpl_blas.h>
+#else
+#   include <cblas.h>
+#endif
+
+struct ggml_backend_blas_context { // per-backend state of the BLAS backend
+    int n_threads = GGML_DEFAULT_N_THREADS; // thread count used for conversion and handed to the BLAS library
+    std::unique_ptr<char[]> work_data; // scratch buffer for the float copy of src0 in mul_mat
+    size_t work_size = 0; // current size of work_data in bytes
+#ifndef GGML_USE_OPENMP
+    std::vector<std::future<void>> tasks; // pending std::async conversion tasks (non-OpenMP builds only)
+#endif
+};
+
+static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { // dst = mul_mat(src0, src1) via cblas_sgemm
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const int64_t ne_plane      = ne01*ne00; // elements in one 2D plane of src0
+    const size_t  desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); // F32 input needs no scratch copy
+
+    if (ctx->work_size < desired_wsize) { // the scratch buffer only ever grows; old contents are discarded
+        ctx->work_data.reset(new char[desired_wsize]);
+        ctx->work_size = desired_wsize;
+    }
+    void * wdata = ctx->work_data.get();
+
+    // convert src0 to float
+    if (type != GGML_TYPE_F32) {
+        const auto * type_traits = ggml_get_type_traits(type);
+        ggml_to_float_t const to_float = type_traits->to_float;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                const void  *       x      = (char *)  src0->data + i02*nb02          + i03*nb03;
+                      float * const wplane = (float *) wdata      + i02*ne_plane      + i03*ne02*ne_plane;
+
+                const int min_cols_per_thread = 4096;
+                const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
+                const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); // clamped to [1, ctx->n_threads]
+
+#ifdef GGML_USE_OPENMP
+                #pragma omp parallel for num_threads(n_threads)
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                }
+#else
+                for (int i = 1; i < n_threads; i++) { // row slices 1..n_threads-1 run as async tasks
+                    const int64_t start =       i*ne01/n_threads;
+                    const int64_t end   = (i + 1)*ne01/n_threads;
+                    if (start < end) {
+                        ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+                            for (int64_t i01 = start; i01 < end; i01++) {
+                                to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                            }
+                        }));
+                    }
+                }
+                {
+                    // reuse the current thread for the first task
+                    const int64_t start = 0;
+                    const int64_t end   = ne01/n_threads;
+                    for (int64_t i01 = start; i01 < end; i01++) {
+                        to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                    }
+                }
+#endif
+            }
+        }
+
+#ifndef GGML_USE_OPENMP
+        // wait for all tasks to finish
+        for (auto & task : ctx->tasks) {
+            task.get();
+        }
+        ctx->tasks.clear();
+#endif
+    }
+
+#if defined(OPENBLAS_VERSION)
+    openblas_set_num_threads(ctx->n_threads);
+#endif
+
+#if defined(GGML_BLAS_USE_BLIS)
+    bli_thread_set_num_threads(ctx->n_threads);
+#endif
+
+#if defined(GGML_BLAS_USE_NVPL)
+    nvpl_blas_set_num_threads(ctx->n_threads);
+#endif
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) { // one sgemm per (i12, i13) plane of src1/dst
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            const int64_t i03 = i13/r3; // broadcast: several src1 planes map onto the same src0 plane
+            const int64_t i02 = i12/r2;
+
+            const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
+            const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+                  float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
+
+            if (type != GGML_TYPE_F32) { // use the converted float copy instead of the raw src0 data
+                x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
+            }
+
+            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, // row-major: d (ne1 x ne01) = y (ne1 x ne10) * x^T
+                        ne1, ne01, ne10,
+                        1.0f,   y, ne10,
+                                x, ne00,
+                        0.0f,   d, ne01);
+        }
+    }
+}
+
+static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { // GGML_OP_OUT_PROD via a single sgemm call
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(ne0  == ne00);
+    GGML_ASSERT(ne1  == ne10);
+    GGML_ASSERT(ne2  == ne02);
+    GGML_ASSERT(ne02 == ne12);
+    GGML_ASSERT(ne3  == ne13);
+    GGML_ASSERT(ne03 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    // GGML_ASSERT(nb0 <= nb1);
+    // GGML_ASSERT(nb1 <= nb2);
+    // GGML_ASSERT(nb2 <= nb3);
+
+    // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
+    // src0: (k,n)
+    // src1: (k,m)
+    // dst:  (m,n)
+    //
+    // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
+    // Also expressed as (major,minor)
+    // a: (m,k): so src1 transposed
+    // b: (k,n): so src0
+    // c: (m,n)
+    //
+    // However, if ggml_is_transposed(src1) is true, then
+    // src1->data already contains a transposed version, so sgemm mustn't
+    // transpose it further.
+
+    int n = src0->ne[0];
+    int k = src0->ne[1];
+    int m = src1->ne[0];
+
+    CBLAS_TRANSPOSE transposeA;
+    int lda; // leading dimension passed to sgemm for `a`
+
+    if (!ggml_is_transposed(src1)) {
+        transposeA = CblasTrans;
+        lda = m;
+    } else {
+        transposeA = CblasNoTrans;
+        lda = k;
+    }
+
+    float * a = (float *) ((char *) src1->data);
+    float * b = (float *) ((char *) src0->data);
+    float * c = (float *) ((char *) dst->data);
+
+    cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); // NOTE(review): one call on the base pointers; ne2/ne3 are not iterated here
+
+    GGML_UNUSED(ctx);
+}
+
+// backend interface
+
+static const char * ggml_backend_blas_get_name(ggml_backend_t backend) { // name reported for this backend
+    return "BLAS";
+
+    GGML_UNUSED(backend); // after the return: only marks the parameter as intentionally unused
+}
+
+// Destroys a backend created by ggml_backend_blas_init: first the BLAS context
+// stored in backend->context, then the backend object itself.
+static void ggml_backend_blas_free(ggml_backend_t backend) {
+    delete (ggml_backend_blas_context *) backend->context;
+    delete backend;
+}
+
+// Walks the nodes of `cgraph` in order and evaluates each one.
+// MUL_MAT and OUT_PROD are dispatched to the BLAS implementations; NONE,
+// RESHAPE, VIEW, PERMUTE and TRANSPOSE need no work here; any other op aborts.
+// Returns GGML_STATUS_SUCCESS once every node has been visited.
+// `batch_size` is not used.
+static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
+    GGML_UNUSED(batch_size);
+
+    ggml_backend_blas_context * blas_ctx = (ggml_backend_blas_context *)backend->context;
+
+    for (int idx = 0; idx < cgraph->n_nodes; idx++) {
+        struct ggml_tensor * cur = cgraph->nodes[idx];
+
+        switch (cur->op) {
+            // ops that carry no computation for this backend
+            case GGML_OP_NONE:
+            case GGML_OP_RESHAPE:
+            case GGML_OP_VIEW:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_TRANSPOSE:
+                continue;
+
+            case GGML_OP_MUL_MAT:
+                ggml_backend_blas_mul_mat(blas_ctx, cur);
+                continue;
+
+            case GGML_OP_OUT_PROD:
+                ggml_backend_blas_out_prod(blas_ctx, cur);
+                continue;
+
+            default:
+                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(cur));
+        }
+    }
+
+    return GGML_STATUS_SUCCESS;
+}
+
+// Function table of the BLAS backend. Only get_name, free and graph_compute
+// are provided; every other slot is left NULL. The initializers are
+// positional, so their order must follow the field order named in the comments.
+static struct ggml_backend_i blas_backend_i = {
+    /* .get_name                = */ ggml_backend_blas_get_name,
+    /* .free                    = */ ggml_backend_blas_free,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_blas_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .graph_optimize          = */ NULL,
+};
+
+// Returns a pointer to the function-local static GUID that identifies BLAS
+// backends; the same address is returned on every call.
+static ggml_guid_t ggml_backend_blas_guid(void) {
+    static ggml_guid blas_guid = {
+        0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97,
+        0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d,
+    };
+    return &blas_guid;
+}
+
+// Allocates a new BLAS backend together with its private context and returns
+// it. The backend is bound to device 0 of the BLAS registry. When ggml is built
+// with OpenMP but the linked OpenBLAS/BLIS is not, a debug message is logged.
+ggml_backend_t ggml_backend_blas_init(void) {
+    ggml_backend_blas_context * blas_ctx = new ggml_backend_blas_context;
+
+    ggml_backend_t result = new ggml_backend {
+        /* .guid    = */ ggml_backend_blas_guid(),
+        /* .iface   = */ blas_backend_i,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
+        /* .context = */ blas_ctx,
+    };
+
+    // threading-model mismatch diagnostics (compile-time selected)
+#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+    if (openblas_get_parallel() != OPENBLAS_OPENMP) {
+        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
+    }
+#endif
+
+#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
+    GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
+#endif
+
+    return result;
+}
+
+// Reports whether `backend` is a BLAS backend by comparing its GUID with the
+// BLAS GUID. A NULL backend yields false.
+bool ggml_backend_is_blas(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return false;
+    }
+    return ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
+}
+
+// Stores `n_threads` in the context of `backend_blas`.
+// Asserts that `backend_blas` is a BLAS backend.
+void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_blas(backend_blas));
+
+    ((ggml_backend_blas_context *) backend_blas->context)->n_threads = n_threads;
+}
+
+// device interface
+
+// Returns the fixed name of the BLAS device; `dev` is ignored.
+static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+    static const char * const device_name = "BLAS";
+    return device_name;
+}
+
+// Returns the name of the BLAS implementation selected at compile time,
+// falling back to the generic "BLAS" when none of the known macros is defined.
+// `dev` is ignored.
+static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+#if defined(GGML_BLAS_USE_ACCELERATE)
+    return "Accelerate";
+#elif defined(GGML_BLAS_USE_MKL)
+    return "MKL";
+#elif defined(GGML_BLAS_USE_BLIS)
+    return "BLIS";
+#elif defined(GGML_BLAS_USE_NVPL)
+    return "NVPL";
+#elif defined(OPENBLAS_VERSION)
+    return "OpenBLAS";
+#else
+    return "BLAS";
+#endif
+}
+
+// Memory reporting is not implemented yet (TODO): both outputs are set to 0.
+// `dev` is ignored.
+static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    GGML_UNUSED(dev);
+
+    const size_t not_reported = 0;
+    *free  = not_reported;
+    *total = not_reported;
+}
+
+// The BLAS device always reports the ACCEL device type; `dev` is ignored.
+static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+    const enum ggml_backend_dev_type dev_type = GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    return dev_type;
+}
+
+// Fills `props` for the BLAS device from the individual device getters and
+// sets the capability flags: only buffer_from_host_ptr is enabled.
+static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    // identification
+    props->name        = ggml_backend_blas_device_get_name(dev);
+    props->description = ggml_backend_blas_device_get_description(dev);
+    props->type        = ggml_backend_blas_device_get_type(dev);
+
+    // memory figures are written straight into the props fields
+    ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    // capabilities
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
+    };
+}
+
+// Creates a BLAS backend via ggml_backend_blas_init(); both `dev` and
+// `params` are ignored.
+static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+
+    ggml_backend_t created = ggml_backend_blas_init();
+    return created;
+}
+
+// The BLAS device uses the CPU buffer type; `dev` is ignored.
+static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+    ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
+    return cpu_buft;
+}
+
+// Wraps the host memory [ptr, ptr + size) through
+// ggml_backend_cpu_buffer_from_ptr. `dev` and `max_tensor_size` are ignored.
+static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+
+    ggml_backend_buffer_t wrapped = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    return wrapped;
+}
+
+// Decides whether the BLAS device can evaluate the graph node `op`.
+//
+//  - NONE / RESHAPE / VIEW / PERMUTE / TRANSPOSE: always supported.
+//  - MUL_MAT: both sources contiguous, src1 is F32, src0 is F32 or has a
+//    to_float conversion, and ne0, ne1 and ne10 all reach `min_batch`.
+//  - OUT_PROD: both sources are F32 matrices, src0 is contiguous and src1 is
+//    either contiguous or transposed.
+//  - anything else: not supported.
+//
+// `dev` is ignored.
+static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    GGML_UNUSED(dev);
+
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+
+        case GGML_OP_MUL_MAT:
+        {
+            // BLAS usually is only faster for large matrices
+            const int64_t ne10 = src1->ne[0];
+
+            const int64_t ne0 = op->ne[0];
+            const int64_t ne1 = op->ne[1];
+
+            // TODO: find the optimal value
+            const int64_t min_batch = 32;
+
+            return ggml_is_contiguous(src0) &&
+                   ggml_is_contiguous(src1) &&
+                   src1->type == GGML_TYPE_F32 &&
+                   (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
+                   (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
+        }
+
+        case GGML_OP_OUT_PROD:
+            // src0 is required to be F32 here, so no to_float fallback applies
+            return src0->type == GGML_TYPE_F32 &&
+                   src1->type == GGML_TYPE_F32 &&
+                   ggml_is_matrix(src0) &&
+                   ggml_is_matrix(src1) &&
+                   ggml_is_contiguous(src0) &&
+                   (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
+
+        default:
+            return false;
+    }
+}
+
+// A buffer type is supported exactly when ggml_backend_buft_is_host() accepts
+// it; `dev` is ignored.
+static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(dev);
+
+    const bool is_host = ggml_backend_buft_is_host(buft);
+    return is_host;
+}
+
+// Function table of the BLAS device. get_host_buffer_type, offload_op and the
+// three event hooks are left NULL; all other slots point at the
+// ggml_backend_blas_device_* functions of this file. The initializers are
+// positional, so their order must follow the field order named in the comments.
+static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
+    /* .get_name             = */ ggml_backend_blas_device_get_name,
+    /* .get_description      = */ ggml_backend_blas_device_get_description,
+    /* .get_memory           = */ ggml_backend_blas_device_get_memory,
+    /* .get_type             = */ ggml_backend_blas_device_get_type,
+    /* .get_props            = */ ggml_backend_blas_device_get_props,
+    /* .init_backend         = */ ggml_backend_blas_device_init_backend,
+    /* .get_buffer_type      = */ ggml_backend_blas_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
+    /* .supports_op          = */ ggml_backend_blas_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_blas_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// backend reg interface
+
+// Returns the fixed name of the BLAS backend registry; `reg` is ignored.
+static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+
+    static const char * const reg_name = "BLAS";
+    return reg_name;
+}
+
+// The BLAS registry always reports exactly one device; `reg` is ignored.
+static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+
+    const size_t device_count = 1;
+    return device_count;
+}
+
+// Returns the single BLAS device. `index` must be 0 (asserted).
+// The device object is a function-local static, so it is initialized once,
+// with the `reg` passed on the first call, and the same address is returned on
+// every call.
+static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_device blas_device = {
+        /* .iface   = */ ggml_backend_blas_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ nullptr,
+    };
+
+    return &blas_device;
+}
+
+// Looks up an optional entry point by name. The only name recognised is
+// "ggml_backend_set_n_threads", which maps to ggml_backend_blas_set_n_threads;
+// every other name yields NULL. `reg` is ignored.
+static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+
+    const bool wants_set_n_threads = std::strcmp(name, "ggml_backend_set_n_threads") == 0;
+    if (!wants_set_n_threads) {
+        return NULL;
+    }
+    return (void *)ggml_backend_blas_set_n_threads;
+}
+
+// Function table of the BLAS backend registry. The initializers are
+// positional, so their order must follow the field order named in the comments.
+static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
+    /* .get_name         = */ ggml_backend_blas_reg_get_name,
+    /* .get_device_count = */ ggml_backend_blas_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_blas_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_blas_get_proc_address,
+};
+
+// Returns the BLAS backend registry. The registry object is a function-local
+// static, so every call returns the same address.
+ggml_backend_reg_t ggml_backend_blas_reg(void) {
+    static struct ggml_backend_reg blas_reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_blas_reg_i,
+        /* .context     = */ NULL,
+    };
+
+    return &blas_reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
--- a/ml/backend/ggml/ggml/src/ggml-common.h
+++ b/ml/backend/ggml/ggml/src/ggml-common.h
--- a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt
@@ -0,0 +1,675 @@
+# ggml_add_cpu_backend_features(<cpu_name> <arch> [definitions...])
+#
+# Compiles ggml-cpu/arch/<arch>/cpu-feats.cpp as the OBJECT library
+# <cpu_name>-feats and links it privately into <cpu_name>. Any extra arguments
+# become private compile definitions of the feats target.
+function(ggml_add_cpu_backend_features cpu_name arch)
+    # The feature detection code lives in its own target so that it can be
+    # built without the architecture flags.
+    # Several variants of the CPU backend may be part of the same build, which
+    # rules out set_source_files_properties() for setting the arch flags.
+    set(feats_target ${cpu_name}-feats)
+
+    add_library(${feats_target} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
+    set_target_properties(${feats_target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_include_directories(${feats_target} PRIVATE . ../include)
+    target_compile_definitions(${feats_target} PRIVATE
+        ${ARGN}
+        GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
+
+    target_link_libraries(${cpu_name} PRIVATE ${feats_target})
+endfunction()
+
+function(ggml_add_cpu_backend_variant_impl tag_name)
+    if (tag_name)
+        set(GGML_CPU_NAME ggml-cpu-${tag_name})
+    else()
+        set(GGML_CPU_NAME ggml-cpu)
+    endif()
+
+    ggml_add_backend_library(${GGML_CPU_NAME})
+
+    list (APPEND GGML_CPU_SOURCES
+        ggml-cpu/ggml-cpu.c
+        ggml-cpu/ggml-cpu.cpp
+        ggml-cpu/repack.cpp
+        ggml-cpu/repack.h
+        ggml-cpu/hbm.cpp
+        ggml-cpu/hbm.h
+        ggml-cpu/quants.c
+        ggml-cpu/quants.h
+        ggml-cpu/traits.cpp
+        ggml-cpu/traits.h
+        ggml-cpu/amx/amx.cpp
+        ggml-cpu/amx/amx.h
+        ggml-cpu/amx/mmq.cpp
+        ggml-cpu/amx/mmq.h
+        ggml-cpu/ggml-cpu-impl.h
+        ggml-cpu/common.h
+        ggml-cpu/binary-ops.h
+        ggml-cpu/binary-ops.cpp
+        ggml-cpu/unary-ops.h
+        ggml-cpu/unary-ops.cpp
+        ggml-cpu/simd-mappings.h
+        ggml-cpu/vec.h
+        ggml-cpu/vec.cpp
+        ggml-cpu/ops.h
+        ggml-cpu/ops.cpp
+        )
+
+    target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
+    target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
+
+    if (APPLE AND GGML_ACCELERATE)
+        find_library(ACCELERATE_FRAMEWORK Accelerate)
+        if (ACCELERATE_FRAMEWORK)
+            message(STATUS "Accelerate framework found")
+
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64)
+
+            target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK})
+        else()
+            message(WARNING "Accelerate framework not found")
+        endif()
+    endif()
+
+    if (GGML_OPENMP)
+        find_package(OpenMP)
+        if (OpenMP_FOUND)
+            set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
+
+            target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        else()
+            set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
+            message(WARNING "OpenMP not found")
+        endif()
+    endif()
+
+    if (GGML_LLAMAFILE)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
+
+        list(APPEND GGML_CPU_SOURCES
+                    ggml-cpu/llamafile/sgemm.cpp
+                    ggml-cpu/llamafile/sgemm.h)
+    endif()
+
+    if (GGML_CPU_HBM)
+        find_library(memkind memkind REQUIRED)
+
+        message(STATUS "Using memkind for CPU HBM")
+
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM)
+
+        target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
+    endif()
+
+    if (GGML_SYSTEM_ARCH STREQUAL "ARM")
+        message(STATUS "ARM detected")
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/arm/quants.c
+            ggml-cpu/arch/arm/repack.cpp
+            )
+
+        if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
+            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
+        else()
+            check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+            if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+                list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+            endif()
+
+            if (GGML_NATIVE)
+                # -mcpu=native does not always enable all the features in some compilers,
+                # so we check for them manually and enable them if available
+
+                execute_process(
+                    COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v -
+                    INPUT_FILE "/dev/null"
+                    OUTPUT_QUIET
+                    ERROR_VARIABLE ARM_MCPU
+                    RESULT_VARIABLE ARM_MCPU_RESULT
+                )
+                if (NOT ARM_MCPU_RESULT)
+                    string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
+                    string(REGEX MATCH "-march=[^ ']+" ARM_MARCH_FLAG "${ARM_MCPU}")
+
+                    # on some old GCC we need to read -march=
+                    if (ARM_MARCH_FLAG AND NOT "${ARM_MARCH_FLAG}" STREQUAL "-march=native")
+                        set(ARM_NATIVE_FLAG "${ARM_MARCH_FLAG}")
+                    elseif(ARM_MCPU_FLAG AND NOT "${ARM_MCPU_FLAG}" STREQUAL "-mcpu=native")
+                        set(ARM_NATIVE_FLAG "${ARM_MCPU_FLAG}")
+                    endif()
+                endif()
+
+                if ("${ARM_NATIVE_FLAG}" STREQUAL "")
+                    set(ARM_NATIVE_FLAG -mcpu=native)
+                    message(WARNING "ARM -march/-mcpu not found, -mcpu=native will be used")
+                else()
+                    message(STATUS "ARM detected flags: ${ARM_NATIVE_FLAG}")
+                endif()
+
+                include(CheckCXXSourceRuns)
+
+                macro(check_arm_feature tag feature code)
+                    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+                    set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+${tag}")
+                    check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
+                    if (GGML_MACHINE_SUPPORTS_${tag})
+                        set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}")
+                    else()
+                        set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+no${tag}")
+                        check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
+                        if (GGML_MACHINE_SUPPORTS_no${tag})
+                            set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}")
+                            list(APPEND ARCH_FLAGS -U__ARM_FEATURE_${feature})
+                        endif()
+                    endif()
+                    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+                endmacro()
+
+                check_arm_feature(dotprod DOTPROD     "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
+                check_arm_feature(i8mm    MATMUL_INT8 "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
+                check_arm_feature(sve     SVE         "#include <arm_sve.h>\nint main()  { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
+                check_arm_feature(sme     SME         "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
+
+                list(APPEND ARCH_FLAGS "${ARM_NATIVE_FLAG}${ARM_NATIVE_FLAG_FIX}")
+            else()
+                if (GGML_CPU_ARM_ARCH)
+                    list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
+                elseif(GGML_CPU_ALL_VARIANTS)
+                    # Begin with the lowest baseline
+                    set(ARM_MCPU "armv8-a")
+                    set(ARCH_TAGS "")
+                    set(ARCH_DEFINITIONS "")
+
+                    # When a feature is selected, bump the MCPU to the first
+                    # version that supported it
+                    if (GGML_INTERNAL_DOTPROD)
+                        set(ARM_MCPU "armv8.2-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
+                    endif()
+                    if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
+                        set(ARM_MCPU "armv8.2-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+fp16")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
+                    endif()
+                    if (GGML_INTERNAL_SVE)
+                        set(ARM_MCPU "armv8.2-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+sve")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
+                    endif()
+                    if (GGML_INTERNAL_MATMUL_INT8)
+                        set(ARM_MCPU "armv8.6-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
+                    endif()
+                    if (GGML_INTERNAL_SVE2)
+                        set(ARM_MCPU "armv8.6-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+sve2")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
+                    endif()
+                    if (GGML_INTERNAL_NOSVE)
+                        set(ARCH_TAGS "${ARCH_TAGS}+nosve")
+                    endif()
+                    if (GGML_INTERNAL_SME)
+                        set(ARM_MCPU "armv9.2-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+sme")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
+                    endif()
+                    list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
+                    ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
+                endif()
+            endif()
+
+            message(STATUS "Checking for ARM features using flags:")
+            foreach(flag IN LISTS ARCH_FLAGS)
+                message(STATUS "  ${flag}")
+            endforeach()
+
+            include(CheckCXXSourceCompiles)
+            set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+            string(REPLACE ";" " " ARCH_FLAGS_STR "${ARCH_FLAGS}")
+            set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS_STR}")
+            foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
+                set(ARM_FEATURE "HAVE_${feature}")
+                check_cxx_source_compiles(
+                    "
+                    #if !defined(__ARM_FEATURE_${feature})
+                    #  error \"Feature ${feature} is not defined\"
+                    #endif
+                    int main() { return 0; }
+                    "
+                    ${ARM_FEATURE}
+                )
+            endforeach()
+            set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
+        message(STATUS "x86 detected")
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/x86/quants.c
+            ggml-cpu/arch/x86/repack.cpp
+            )
+
+        if (MSVC)
+            # instruction set detection for MSVC only
+            if (GGML_NATIVE)
+                include(ggml-cpu/cmake/FindSIMD.cmake)
+            endif ()
+            if (GGML_AVX512)
+                list(APPEND ARCH_FLAGS /arch:AVX512)
+                # /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
+                # MSVC has no compile-time flags enabling specific
+                # AVX512 extensions, neither it defines the
+                # macros corresponding to the extensions.
+                # Do it manually.
+                list(APPEND ARCH_DEFINITIONS GGML_AVX512)
+                if (GGML_AVX512_VBMI)
+                    list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
+                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                        list(APPEND ARCH_FLAGS -mavx512vbmi)
+                    endif()
+                endif()
+                if (GGML_AVX512_VNNI)
+                    list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI)
+                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                        list(APPEND ARCH_FLAGS -mavx512vnni)
+                    endif()
+                endif()
+                if (GGML_AVX512_BF16)
+                    list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16)
+                    if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                        list(APPEND ARCH_FLAGS -mavx512bf16)
+                    endif()
+                endif()
+                if (GGML_AMX_TILE)
+                    list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE)
+                endif()
+                if (GGML_AMX_INT8)
+                    list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8)
+                endif()
+                if (GGML_AMX_BF16)
+                    list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16)
+                endif()
+            elseif (GGML_AVX2)
+                list(APPEND ARCH_FLAGS /arch:AVX2)
+                list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C)
+            elseif (GGML_AVX)
+                list(APPEND ARCH_FLAGS /arch:AVX)
+                list(APPEND ARCH_DEFINITIONS GGML_AVX)
+            elseif (GGML_SSE42)
+                list(APPEND ARCH_FLAGS /arch:SSE4.2)
+                list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+            endif()
+            if (GGML_AVX_VNNI)
+                list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
+            endif()
+            if (GGML_BMI2)
+                # MSVC does not define macro __BMI2__
+                list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
+            endif()
+        else ()
+            if (GGML_NATIVE)
+                list(APPEND ARCH_FLAGS -march=native)
+            else ()
+                if (GGML_SSE42)
+                    list(APPEND ARCH_FLAGS -msse4.2)
+                    list(APPEND ARCH_DEFINITIONS GGML_SSE42)
+                endif()
+                if (GGML_F16C)
+                    list(APPEND ARCH_FLAGS -mf16c)
+                    list(APPEND ARCH_DEFINITIONS GGML_F16C)
+                endif()
+                if (GGML_FMA)
+                    list(APPEND ARCH_FLAGS -mfma)
+                    list(APPEND ARCH_DEFINITIONS GGML_FMA)
+                endif()
+                if (GGML_BMI2)
+                    list(APPEND ARCH_FLAGS -mbmi2)
+                    list(APPEND ARCH_DEFINITIONS GGML_BMI2)
+                endif()
+                if (GGML_AVX)
+                    list(APPEND ARCH_FLAGS -mavx)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX)
+                endif()
+                if (GGML_AVX2)
+                    list(APPEND ARCH_FLAGS -mavx2)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX2)
+                endif()
+                if (GGML_AVX_VNNI)
+                    list(APPEND ARCH_FLAGS -mavxvnni)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI)
+                endif()
+                if (GGML_AVX512)
+                    list(APPEND ARCH_FLAGS -mavx512f)
+                    list(APPEND ARCH_FLAGS -mavx512cd)
+                    list(APPEND ARCH_FLAGS -mavx512vl)
+                    list(APPEND ARCH_FLAGS -mavx512dq)
+                    list(APPEND ARCH_FLAGS -mavx512bw)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512)
+                endif()
+                if (GGML_AVX512_VBMI)
+                    list(APPEND ARCH_FLAGS -mavx512vbmi)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI)
+                endif()
+                if (GGML_AVX512_VNNI)
+                    list(APPEND ARCH_FLAGS -mavx512vnni)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI)
+                endif()
+                if (GGML_AVX512_BF16)
+                    list(APPEND ARCH_FLAGS -mavx512bf16)
+                    list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16)
+                endif()
+                if (GGML_AMX_TILE)
+                    list(APPEND ARCH_FLAGS -mamx-tile)
+                    list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE)
+                endif()
+                if (GGML_AMX_INT8)
+                    list(APPEND ARCH_FLAGS -mamx-int8)
+                    list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8)
+                endif()
+                if (GGML_AMX_BF16)
+                    list(APPEND ARCH_FLAGS -mamx-bf16)
+                    list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16)
+                endif()
+            endif()
+        endif()
+
+        if (GGML_BACKEND_DL)
+            if (GGML_NATIVE)
+                # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
+                message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
+            endif()
+            ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
+        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
+        message(STATUS "PowerPC detected")
+        list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c)
+        if (GGML_NATIVE)
+            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+                file(READ "/proc/cpuinfo" POWER10_M)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
+                execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
+            endif()
+
+            string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
+            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
+            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
+
+            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+                list(APPEND ARCH_FLAGS -mcpu=power10)
+            elseif (EXTRACTED_NUMBER EQUAL 9)
+                list(APPEND ARCH_FLAGS -mcpu=power9)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+                list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
+            else()
+                list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
+            endif()
+        elseif(GGML_CPU_ALL_VARIANTS)
+            # Begin with the lowest baseline
+            set(ARCH_DEFINITIONS "")
+
+            # When a feature is selected, bump the MCPU to the first
+            # version that supported it
+            foreach(PVER RANGE 7 11)
+                if(DEFINED GGML_INTERNAL_POWER${PVER})
+                    set(POWERPC_MCPU "power${PVER}")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
+                endif()
+            endforeach()
+            if (GGML_INTERNAL_VSX)
+                list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
+                list(APPEND ARCH_FLAGS -mvsx)
+            endif()
+
+            if (DEFINED POWERPC_MCPU)
+                list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
+            endif()
+            ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
+        else()
+            if (GGML_CPU_POWERPC_CPUTYPE)
+                list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
+            endif()
+        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
+        message(STATUS "loongarch64 detected")
+        list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c)
+
+        list(APPEND ARCH_FLAGS -march=loongarch64)
+        if (GGML_LASX)
+            list(APPEND ARCH_FLAGS -mlasx)
+        endif()
+        if (GGML_LSX)
+            list(APPEND ARCH_FLAGS -mlsx)
+        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
+        message(STATUS "riscv64 detected")
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/riscv/quants.c
+            ggml-cpu/arch/riscv/repack.cpp
+            )
+        if (GGML_CPU_RISCV64_SPACEMIT)
+            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
+            list(APPEND GGML_CPU_SOURCES
+                ggml-cpu/spacemit/ime.cpp
+                ggml-cpu/spacemit/ime.h
+                ggml-cpu/spacemit/ime1_kernels.cpp
+                ggml-cpu/spacemit/ime_kernels.h
+            )
+        endif()
+        if(NOT GGML_CPU_ALL_VARIANTS)
+            set(MARCH_STR "rv64gc")
+            if (GGML_RV_ZFH)
+                string(APPEND MARCH_STR "_zfh")
+            endif()
+            if (GGML_XTHEADVECTOR)
+                string(APPEND MARCH_STR "_xtheadvector")
+            elseif (GGML_RVV)
+                string(APPEND MARCH_STR "_v")
+                if (GGML_RV_ZVFH)
+                    string(APPEND MARCH_STR "_zvfh")
+                endif()
+            endif()
+            if (GGML_RV_ZICBOP)
+                string(APPEND MARCH_STR "_zicbop")
+            endif()
+            if (GGML_RV_ZIHINTPAUSE)
+                string(APPEND MARCH_STR "_zihintpause")
+            endif()
+            list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
+        else()
+            # Begin with the lowest baseline
+            set(ARCH_DEFINITIONS "")
+
+            if (GGML_INTERNAL_RVV)
+                message(STATUS "RVV enabled")
+                list(APPEND ARCH_DEFINITIONS GGML_USE_RVV)
+                list(APPEND ARCH_FLAGS -march=rv64gc_v -mabi=lp64d)
+            endif()
+
+            ggml_add_cpu_backend_features(${GGML_CPU_NAME} riscv ${ARCH_DEFINITIONS})
+        endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        message(STATUS "s390x detected")
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/s390/quants.c)
+
+        # for native compilation
+        if (GGML_NATIVE)
+            # check machine level to determine target
+            file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
+            string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
+
+            # TODO: Separation to determine activation of VX/VXE/VXE2
+            if (${S390X_M} MATCHES "8561|8562")
+                message(STATUS "z15 target")
+                list(APPEND ARCH_FLAGS -march=z15)
+            elseif (${S390X_M} MATCHES "3931")
+                message(STATUS "z16 target")
+                list(APPEND ARCH_FLAGS -march=z16)
+            elseif (${S390X_M} MATCHES "9175|9176")
+                # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+                #       binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
+                message(STATUS "z17 target")
+                list(APPEND ARCH_FLAGS -march=arch15)
+            else()
+                message(STATUS "Unknown target")
+                message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
+                list(APPEND ARCH_FLAGS -march=native -mtune=native)
+            endif()
+        # for cross-compilation
+        elseif(GGML_CPU_ALL_VARIANTS)
+            # range through IBM z15 to z17
+            # NOTE: update when a new hardware level is released
+            foreach (ZHW RANGE 15 17)
+                if(DEFINED GGML_INTERNAL_Z${ZHW})
+                    message(STATUS "z${ZHW} cross-compile target")
+                    list(APPEND ARCH_FLAGS -march=z${ZHW})
+                endif()
+            endforeach()
+        endif()
+
+        if (GGML_VXE OR GGML_INTERNAL_VXE2)
+            message(STATUS "VXE2 enabled")
+            list(APPEND ARCH_FLAGS -mvx -mzvector)
+            list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
+        endif()
+
+        if (GGML_INTERNAL_NNPA)
+            message(STATUS "NNPA enabled")
+            list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
+        endif()
+
+        ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
+    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
+        message(STATUS "Wasm detected")
+        list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
+    else()
+        message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
+        list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
+    endif()
+
+    if (GGML_CPU_REPACK)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
+    endif()
+
+    if (GGML_CPU_KLEIDIAI)
+        message(STATUS "Using KleidiAI optimized kernels if applicable")
+
+        # Disable the KleidiAI tests
+        set(KLEIDIAI_BUILD_TESTS  OFF)
+
+        # Fetch KleidiAI sources:
+        include(FetchContent)
+        set(KLEIDIAI_COMMIT_TAG "v1.14.0")
+        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
+        set(KLEIDIAI_ARCHIVE_MD5  "45e110675d93f99f82c23a1afcca76bc")
+
+        if (POLICY CMP0135)
+            cmake_policy(SET CMP0135 NEW)
+        endif()
+
+        FetchContent_Declare(KleidiAI_Download
+            URL ${KLEIDIAI_DOWNLOAD_URL}
+            DOWNLOAD_EXTRACT_TIMESTAMP NEW
+            URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
+
+        FetchContent_MakeAvailable(KleidiAI_Download)
+        FetchContent_GetProperties(KleidiAI_Download
+            SOURCE_DIR  KLEIDIAI_SRC
+            POPULATED   KLEIDIAI_POPULATED)
+
+        if (NOT KLEIDIAI_POPULATED)
+            message(FATAL_ERROR "KleidiAI source downloaded failed.")
+        endif()
+
+        add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
+
+        # Remove kleidiai target after fetching it
+        if (TARGET kleidiai)
+            set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
+        endif()
+
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/kleidiai/kleidiai.cpp
+            ggml-cpu/kleidiai/kernels.cpp
+            ggml-cpu/kleidiai/kleidiai.h
+            ggml-cpu/kleidiai/kernels.h
+            )
+
+        # KleidiAI
+        include_directories(
+            ${KLEIDIAI_SRC}/
+            ${KLEIDIAI_SRC}/kai/
+            ${KLEIDIAI_SRC}/kai/ukernels/
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
+
+        set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
+        if (NOT ARCH_FLAGS_TEMP)
+            string(REGEX MATCH "-march=[^ ]+" ARCH_FLAGS_TEMP "${CMAKE_C_FLAGS}")
+        endif()
+        string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
+        string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
+        string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
+
+        set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
+
+        list(APPEND GGML_KLEIDIAI_SOURCES
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.c)
+
+        if (NOT DOTPROD_ENABLED MATCHES -1)
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c)
+        endif()
+
+        if (NOT I8MM_ENABLED MATCHES -1)
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.c)
+        endif()
+
+        if (NOT SME_ENABLED MATCHES -1)
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
+                ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
+            set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
+        endif()
+
+        set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
+        list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
+    endif()
+
+    message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
+    target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
+    target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
+    target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
+
+    if (EMSCRIPTEN)
+        set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
+    endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+        # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
+        target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
+    endif()
+endfunction()
--- a/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.cpp
@@ -0,0 +1,224 @@
+#include "amx.h"
+#include "common.h"
+#include "mmq.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "traits.h"
+
+#if defined(__linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+
+// AMX tensor_traits
+namespace ggml::cpu::amx {
+// Traits object attached to tensors that live in an AMX buffer. It forwards
+// work-size queries and matrix multiplications to the AMX kernels.
+class tensor_traits : public ggml::cpu::tensor_traits {
+    // Writes the work size requested by the AMX kernels for `op` into `size`.
+    // Always returns true.
+    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
+        const size_t wanted = ggml_backend_amx_desired_wsize(op);
+        size = wanted;
+        return true;
+    }
+
+    // Runs `op` through the AMX matmul when it is a GGML_OP_MUL_MAT node.
+    // Returns false, without doing any work, for every other op.
+    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
+        if (op->op != GGML_OP_MUL_MAT) {
+            return false;
+        }
+
+        ggml_backend_amx_mul_mat(params, op);
+        return true;
+    }
+};
+
+// Returns the single traits instance shared by all tensors; both arguments are ignored.
+static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
+    static tensor_traits shared_traits;
+    return &shared_traits;
+}
+}  // namespace ggml::cpu::amx
+
+// AMX buffer interface
+// Releases the backing storage of an AMX buffer.
+//
+// The storage is obtained with ggml_aligned_malloc() in the buffer type's alloc_buffer
+// callback, so it must be returned through the matching ggml_aligned_free(). Plain free()
+// is not a valid counterpart on platforms where the aligned allocator is not malloc-based
+// (e.g. _aligned_malloc on Windows).
+static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_aligned_free(buffer->context, buffer->size);
+}
+
+// Returns the base address of the buffer, which is the pointer stored in buffer->context.
+static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
+    void * const base = buffer->context;
+    return base;
+}
+
+// Stores the AMX tensor traits in tensor->extra. Always returns GGML_STATUS_SUCCESS.
+static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    GGML_UNUSED(buffer);
+
+    ggml::cpu::tensor_traits * traits = ggml::cpu::amx::get_tensor_traits(buffer, tensor);
+    tensor->extra = (void *) traits;
+
+    return GGML_STATUS_SUCCESS;
+}
+
+// Fills `size` bytes of the tensor's data, starting at byte `offset`, with `value`.
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                  uint8_t value, size_t offset, size_t size) {
+    GGML_UNUSED(buffer);
+
+    char * const dst = (char *) tensor->data + offset;
+    memset(dst, value, size);
+}
+
+// Uploads `size` bytes from `data` into `tensor` at byte `offset`.
+// Types for which qtype_has_amx_kernels() is true go through
+// ggml_backend_amx_convert_weight(); everything else is copied verbatim.
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                               const void * data, size_t offset, size_t size) {
+    GGML_UNUSED(buffer);
+
+    if (!qtype_has_amx_kernels(tensor->type)) {
+        // no AMX kernel for this type: plain byte copy
+        memcpy((char *) tensor->data + offset, data, size);
+        return;
+    }
+
+    GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
+    ggml_backend_amx_convert_weight(tensor, data, offset, size);
+}
+
+/*
+// need to figure what we need to do with buffer->extra.
+static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        if (qtype_has_amx_kernels(src->type)) {
+            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
+        } else {
+            memcpy(dst->data, src->data, ggml_nbytes(src));
+        }
+        return true;
+    }
+    return false;
+
+    GGML_UNUSED(buffer);
+}
+*/
+
+// Sets every byte of the buffer (buffer->size bytes starting at buffer->context) to `value`.
+static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    void * const base = buffer->context;
+    memset(base, value, buffer->size);
+}
+
+// Callback table for AMX buffers. get_tensor and cpy_tensor are left unset (nullptr); their
+// candidate implementations are kept in the commented-out block above. reset is unset too.
+static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_amx_buffer_init_tensor,
+    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
+    /* .get_tensor      = */ nullptr,
+    /* .cpy_tensor      = */ nullptr,
+    /* .clear           = */ ggml_backend_amx_buffer_clear,
+    /* .reset           = */ nullptr,
+};
+
+// Returns the fixed name of the AMX buffer type; `buft` is not inspected.
+static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+
+    return "AMX";
+}
+
+// Allocates `size` bytes with ggml_aligned_malloc() and wraps them in a backend buffer that
+// uses the AMX buffer interface. Returns NULL, after reporting on stderr, if the allocation fails.
+static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * const storage = ggml_aligned_malloc(size);
+    if (storage != NULL) {
+        return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, storage, size);
+    }
+
+    fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+    return NULL;
+}
+
+// Returns TENSOR_ALIGNMENT as the alignment of this buffer type; `buft` is not inspected.
+static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+
+    return TENSOR_ALIGNMENT;
+}
+
+namespace ggml::cpu::amx {
+// Extra buffer type for AMX: decides which ops are handled by the AMX kernels and hands out
+// the traits object stored on the weight tensor.
+class extra_buffer_type : ggml::cpu::extra_buffer_type {
+    // Returns true only for GGML_OP_MUL_MAT nodes that pass every check below.
+    // The checks run in order and stop at the first one that fails.
+    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
+        if (op->op != GGML_OP_MUL_MAT) {
+            return false;
+        }
+
+        // handle only 2d gemm for now
+        auto is_plain_matrix = [](const struct ggml_tensor * t) {
+            return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
+        };
+
+        const struct ggml_tensor * weights = op->src[0];
+        const struct ggml_tensor * inputs  = op->src[1];
+
+        // src0 and src1 must both be contiguous
+        if (!is_plain_matrix(weights) || !is_plain_matrix(inputs)) {
+            return false;
+        }
+        // src0 must be stored in an AMX buffer
+        if (!weights->buffer || weights->buffer->buft != ggml_backend_amx_buffer_type()) {
+            return false;
+        }
+        // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
+        if (weights->ne[0] % (TILE_K * 2 * 32) != 0) {
+            return false;
+        }
+        // out_features is 32x
+        if (op->ne[0] % (TILE_N * 2) != 0) {
+            return false;
+        }
+        // src0 needs an AMX-capable quantized type or F16
+        if (!qtype_has_amx_kernels(weights->type) && weights->type != GGML_TYPE_F16) {
+            return false;
+        }
+        // src1 must be host buffer
+        if (inputs->buffer && !ggml_backend_buft_is_host(inputs->buffer->buft)) {
+            return false;
+        }
+        // src1 must be float32
+        return inputs->type == GGML_TYPE_F32;
+    }
+
+    // Returns the traits stored in src0->extra when `op` is a MUL_MAT whose src0 lives in an
+    // AMX buffer, nullptr otherwise.
+    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
+        if (op->op != GGML_OP_MUL_MAT) {
+            return nullptr;
+        }
+
+        const struct ggml_tensor * weights = op->src[0];
+        if (!weights->buffer || weights->buffer->buft != ggml_backend_amx_buffer_type()) {
+            return nullptr;
+        }
+
+        return (ggml::cpu::tensor_traits *) weights->extra;
+    }
+};
+}  // namespace ggml::cpu::amx
+
+// Returns the allocation size for `tensor` as computed by ggml_backend_amx_get_alloc_size();
+// `buft` is not inspected.
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    GGML_UNUSED(buft);
+
+    return ggml_backend_amx_get_alloc_size(tensor);
+}
+
+// arch_prctl() request codes and xfeature numbers for ggml_amx_init() below.
+// NOTE(review): the values presumably mirror the Linux uapi definitions (asm/prctl.h) — confirm.
+// Only ARCH_REQ_XCOMP_PERM and XFEATURE_XTILEDATA are referenced in this file.
+#define ARCH_GET_XCOMP_PERM     0x1022
+#define ARCH_REQ_XCOMP_PERM     0x1023
+#define XFEATURE_XTILECFG       17
+#define XFEATURE_XTILEDATA      18
+
+// Prepares the process for using AMX.
+//   Linux:   issues arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); a non-zero result is
+//            reported on stderr and yields false.
+//   Windows: returns true without any request.
+//   other:   returns false.
+static bool ggml_amx_init() {
+#if defined(__linux__)
+    const long rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
+    if (rc == 0) {
+        return true;
+    }
+
+    fprintf(stderr, "AMX is not ready to be used!\n");
+    return false;
+#elif defined(_WIN32)
+    return true;
+#else
+    return false;
+#endif
+}
+
+// Returns the singleton AMX buffer type, or nullptr when ggml_amx_init() reports that AMX
+// cannot be used.
+//
+// The descriptor is built once on first use. The result of ggml_amx_init() is cached as well:
+// this function is called from supports_op() / get_tensor_traits() for every candidate op, and
+// repeating the permission request (and the stderr message on failure) on each call adds nothing.
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
+        /* .iface = */ {
+                        /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+                        /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+                        /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+                        /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
+                        /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+                        /* .is_host          = */ nullptr,
+                        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
+    };
+
+    // function-local static: initialized exactly once, on the first call
+    static const bool amx_ready = ggml_amx_init();
+    if (!amx_ready) {
+        return nullptr;
+    }
+
+    return &ggml_backend_buffer_type_amx;
+}
+
+#endif  // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
--- a/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml-cpu-impl.h"
+
+// GGML internal header
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+// Returns the AMX buffer type, or a null pointer when AMX initialization fails.
+// Only declared when the translation unit is compiled with AMX-INT8 and AVX512-VNNI enabled.
+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/amx/common.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/common.h
@@ -0,0 +1,91 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpu-impl.h"
+
+#include <algorithm>
+#include <memory>
+#include <type_traits>
+
+#if defined(GGML_USE_OPENMP)
+#include <omp.h>
+#endif
+
+// Tile geometry constants shared by the AMX code.
+// NOTE(review): presumably rows (M) / columns (N) / reduction depth (K) of one AMX tile in
+// elements — confirm against the kernels in mmq.cpp.
+#define TILE_M 16
+#define TILE_N 16
+#define TILE_K 32
+// NOTE(review): presumably the number of 8-bit values packed together in the VNNI layout — confirm.
+#define VNNI_BLK 4
+
+// NOTE(review): looks like the block size used by the AMX kernels — confirm unit and usage.
+#define AMX_BLK_SIZE 32
+
+// Symbolic names for the indices 0..7.
+// NOTE(review): presumably the AMX tile register numbers passed to the tile intrinsics — confirm.
+#define TMM0 0
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+#define TMM4 4
+#define TMM5 5
+#define TMM6 6
+#define TMM7 7
+
+// parallel routines
+// Integer division of x by y, rounded up for non-negative x and positive y.
+// Only available for integral T.
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+inline T div_up(T x, T y) {
+    const auto numerator = x + y - 1;
+    return numerator / y;
+}
+
+// Computes the half-open range [n_start, n_end) of the `n` work items assigned to worker `ith`
+// out of `nth` workers. Each worker gets a chunk of div_up(n, nth) items, and the end is
+// clamped to `n`. n_start itself is not clamped, so it can exceed n_end for trailing workers.
+template <typename T>
+inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
+#if 0
+    // onednn partition pattern
+    T& n_my = n_end;
+    if (nth <= 1 || n == 0) {
+        n_start = 0;
+        n_my = n;
+    } else {
+        T n1 = div_up(n, nth);
+        T n2 = n1 - 1;
+        T T1 = n - n2 * nth;
+        n_my = ith < T1 ? n1 : n2;
+        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
+    }
+    n_end += n_start;
+#else
+    // pytorch aten partition pattern
+    const T chunk = div_up(n, nth);
+    n_start = ith * chunk;
+    n_end = std::min(n_start + chunk, n);
+#endif
+}
+
+// Calls f(begin, end) over the index range [0, n).
+// With GGML_USE_OPENMP, every thread of an OpenMP parallel region calls f on the sub-range
+// that balance211() assigns to it. Without it, f is called once with (0, n).
+template <typename func_t>
+inline void parallel_for(int n, const func_t& f) {
+#if defined(GGML_USE_OPENMP)
+#pragma omp parallel
+{
+    const int n_workers = omp_get_num_threads();
+    const int worker    = omp_get_thread_num();
+    int first = 0;
+    int last  = 0;
+    balance211(n, n_workers, worker, first, last);
+    f(first, last);
+}
+#else
+    f(0, n);
+#endif
+}
+
+// Calls f(begin, end) once with the sub-range of [0, n) that balance211() assigns to the
+// calling thread, identified by params->ith out of params->nth.
+template <typename func_t>
+inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
+    int range_begin = 0;
+    int range_end   = 0;
+    balance211(n, params->nth, params->ith, range_begin, range_end);
+    f(range_begin, range_end);
+}
+
+// Returns true for the quantized tensor types that have AMX support.
+inline bool qtype_has_amx_kernels(const enum ggml_type type) {
+    // TODO: fix padding for vnni format
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ4_XS:
+            return true;
+        default:
+            return false;
+    }
+}
--- a/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.cpp
--- a/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.h
@@ -0,0 +1,10 @@
+#pragma once
+#include "common.h"
+
+// Work size the AMX matmul requests for the node `dst`; the AMX tensor traits report this
+// value from work_size().
+size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
+
+// Allocation size for `tensor`; backs the get_alloc_size callback of the AMX buffer type.
+size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
+
+// Writes `size` bytes of `data` into `tensor` at `offset`. Called by the AMX buffer's set_tensor
+// callback for types where qtype_has_amx_kernels() is true.
+// NOTE(review): presumably repacks the weights into the AMX layout — implementation not visible here.
+void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+
+// Computes the matrix multiplication node `dst`; invoked from the AMX tensor traits'
+// compute_forward() for GGML_OP_MUL_MAT.
+void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch-fallback.h
@@ -0,0 +1,234 @@
+#pragma once
+
+// Rename `_generic` functions if no native implementation is available.
+// This effectively selects the generic implementation.
+
+#if defined(GGML_CPU_GENERIC)
+// quants.c
+#define quantize_row_q8_0_generic quantize_row_q8_0
+#define quantize_row_q8_1_generic quantize_row_q8_1
+#define quantize_row_q8_K_generic quantize_row_q8_K
+#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
+#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
+#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
+#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
+#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
+#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
+#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
+#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
+#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
+#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
+#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
+#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
+#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
+#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
+#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
+#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
+#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
+#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
+#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
+#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
+#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
+// repack.cpp
+#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
+#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
+#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
+#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
+#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
+#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
+#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
+// repack.cpp
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
+#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
+// repack.cpp
+#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
+#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
+#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
+#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
+#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
+#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#elif defined(__POWERPC__) || defined(__powerpc__)
+// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
+// quants.c
+#define quantize_row_q8_K_generic quantize_row_q8_K
+#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
+#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
+#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
+// repack.cpp
+#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
+#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
+#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
+#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
+#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
+#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
+#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#elif defined(__loongarch64)
+// quants.c
+#define quantize_row_q8_K_generic quantize_row_q8_K
+#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
+#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
+#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
+#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+// repack.cpp
+#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
+#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
+#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
+#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
+#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
+#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
+#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#elif defined(__riscv)
+// quants.c
+#define quantize_row_q8_K_generic quantize_row_q8_K
+#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
+#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
+#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
+#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
+#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
+#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
+#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
+#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
+#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
+#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
+#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
+#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+// repack.cpp
+#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
+#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
+#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
+#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
+#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
+#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
+#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#elif defined(__s390x__)
+// quants.c
+#define quantize_row_q8_K_generic quantize_row_q8_K
+#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
+#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
+#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
+#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
+#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
+#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
+#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
+#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
+#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
+#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
+// repack.cpp
+#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
+#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
+#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
+#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
+#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
+#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
+#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#elif defined(__wasm__)
+// quants.c
+#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
+#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
+#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
+#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
+#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
+#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
+#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
+#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
+#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
+#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
+#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
+#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
+#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+// repack.cpp
+#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
+#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
+#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
+#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
+#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
+#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
+#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/arm.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/arm.go
@@ -0,0 +1,7 @@
+//go:build arm64
+
+// Package arm is built only on arm64 (see the build constraint above). This
+// file contains no Go declarations of its own: it only carries the cgo
+// compiler flags (C++17, include paths, and a HWCAP2_SVE2 definition) that
+// apply to the C/C++ sources compiled as part of this package.
+package arm
+
+// #cgo CXXFLAGS: -std=c++17
+// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include -DHWCAP2_SVE2="2"
+import "C"
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp
@@ -0,0 +1,98 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__aarch64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+// Fallback values for the HWCAP2 feature bits tested below. Each one is only
+// defined here when neither the included system headers nor the compiler
+// command line has already provided it.
+#if !defined(HWCAP2_SVE2)
+#define HWCAP2_SVE2 (1 << 1)
+#endif
+
+#if !defined(HWCAP2_I8MM)
+#define HWCAP2_I8MM (1 << 13)
+#endif
+
+#if !defined(HWCAP2_SME)
+#define HWCAP2_SME (1 << 23)
+#endif
+
+// Runtime CPU feature flags for an AArch64 host.
+//
+// Every flag starts out false; the constructor then queries the operating
+// system:
+//   - Linux: bit tests on getauxval(AT_HWCAP) and getauxval(AT_HWCAP2)
+//   - Apple: sysctlbyname("hw.optional.arm.FEAT_*")
+// On any other platform nothing is queried and all flags remain false.
+struct aarch64_features {
+    // has_neon not needed, aarch64 has NEON guaranteed
+    bool has_dotprod     = false;
+    bool has_fp16_va     = false;
+    bool has_sve         = false;
+    bool has_sve2        = false;
+    bool has_i8mm        = false;
+    bool has_sme         = false;
+
+    aarch64_features() {
+#if defined(__linux__)
+        // the auxiliary vector values are kept in 32-bit variables
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        // `!!` collapses the masked value to a bool
+        has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
+        has_fp16_va = !!(hwcap & HWCAP_FPHP);
+        has_sve     = !!(hwcap & HWCAP_SVE);
+        has_sve2    = !!(hwcap2 & HWCAP2_SVE2);
+        has_i8mm    = !!(hwcap2 & HWCAP2_I8MM);
+        has_sme     = !!(hwcap2 & HWCAP2_SME);
+#elif defined(__APPLE__)
+        // oldp and size are reused for every query; a flag is only updated
+        // when its sysctlbyname call reports success (returns 0)
+        int oldp = 0;
+        size_t size = sizeof(oldp);
+
+        if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
+            has_dotprod = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
+            has_i8mm = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
+            has_sme = static_cast<bool>(oldp);
+        }
+
+        // note: has_fp16_va, has_sve and has_sve2 are not queried on this
+        // platform and therefore keep their default value (false)
+        // Apple apparently does not implement SVE yet
+#endif
+    }
+};
+
+// Score this build of the CPU backend against the machine it is running on.
+//
+// For every GGML_USE_* feature macro this translation unit was compiled with,
+// the matching runtime flag in aarch64_features must be set:
+//   - if any compiled-in feature is missing at runtime, returns 0
+//   - otherwise returns 1 plus a power-of-two weight (1<<1 .. 1<<6) for each
+//     compiled-in feature
+// Presumably the loader treats 0 as "not usable" and prefers the highest
+// score -- confirm against the GGML_BACKEND_DL_SCORE_IMPL consumer.
+static int ggml_backend_cpu_aarch64_score() {
+    int score = 1;
+    aarch64_features af;
+
+#ifdef GGML_USE_DOTPROD
+    if (!af.has_dotprod) { return 0; }
+    score += 1<<1;
+#endif
+#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
+    if (!af.has_fp16_va) { return 0; }
+    score += 1<<2;
+#endif
+#ifdef GGML_USE_SVE
+    if (!af.has_sve) { return 0; }
+    score += 1<<3;
+#endif
+#ifdef GGML_USE_MATMUL_INT8
+    if (!af.has_i8mm) { return 0; }
+    score += 1<<4;
+#endif
+#ifdef GGML_USE_SVE2
+    if (!af.has_sve2) { return 0; }
+    score += 1<<5;
+#endif
+#ifdef GGML_USE_SME
+    if (!af.has_sme) { return 0; }
+    score += 1<<6;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
+
+# endif // defined(__aarch64__)
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/quants.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/quants.c
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm/repack.cpp
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp
@@ -0,0 +1,327 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include <cstring>
+#include <vector>
+#include <bitset>
+#include <array>
+#include <string>
+
+// Snapshot of the x86 CPUID feature information of the executing CPU.
+//
+// The constructor runs the CPUID instruction for the basic leaves
+// (0 .. max basic leaf), for leaf 7 sub-leaf 1, and for the extended leaves
+// (0x80000000 .. max extended leaf). Selected result registers are stored in
+// the std::bitset members at the bottom of the struct, and every accessor
+// simply returns one bit of one of those registers. Register results are
+// indexed as [0]=EAX, [1]=EBX, [2]=ECX, [3]=EDX (see cpuid()/cpuidex()).
+//
+// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
+struct cpuid_x86 {
+    // leaf 1, ECX
+    bool SSE3(void) { return f_1_ecx[0]; }
+    bool PCLMULQDQ(void) { return f_1_ecx[1]; }
+    bool MONITOR(void) { return f_1_ecx[3]; }
+    bool SSSE3(void) { return f_1_ecx[9]; }
+    bool FMA(void) { return f_1_ecx[12]; }
+    bool CMPXCHG16B(void) { return f_1_ecx[13]; }
+    bool SSE41(void) { return f_1_ecx[19]; }
+    bool SSE42(void) { return f_1_ecx[20]; }
+    bool MOVBE(void) { return f_1_ecx[22]; }
+    bool POPCNT(void) { return f_1_ecx[23]; }
+    bool AES(void) { return f_1_ecx[25]; }
+    bool XSAVE(void) { return f_1_ecx[26]; }
+    bool OSXSAVE(void) { return f_1_ecx[27]; }
+    bool AVX(void) { return f_1_ecx[28]; }
+    bool F16C(void) { return f_1_ecx[29]; }
+    bool RDRAND(void) { return f_1_ecx[30]; }
+
+    // leaf 1, EDX
+    bool MSR(void) { return f_1_edx[5]; }
+    bool CX8(void) { return f_1_edx[8]; }
+    bool SEP(void) { return f_1_edx[11]; }
+    bool CMOV(void) { return f_1_edx[15]; }
+    bool CLFSH(void) { return f_1_edx[19]; }
+    bool MMX(void) { return f_1_edx[23]; }
+    bool FXSR(void) { return f_1_edx[24]; }
+    bool SSE(void) { return f_1_edx[25]; }
+    bool SSE2(void) { return f_1_edx[26]; }
+
+    // leaf 7 (sub-leaf 0), EBX; HLE and RTM are additionally gated on is_intel
+    bool FSGSBASE(void) { return f_7_ebx[0]; }
+    bool BMI1(void) { return f_7_ebx[3]; }
+    bool HLE(void) { return is_intel && f_7_ebx[4]; }
+    bool AVX2(void) { return f_7_ebx[5]; }
+    bool BMI2(void) { return f_7_ebx[8]; }
+    bool ERMS(void) { return f_7_ebx[9]; }
+    bool INVPCID(void) { return f_7_ebx[10]; }
+    bool RTM(void) { return is_intel && f_7_ebx[11]; }
+    bool AVX512F(void) { return f_7_ebx[16]; }
+    bool AVX512DQ(void) { return f_7_ebx[17]; }
+    bool RDSEED(void) { return f_7_ebx[18]; }
+    bool ADX(void) { return f_7_ebx[19]; }
+    bool AVX512PF(void) { return f_7_ebx[26]; }
+    bool AVX512ER(void) { return f_7_ebx[27]; }
+    bool AVX512CD(void) { return f_7_ebx[28]; }
+    bool AVX512BW(void) { return f_7_ebx[30]; }
+    bool AVX512VL(void) { return f_7_ebx[31]; }
+
+    bool SHA(void) { return f_7_ebx[29]; }
+
+    // leaf 7 (sub-leaf 0), ECX
+    bool PREFETCHWT1(void) { return f_7_ecx[0]; }
+
+    // extended leaf 0x80000001, ECX; most bits are gated on the vendor
+    bool LAHF(void) { return f_81_ecx[0]; }
+    bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
+    bool ABM(void) { return is_amd && f_81_ecx[5]; }
+    bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
+    bool XOP(void) { return is_amd && f_81_ecx[11]; }
+    bool TBM(void) { return is_amd && f_81_ecx[21]; }
+
+    // extended leaf 0x80000001, EDX; every bit is gated on the vendor
+    bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
+    bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
+    bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
+    bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
+    bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
+
+    // leaf 7: ECX/EDX of sub-leaf 0, and EAX of sub-leaf 1 (f_7_1_eax)
+    bool AVX512_VBMI(void) { return f_7_ecx[1]; }
+    bool AVX512_VNNI(void) { return f_7_ecx[11]; }
+    bool AVX512_FP16(void) { return f_7_edx[23]; }
+    bool AVX512_BF16(void) { return f_7_1_eax[5]; }
+    bool AVX_VNNI(void) { return f_7_1_eax[4]; }
+
+    bool AMX_TILE(void) { return f_7_edx[24]; }
+    bool AMX_INT8(void) { return f_7_edx[25]; }
+    bool AMX_FP16(void) { return f_7_1_eax[21]; }
+    bool AMX_BF16(void) { return f_7_edx[22]; }
+
+    // cpuid(info, eax):        run CPUID for leaf `eax` (sub-leaf 0 in the
+    //                          inline-asm version)
+    // cpuidex(info, eax, ecx): run CPUID for leaf `eax`, sub-leaf `ecx`
+    // Both store EAX, EBX, ECX, EDX into info[0..3]. MSVC uses the compiler
+    // intrinsics; other compilers use inline assembly.
+#ifdef _MSC_VER
+    static void cpuid(int cpu_info[4], int eax) {
+        __cpuid(cpu_info, eax);
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __cpuidex(cpu_info, eax, ecx);
+    }
+#else
+    static void cpuid(int cpu_info[4], int eax) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(0));
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(ecx));
+    }
+#endif
+
+    // Query the CPU once and fill in every member below.
+    cpuid_x86() {
+        std::array<int, 4> cpui;
+        std::vector<std::array<int, 4>> data;
+
+        // calling __cpuid with 0x0 as the function_id argument
+        // gets the number of the highest valid function ID.
+        cpuid(cpui.data(), 0);
+        int n_ids = cpui[0];
+
+        // data[i] holds the four registers returned for basic leaf i
+        for (int i = 0; i <= n_ids; ++i) {
+            cpuidex(cpui.data(), i, 0);
+            data.push_back(cpui);
+        }
+
+        // capture vendor string
+        // (12 bytes assembled from leaf 0 in the order EBX, EDX, ECX; the
+        // buffer is zero-initialised so the result is NUL-terminated)
+        char vendor[0x20] = {};
+        *reinterpret_cast<int *>(vendor)     = data[0][1];
+        *reinterpret_cast<int *>(vendor + 4) = data[0][3];
+        *reinterpret_cast<int *>(vendor + 8) = data[0][2];
+        this->vendor = vendor;
+        if (this->vendor == "GenuineIntel") {
+            is_intel = true;
+        } else if (this->vendor == "AuthenticAMD") {
+            is_amd = true;
+        }
+
+        // load bitset with flags for function 0x00000001
+        if (n_ids >= 1) {
+            f_1_ecx = data[1][2];
+            f_1_edx = data[1][3];
+        }
+
+        // load bitset with flags for function 0x00000007
+        // (sub-leaf 1 is queried separately; only its EAX is kept)
+        if (n_ids >= 7) {
+            f_7_ebx = data[7][1];
+            f_7_ecx = data[7][2];
+            f_7_edx = data[7][3];
+            cpuidex(cpui.data(), 7, 1);
+            f_7_1_eax = cpui[0];
+        }
+
+        // calling __cpuid with 0x80000000 as the function_id argument
+        // gets the number of the highest valid extended ID.
+        cpuid(cpui.data(), 0x80000000);
+        unsigned int n_ex_ids = cpui[0];
+
+        // ext_data[k] holds the registers returned for leaf 0x80000000 + k
+        std::vector<std::array<int, 4>> ext_data;
+        for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
+            cpuidex(cpui.data(), i, 0);
+            ext_data.push_back(cpui);
+        }
+
+        // load bitset with flags for function 0x80000001
+        if (n_ex_ids >= 0x80000001) {
+            f_81_ecx = ext_data[1][2];
+            f_81_edx = ext_data[1][3];
+        }
+
+        // interpret CPU brand string if reported
+        // (3 x 16 bytes from leaves 0x80000002..0x80000004, copied back to back)
+        char brand[0x40] = {};
+        if (n_ex_ids >= 0x80000004) {
+            std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
+            std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
+            std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
+            this->brand = brand;
+        }
+    }
+
+    bool is_intel = false;           // vendor string == "GenuineIntel"
+    bool is_amd = false;             // vendor string == "AuthenticAMD"
+    std::string vendor;              // vendor string from leaf 0
+    std::string brand;               // brand string, empty if not reported
+    std::bitset<32> f_1_ecx;         // leaf 1, ECX
+    std::bitset<32> f_1_edx;         // leaf 1, EDX
+    std::bitset<32> f_7_ebx;         // leaf 7 sub-leaf 0, EBX
+    std::bitset<32> f_7_ecx;         // leaf 7 sub-leaf 0, ECX
+    std::bitset<32> f_7_edx;         // leaf 7 sub-leaf 0, EDX
+    std::bitset<32> f_7_1_eax;       // leaf 7 sub-leaf 1, EAX
+    std::bitset<32> f_81_ecx;        // leaf 0x80000001, ECX
+    std::bitset<32> f_81_edx;        // leaf 0x80000001, EDX
+};
+
+// Disabled (#if 0) debugging helper: constructs a cpuid_x86 and prints the
+// vendor, the brand string and a selection of the feature accessors with
+// printf, one per line. It is never compiled unless the guard is changed.
+#if 0
+void test_x86_is() {
+    cpuid_x86 is;
+    printf("CPU Vendor: %s\n", is.vendor.c_str());
+    printf("Brand: %s\n", is.brand.c_str());
+    printf("is_intel: %d\n", is.is_intel);
+    printf("is_amd: %d\n", is.is_amd);
+    printf("sse3: %d\n", is.SSE3());
+    printf("pclmulqdq: %d\n", is.PCLMULQDQ());
+    printf("ssse3: %d\n", is.SSSE3());
+    printf("fma: %d\n", is.FMA());
+    printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
+    printf("sse41: %d\n", is.SSE41());
+    printf("sse42: %d\n", is.SSE42());
+    printf("movbe: %d\n", is.MOVBE());
+    printf("popcnt: %d\n", is.POPCNT());
+    printf("aes: %d\n", is.AES());
+    printf("xsave: %d\n", is.XSAVE());
+    printf("osxsave: %d\n", is.OSXSAVE());
+    printf("avx: %d\n", is.AVX());
+    printf("f16c: %d\n", is.F16C());
+    printf("rdrand: %d\n", is.RDRAND());
+    printf("msr: %d\n", is.MSR());
+    printf("cx8: %d\n", is.CX8());
+    printf("sep: %d\n", is.SEP());
+    printf("cmov: %d\n", is.CMOV());
+    printf("clflush: %d\n", is.CLFSH());
+    printf("mmx: %d\n", is.MMX());
+    printf("fxsr: %d\n", is.FXSR());
+    printf("sse: %d\n", is.SSE());
+    printf("sse2: %d\n", is.SSE2());
+    printf("fsgsbase: %d\n", is.FSGSBASE());
+    printf("bmi1: %d\n", is.BMI1());
+    printf("hle: %d\n", is.HLE());
+    printf("avx2: %d\n", is.AVX2());
+    printf("bmi2: %d\n", is.BMI2());
+    printf("erms: %d\n", is.ERMS());
+    printf("invpcid: %d\n", is.INVPCID());
+    printf("rtm: %d\n", is.RTM());
+    printf("avx512f: %d\n", is.AVX512F());
+    printf("rdseed: %d\n", is.RDSEED());
+    printf("adx: %d\n", is.ADX());
+    printf("avx512pf: %d\n", is.AVX512PF());
+    printf("avx512er: %d\n", is.AVX512ER());
+    printf("avx512cd: %d\n", is.AVX512CD());
+    printf("sha: %d\n", is.SHA());
+    printf("prefetchwt1: %d\n", is.PREFETCHWT1());
+    printf("lahf: %d\n", is.LAHF());
+    printf("lzcnt: %d\n", is.LZCNT());
+    printf("abm: %d\n", is.ABM());
+    printf("sse4a: %d\n", is.SSE4a());
+    printf("xop: %d\n", is.XOP());
+    printf("tbm: %d\n", is.TBM());
+    printf("syscall: %d\n", is.SYSCALL());
+    printf("mmxext: %d\n", is.MMXEXT());
+    printf("rdtscp: %d\n", is.RDTSCP());
+    printf("3dnowext: %d\n", is._3DNOWEXT());
+    printf("3dnow: %d\n", is._3DNOW());
+    printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
+    printf("avx512_vnni: %d\n", is.AVX512_VNNI());
+    printf("avx512_fp16: %d\n", is.AVX512_FP16());
+    printf("avx512_bf16: %d\n", is.AVX512_BF16());
+    printf("amx_tile: %d\n", is.AMX_TILE());
+    printf("amx_int8: %d\n", is.AMX_INT8());
+    printf("amx_fp16: %d\n", is.AMX_FP16());
+    printf("amx_bf16: %d\n", is.AMX_BF16());
+}
+#endif
+
+// Score this build of the CPU backend against the machine it is running on.
+//
+// For every GGML_* instruction-set macro this translation unit was compiled
+// with, the matching CPUID bit(s) must be reported by cpuid_x86:
+//   - if any compiled-in feature is missing, returns 0
+//   - otherwise returns 1 plus a per-feature weight (1 for FMA, then
+//     1<<1 .. 1<<11 for the remaining features in the order listed below)
+// GGML_AVX512 requires the F, CD, VL, DQ and BW subsets together.
+// Only CPUID is consulted; as the FIXME notes, operating-system support for
+// the extended register state is not verified.
+// Presumably the loader treats 0 as "not usable" and prefers the highest
+// score -- confirm against the GGML_BACKEND_DL_SCORE_IMPL consumer.
+static int ggml_backend_cpu_x86_score() {
+    // FIXME: this does not check for OS support
+
+    int score = 1;
+    cpuid_x86 is;
+
+#ifdef GGML_FMA
+    if (!is.FMA()) { return 0; }
+    score += 1;
+#endif
+#ifdef GGML_F16C
+    if (!is.F16C()) { return 0; }
+    score += 1<<1;
+#endif
+#ifdef GGML_SSE42
+    if (!is.SSE42()) { return 0; }
+    score += 1<<2;
+#endif
+#ifdef GGML_BMI2
+    if (!is.BMI2()) { return 0; }
+    score += 1<<3;
+#endif
+#ifdef GGML_AVX
+    if (!is.AVX()) { return 0; }
+    score += 1<<4;
+#endif
+#ifdef GGML_AVX2
+    if (!is.AVX2()) { return 0; }
+    score += 1<<5;
+#endif
+#ifdef GGML_AVX_VNNI
+    if (!is.AVX_VNNI()) { return 0; }
+    score += 1<<6;
+#endif
+#ifdef GGML_AVX512
+    if (!is.AVX512F()) { return 0; }
+    if (!is.AVX512CD()) { return 0; }
+    if (!is.AVX512VL()) { return 0; }
+    if (!is.AVX512DQ()) { return 0; }
+    if (!is.AVX512BW()) { return 0; }
+    score += 1<<7;
+#endif
+#ifdef GGML_AVX512_VBMI
+    if (!is.AVX512_VBMI()) { return 0; }
+    score += 1<<8;
+#endif
+#ifdef GGML_AVX512_BF16
+    if (!is.AVX512_BF16()) { return 0; }
+    score += 1<<9;
+#endif
+#ifdef GGML_AVX512_VNNI
+    if (!is.AVX512_VNNI()) { return 0; }
+    score += 1<<10;
+#endif
+#ifdef GGML_AMX_INT8
+    if (!is.AMX_INT8()) { return 0; }
+    score += 1<<11;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
+
+#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch/x86/quants.c
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch/x86/repack.cpp
--- a/ml/backend/ggml/ggml/src/ggml-cpu/arch/x86/x86.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/arch/x86/x86.go
@@ -0,0 +1,5 @@
+// Package x86 contains no Go declarations in this file: it only carries the
+// cgo compiler flags (C++17 and include paths) that apply to the C/C++
+// sources compiled as part of this package.
+package x86
+
+// #cgo CXXFLAGS: -std=c++17
+// #cgo CPPFLAGS: -I${SRCDIR}/../.. -I${SRCDIR}/../../.. -I${SRCDIR}/../../../../include
+import "C"
--- a/ml/backend/ggml/ggml/src/ggml-cpu/binary-ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/binary-ops.cpp
@@ -0,0 +1,158 @@
+#include "binary-ops.h"
+
+#if defined(GGML_USE_ACCELERATE)
+#include <Accelerate/Accelerate.h>
+
+// Common signature of the vDSP element-wise routines selected in
+// apply_binary_op (vDSP_vadd, vDSP_vsub, vDSP_vmul, vDSP_vdiv):
+// two strided float inputs, one strided float output, and an element count.
+using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
+#endif
+
+// Scalar float operators. They are passed as non-type template arguments to
+// the functions below, and apply_binary_op also compares the template argument
+// against these functions by address, so each must remain a distinct function.
+
+// Returns a + b.
+static inline float op_add(float a, float b) {
+    return a + b;
+}
+
+// Returns a - b.
+static inline float op_sub(float a, float b) {
+    return a - b;
+}
+
+// Returns a * b.
+static inline float op_mul(float a, float b) {
+    return a * b;
+}
+
+// Returns a / b (plain float division; b is not checked for zero).
+static inline float op_div(float a, float b) {
+    return a / b;
+}
+
+// Apply `op` element-wise to two contiguous vectors:
+//     z[i] = op(x[i], y[i])   for i in [0, n)
+//
+// Each operand is converted to float through type_conversion_table, the
+// operation is evaluated in float, and the result is converted to dst_t.
+//
+//   n - number of elements to process
+//   z - destination, at least n elements of dst_t
+//   x - first operand, at least n elements of src0_t
+//   y - second operand, at least n elements of src1_t
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    // the index uses the same 64-bit type as n, so it cannot overflow before
+    // reaching n (an `int` index would for n > INT_MAX)
+    for (int64_t i = 0; i < n; i++) {
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
+    }
+}
+
+// Apply `op` element-wise when the second operand is not densely packed:
+//     z[i] = op(x[i], y[i % ne10])   for i in [0, n)
+// where element k of y is located k*nb10 bytes after `y`.
+//
+// Each operand is converted to float through type_conversion_table, the
+// operation is evaluated in float, and the result is converted to dst_t.
+//
+//   n    - number of elements to process
+//   ne10 - number of elements of y before its index wraps around
+//   nb10 - byte stride between consecutive elements of y
+//   z    - destination, at least n contiguous elements of dst_t
+//   x    - first operand, at least n contiguous elements of src0_t
+//   y    - base address of the strided second operand
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    // both indices are 64-bit like n and ne10, so neither the loop counter
+    // nor the wrapped index is narrowed to `int`
+    for (int64_t i = 0; i < n; i++) {
+        const int64_t i10 = i % ne10;
+        const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
+    }
+}
+
+// Compute dst = op(src0, src1) row by row for one worker thread, for a fixed
+// combination of element types.
+//
+// Preconditions (asserted):
+//   - src1 can be repeated (broadcast) to the shape of src0
+//   - src0 and dst have the same shape
+//   - dst and src0 are densely packed along dimension 0
+//   - if src1 is not densely packed along dimension 0, src1 must have exactly
+//     the shape of src0 (no broadcasting on that path)
+//
+//   params - thread index / thread count used to pick this thread's rows
+//   dst    - destination tensor; the operands are dst->src[0] and dst->src[1]
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    // presumably declares the ne0*/ne1*/ne* extents and nb0*/nb1*/nb* byte
+    // strides of src0/src1/dst that are used below -- confirm in the macro
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    // [ir0, ir1) is the range of src0 rows handled by this thread
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+    const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
+
+    if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
+        GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    }
+
+#ifdef GGML_USE_ACCELERATE
+    // pick the matching vDSP routine when all three tensors are f32; stays
+    // nullptr otherwise so the generic loop is used
+    vDSP_fn_t vDSP_op = nullptr;
+    // TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        if (op == op_add) {
+            vDSP_op = vDSP_vadd;
+        } else if (op == op_sub) {
+            vDSP_op = vDSP_vsub;
+        } else if (op == op_mul) {
+            vDSP_op = vDSP_vmul;
+        } else if (op == op_div) {
+            vDSP_op = vDSP_vdiv;
+        }
+    }
+#endif
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        // split the flat row index into the src0 indices of dims 3, 2 and 1
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        // src1 indices wrap around its own extents (broadcast in dims 1..3)
+        const int64_t i13 = i03 % ne13;
+        const int64_t i12 = i02 % ne12;
+        const int64_t i11 = i01 % ne11;
+
+        dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+        const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+        if (is_src1_contiguous) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            // nr0 = how many times the src1 row repeats along dimension 0
+            const int64_t nr0 = ne00 / ne10;
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
+                    if (vDSP_op != nullptr) {
+                        // NOTE(review): src1 is passed as the first input and
+                        // src0 as the second; this relies on the operand order
+                        // of vDSP_vsub/vDSP_vdiv -- confirm against the
+                        // Accelerate reference
+                        vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
+                        continue;
+                    }
+                }
+#endif
+                vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        } else {
+            // strided src1: one pass over the whole row, y addressed via nb10
+            vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
+        }
+    }
+}
+
+// Run the apply_binary_op instantiation that matches the element types of
+// dst->src[0], dst->src[1] and dst. Supported (src0, src1, dst) triples:
+//   (f32,  f32,  f32)
+//   (f16,  f16,  f16)   (f16,  f32, f16)   (f16,  f32, f32)
+//   (bf16, bf16, bf16)  (bf16, f32, bf16)  (bf16, f32, f32)
+// Any other combination aborts with a message naming the three types.
+//
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float, float)>
+static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const auto src0_type = dst->src[0]->type;
+    const auto src1_type = dst->src[1]->type;
+    const auto dst_type  = dst->type;
+
+    const bool src1_is_f32 = (src1_type == GGML_TYPE_F32);
+
+    switch (src0_type) {
+        case GGML_TYPE_F32:
+            if (src1_is_f32 && dst_type == GGML_TYPE_F32) { // all f32
+                apply_binary_op<op, float, float, float>(params, dst);
+                return;
+            }
+            break;
+
+        case GGML_TYPE_F16:
+            if (src1_type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) { // all f16
+                apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
+                return;
+            }
+            if (src1_is_f32 && dst_type == GGML_TYPE_F16) {
+                apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
+                return;
+            }
+            if (src1_is_f32 && dst_type == GGML_TYPE_F32) {
+                apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
+                return;
+            }
+            break;
+
+        case GGML_TYPE_BF16:
+            if (src1_type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_BF16) { // all bf16
+                apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
+                return;
+            }
+            if (src1_is_f32 && dst_type == GGML_TYPE_BF16) {
+                apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
+                return;
+            }
+            if (src1_is_f32 && dst_type == GGML_TYPE_F32) {
+                apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
+                return;
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    // no supported combination matched
+    GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+        ggml_type_name(dst_type), ggml_type_name(src0_type), ggml_type_name(src1_type));
+}
+
+// Public entry points. Each one forwards to binary_op instantiated with the
+// matching scalar operator; the operands are dst->src[0] and dst->src[1] and
+// the result is written to dst.
+
+// dst = src0 + src1
+void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_add>(params, dst);
+}
+
+// dst = src0 - src1
+void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_sub>(params, dst);
+}
+
+// dst = src0 * src1 (element-wise)
+void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_mul>(params, dst);
+}
+
+// dst = src0 / src1 (element-wise)
+void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_div>(params, dst);
+}
--- a/ml/backend/ggml/ggml/src/ggml-cpu/binary-ops.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/binary-ops.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Element-wise binary operations of the CPU backend, callable from C.
+// `params` carries the calling thread's index and the thread count; `dst` is
+// the destination tensor, whose operands are taken from dst->src[0] and
+// dst->src[1].
+void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/common.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/common.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include "ggml.h"
+#include "traits.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-impl.h"
+#include "simd-mappings.h"
+
+#ifdef __cplusplus
+
+#include <utility>
+
+// convenience functions/macros for use in template calls
+// note: these won't be required after the 'traits' lookup table is used.
+//
+// Each helper converts a single scalar. They are real functions (rather than
+// the underlying macros) so that their addresses can be stored as function
+// pointers in type_conversion_table.
+
+// float -> fp16, via GGML_CPU_FP32_TO_FP16
+static inline ggml_fp16_t f32_to_f16(float x) {
+    return GGML_CPU_FP32_TO_FP16(x);
+}
+
+// fp16 -> float, via GGML_CPU_FP16_TO_FP32
+static inline float f16_to_f32(ggml_fp16_t x) {
+    return GGML_CPU_FP16_TO_FP32(x);
+}
+
+// float -> bf16, via GGML_FP32_TO_BF16
+static inline ggml_bf16_t f32_to_bf16(float x) {
+    return GGML_FP32_TO_BF16(x);
+}
+
+// bf16 -> float, via GGML_BF16_TO_FP32
+static inline float bf16_to_f32(ggml_bf16_t x) {
+    return GGML_BF16_TO_FP32(x);
+}
+
+// int32 -> float, using the implicit conversion
+static inline float i32_to_f32(int32_t x) {
+    return x;
+}
+
+// float -> int32, using the implicit conversion (no rounding or range check)
+static inline int32_t f32_to_i32(float x) {
+    return x;
+}
+
+// identity, so that float fits the same to_f32/from_f32 interface
+static inline float f32_to_f32(float x) {
+    return x;
+}
+
+// TODO - merge this into the traits table, after using row-based conversions
+//
+// Compile-time lookup of the scalar conversion functions for element type T:
+//   to_f32   - converts a T to float
+//   from_f32 - converts a float to T
+// The primary template is only declared, so using a type without one of the
+// specializations below fails to compile.
+template <class T>
+struct type_conversion_table;
+
+// fp16 elements
+template <>
+struct type_conversion_table<ggml_fp16_t> {
+    static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
+    static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
+};
+
+// float elements (both directions are the identity)
+template <>
+struct type_conversion_table<float> {
+    static constexpr float (*to_f32)(float) = f32_to_f32;
+    static constexpr float (*from_f32)(float) = f32_to_f32;
+};
+
+// bf16 elements
+template <>
+struct type_conversion_table<ggml_bf16_t> {
+    static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
+    static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
+};
+
+// int32 elements
+template <>
+struct type_conversion_table<int32_t> {
+    static constexpr float (*to_f32)(int32_t) = i32_to_f32;
+    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
+};
+
+// Divide the rows of src0 between the worker threads and return the half-open
+// row interval [first, last) that belongs to the calling thread.
+//
+//   params - supplies the thread index (ith) and the thread count (nth)
+//   src0   - tensor whose rows (ggml_nrows) are being distributed
+//
+// Every thread is given ceil(rows / nth) consecutive rows; the end of the
+// interval is clamped to the total row count.
+static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
+    const int64_t n_rows    = ggml_nrows(src0);
+    const int64_t n_threads = params->nth;
+
+    // ceiling division: rows handled by each thread
+    const int64_t rows_per_thread = (n_rows + n_threads - 1)/n_threads;
+
+    const int64_t first = rows_per_thread*params->ith;
+    const int64_t last  = MIN(first + rows_per_thread, n_rows);
+
+    return std::make_pair(first, last);
+}
+
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
@@ -0,0 +1,11 @@
+// Package cpu carries the cgo build configuration for the C/C++ sources in
+// this directory: optimisation level, C++17, include paths, the NDEBUG and
+// GGML_USE_LLAMAFILE definitions, _GNU_SOURCE on linux, and the Accelerate
+// framework on darwin/arm64.
+package cpu
+
+// #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
+// #cgo CXXFLAGS: -std=c++17
+// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
+// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
+// #cgo linux CPPFLAGS: -D_GNU_SOURCE
+// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
+// #cgo darwin,arm64 LDFLAGS: -framework Accelerate
+import "C"
+
+// Blank import: no identifier from the package is used here. Presumably it is
+// imported so that its cgo sources are built and linked with this package --
+// confirm against the llamafile package.
+import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-cpu/llamafile"
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu_amd64.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu_amd64.go
@@ -0,0 +1,3 @@
+package cpu
+
+import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-cpu/arch/x86"
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu_arm64.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu_arm64.go
@@ -0,0 +1,3 @@
+package cpu
+
+import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-cpu/arch/arm"
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu_debug.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu_debug.go
@@ -0,0 +1,6 @@
+//go:build debug
+
+package cpu
+
+// #cgo CPPFLAGS: -DOLLAMA_DEBUG
+import "C"
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -0,0 +1,526 @@
+#pragma once
+
+// GGML CPU internal header
+
+#include "ggml.h"
+#include "ggml-impl.h"
+
+#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
+//#include <stddef.h>
+#include <stdbool.h>
+#include <string.h> // memcpy
+#include <math.h>   // fabsf
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Per-thread arguments handed to a compute kernel.
+struct ggml_compute_params {
+    int ith; // index of the calling thread
+    int nth; // total number of threads
+
+    // work buffer shared by all threads (wsize is presumably its size in bytes - confirm)
+    size_t wsize;
+    void * wdata;
+
+    struct ggml_threadpool * threadpool;
+};
+
+
+// m512bh(p) / m512i(p): use p as a __m512bh / __m512i value.
+// Under MSVC the argument is passed through untouched; every other compiler gets an explicit cast.
+#if defined(_MSC_VER)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
+// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+// (define them here so the rest of the CPU code can test a single macro per feature)
+#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __FMA__
+#define __FMA__
+#endif
+#ifndef __F16C__
+#define __F16C__
+#endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
+#ifndef __SSE3__
+#define __SSE3__
+#endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
+#endif
+
+// on s390x builds with __VEC__ set, make sure both __VXE__ and __VXE2__ are defined
+#if defined(__s390x__) && defined(__VEC__)
+#ifndef __VXE__
+#define __VXE__
+#endif  // __VXE__
+#ifndef __VXE2__
+#define __VXE2__
+#endif  // __VXE2__
+#endif  // __s390x__ && __VEC__
+
+#if defined(__ARM_FEATURE_SVE) && defined(__linux__)
+#include <sys/prctl.h>
+#endif
+
+#if defined(__ARM_NEON)
+
+// ref: https://github.com/ggml-org/llama.cpp/pull/5404
+// ggml_vld1q_u32(w,x,y,z): brace initializer built from four 32-bit lane values.
+// The MSVC form packs the lanes pairwise into two 64-bit values (x and z go into the upper 32 bits);
+// other compilers take the four lanes as-is.
+#ifdef _MSC_VER
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+#else
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+#endif // _MSC_VER
+
+#if !defined(__aarch64__)
+
+// 32-bit ARM compatibility
+
+// vaddlvq_s16
+// vpaddq_s16
+// vpaddq_s32
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+// vzip1_u8
+// vzip2_u8
+
+// Adds up all eight int16 lanes of v and returns the total as an int32.
+inline static int32_t vaddlvq_s16(int16x8_t v) {
+    // widen pairwise twice: 8 x s16 -> 4 x s32 -> 2 x s64
+    const int64x2_t pair_sums = vpaddlq_s32(vpaddlq_s16(v));
+
+    // reinterpret as four s32 lanes and add lanes 0 and 2 (one 32-bit half of each 64-bit sum)
+    const int32x4_t words = vreinterpretq_s32_s64(pair_sums);
+
+    return vgetq_lane_s32(words, 0) + vgetq_lane_s32(words, 2);
+}
+
+// Pairwise add: the low half of the result holds the adjacent-pair sums of a, the high half those of b.
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    const int16x4_t sums_a = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    const int16x4_t sums_b = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+
+    return vcombine_s16(sums_a, sums_b);
+}
+
+// Pairwise add: the low half of the result holds the adjacent-pair sums of a, the high half those of b.
+inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+    const int32x2_t sums_a = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+    const int32x2_t sums_b = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+
+    return vcombine_s32(sums_a, sums_b);
+}
+
+// Horizontal sum of the four int32 lanes of v, accumulated in lane order.
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    int32_t total = vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1);
+    total = total + vgetq_lane_s32(v, 2);
+    total = total + vgetq_lane_s32(v, 3);
+    return total;
+}
+
+// Horizontal sum of the four float lanes of v.
+// Lanes are added strictly in order 0, 1, 2, 3 so the floating-point rounding sequence is fixed.
+inline static float vaddvq_f32(float32x4_t v) {
+    float total = vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1);
+    total = total + vgetq_lane_f32(v, 2);
+    total = total + vgetq_lane_f32(v, 3);
+    return total;
+}
+
+// Largest of the four float lanes of v, found as max(max(l0, l1), max(l2, l3)).
+inline static float vmaxvq_f32(float32x4_t v) {
+    const float max_01 = MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1));
+    const float max_23 = MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
+
+    return MAX(max_01, max_23);
+}
+
+// Converts each float lane of v to int32 by passing it through roundf().
+// NOTE(review): roundf() rounds halfway cases away from zero; the AArch64 intrinsic this stands in
+// for presumably rounds ties to even, so exact .5 inputs may convert differently - confirm.
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
+}
+
+// Interleaves the lower four bytes of a and b: a0 b0 a1 b1 a2 b2 a3 b3.
+inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t zipped;
+
+    for (int i = 0; i < 4; ++i) {
+        zipped[2*i + 0] = a[i];
+        zipped[2*i + 1] = b[i];
+    }
+
+    return zipped;
+}
+
+// Interleaves the upper four bytes of a and b: a4 b4 a5 b5 a6 b6 a7 b7.
+inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t zipped;
+
+    for (int i = 0; i < 4; ++i) {
+        zipped[2*i + 0] = a[4 + i];
+        zipped[2*i + 1] = b[4 + i];
+    }
+
+    return zipped;
+}
+
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+// Two consecutive int16x8 vectors.
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+// Loads 16 consecutive int16 values starting at ptr as two 8-lane vectors.
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t out;
+
+    for (int i = 0; i < 2; ++i) {
+        out.val[i] = vld1q_s16(ptr + 8*i);
+    }
+
+    return out;
+}
+
+// Two consecutive uint8x16 vectors.
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+// Loads 32 consecutive bytes starting at ptr as two 16-lane vectors.
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t out;
+
+    for (int i = 0; i < 2; ++i) {
+        out.val[i] = vld1q_u8(ptr + 16*i);
+    }
+
+    return out;
+}
+
+// Four consecutive uint8x16 vectors.
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+// Loads 64 consecutive bytes starting at ptr as four 16-lane vectors.
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t out;
+
+    for (int i = 0; i < 4; ++i) {
+        out.val[i] = vld1q_u8(ptr + 16*i);
+    }
+
+    return out;
+}
+
+// Two consecutive int8x16 vectors.
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+// Loads 32 consecutive signed bytes starting at ptr as two 16-lane vectors.
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t out;
+
+    for (int i = 0; i < 2; ++i) {
+        out.val[i] = vld1q_s8(ptr + 16*i);
+    }
+
+    return out;
+}
+
+// Four consecutive int8x16 vectors.
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+// Loads 64 consecutive signed bytes starting at ptr as four 16-lane vectors.
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t out;
+
+    for (int i = 0; i < 4; ++i) {
+        out.val[i] = vld1q_s8(ptr + 16*i);
+    }
+
+    return out;
+}
+
+// Software table lookup for 32-bit ARM: res[i] = a[b[i]] for each of the 16 lanes.
+// An index outside 0..15 produces 0 for that lane (the behaviour of the AArch64 TBL instruction
+// this emulates) instead of reading past the end of the vector.
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    for (int i = 0; i < 16; ++i) {
+        const uint8_t idx = b[i];
+        res[i] = idx < 16 ? a[idx] : 0;
+    }
+
+    return res;
+}
+
+// Software table lookup for 32-bit ARM: res[i] = a[b[i]] for each of the 16 lanes.
+// An index outside 0..15 produces 0 for that lane (the behaviour of the AArch64 TBL instruction
+// this emulates) instead of reading past the end of the vector.
+// NOTE: not tested
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    for (int i = 0; i < 16; ++i) {
+        const uint8_t idx = b[i];
+        res[i] = idx < 16 ? a[idx] : 0;
+    }
+
+    return res;
+}
+
+#else
+
+// when __aarch64__ is defined, the ggml_* multi-vector types map straight onto the native NEON types ...
+#define ggml_int16x8x2_t  int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t  int8x16x2_t
+#define ggml_int8x16x4_t  int8x16x4_t
+
+// ... and the ggml_* helpers onto the native intrinsics of the same name
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2  vld1q_u8_x2
+#define ggml_vld1q_u8_x4  vld1q_u8_x4
+#define ggml_vld1q_s8_x2  vld1q_s8_x2
+#define ggml_vld1q_s8_x4  vld1q_s8_x4
+#define ggml_vqtbl1q_s8   vqtbl1q_s8
+#define ggml_vqtbl1q_u8   vqtbl1q_u8
+
+#endif // !defined(__aarch64__)
+
+#if !defined(__ARM_FEATURE_DOTPROD)
+
+// Fallback when the dot-product extension is unavailable: adds the 16 int8 products a[i]*b[i] into acc.
+// Lane k of the result receives products 2k, 2k+1, 8+2k and 9+2k, so only the sum over all four
+// lanes should be relied upon.
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    // widening multiplies of the lower and upper eight bytes
+    const int16x8_t prod_lo = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+    const int16x8_t prod_hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+    // pairwise-widen each product vector to s32, then combine
+    const int32x4_t partial = vaddq_s32(vpaddlq_s16(prod_lo), vpaddlq_s16(prod_hi));
+
+    return vaddq_s32(acc, partial);
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif // !defined(__ARM_FEATURE_DOTPROD)
+
+#endif // defined(__ARM_NEON)
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#endif
+
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#endif
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
+#include <immintrin.h>
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__VXE__) || defined(__VXE2__)
+#include <vecintrin.h>
+
+// element-wise helpers expressed with plain vector operators
+// note: vec_sra and vec_sr both expand to `>>`
+#define vec_neg(a)    (-(a))                // Vector Negate
+#define vec_add(a, b) ((a) + (b))           // Vector Add
+#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
+#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
+#define vec_div(a, b) ((a) / (b))           // Vector Divide
+#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
+#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
+#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
+#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
+#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
+
+// bitwise helpers, only defined when not already provided as macros
+#ifndef vec_and
+#define vec_and(a, b) ((a) & (b)) // Vector AND
+#endif
+
+#ifndef vec_or
+#define vec_or(a, b)  ((a) | (b)) // Vector OR
+#endif
+
+#ifndef vec_xor
+#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
+#endif
+
+// 16-byte vector types
+typedef signed   char char8x16_t  __attribute__((vector_size(16)));
+typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
+
+typedef int8_t  int8x16_t __attribute__((vector_size(16)));
+typedef int16_t int16x8_t __attribute__((vector_size(16)));
+typedef int32_t int32x4_t __attribute__((vector_size(16)));
+
+typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
+typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
+typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
+
+typedef float  float32x4_t  __attribute__((vector_size(16)));
+typedef double double64x2_t __attribute__((vector_size(16)));
+
+typedef signed   long long long64x2_t  __attribute__((vector_size(16)));
+typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
+
+// Two consecutive uint8x16 vectors.
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+// Loads two vectors via vec_xl at offsets 0 and 16 from ptr.
+inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t out;
+
+    for (int i = 0; i < 2; ++i) {
+        out.val[i] = vec_xl(16*i, ptr);
+    }
+
+    return out;
+}
+
+// Four consecutive uint8x16 vectors.
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+// Loads four vectors via vec_xl at offsets 0, 16, 32 and 48 from ptr.
+inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t out;
+
+    for (int i = 0; i < 4; ++i) {
+        out.val[i] = vec_xl(16*i, ptr);
+    }
+
+    return out;
+}
+
+// Four consecutive int8x16 vectors.
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+// Loads four vectors via vec_xl at offsets 0, 16, 32 and 48 from ptr.
+inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) {
+    ggml_int8x16x4_t out;
+
+    for (int i = 0; i < 4; ++i) {
+        out.val[i] = vec_xl(16*i, ptr);
+    }
+
+    return out;
+}
+
+// Two consecutive int16x8 vectors.
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+// Loads two vectors via vec_xl at offsets 0 and 16 from ptr.
+inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) {
+    ggml_int16x8x2_t out;
+
+    for (int i = 0; i < 2; ++i) {
+        out.val[i] = vec_xl(16*i, ptr);
+    }
+
+    return out;
+}
+
+/*
+    ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
+    !          or iq4_nl for example implementation.
+*/
+// Element-wise table lookup: lane i of the result is a[b[i]].
+inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
+    int8x16_t picked;
+
+    for (int lane = 0; lane < 16; ++lane) {
+        picked[lane] = a[b[lane]];
+    }
+
+    return picked;
+}
+
+// Returns the element-wise sum of two 8 x int16 vectors derived from a and b:
+//   v_abe - bytes {0,1, 4,5, 8,9, 12,13} of a followed by the same bytes of b (selected with vec_perm)
+//   v_abo - vec_pack of a and b reinterpreted as int32x4
+// NOTE(review): presumably v_abe/v_abo are the even-/odd-indexed int16 elements, which would make
+// this a pairwise add of adjacent elements - confirm against the s390x vec_pack definition.
+inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
+    const uchar8x16_t v_maske = {  0,  1,  4,  5,  8,  9, 12, 13,
+                                  16, 17, 20, 21, 24, 25, 28, 29 };
+
+    const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b);
+    const int16x8_t v_abe = vec_perm(a, b, v_maske);
+    return v_abo + v_abe;
+}
+
+/**
+ * Horizontal sum of a 4 x float vector: v is added to vec_reve(v) and the first two lanes
+ * of that result are summed.
+ *
+ * @see https://github.com/ggml-org/llama.cpp/pull/14037
+ */
+inline static float vec_hsum_f32x4(float32x4_t v) {
+    const float32x4_t folded = v + vec_reve(v);
+
+    return folded[0] + folded[1];
+}
+
+// Horizontal sum of a 4 x int32 vector: v is added to vec_reve(v) and the first two lanes
+// of that result are summed.
+inline static int32_t vec_hsum_i32x4(int32x4_t v) {
+    const int32x4_t folded = v + vec_reve(v);
+
+    return folded[0] + folded[1];
+}
+
+// Accumulates the int8 products of a and b into the four int32 lanes of acc.
+inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    // vec_mule / vec_mulo products, summed as 16-bit values
+    const int16x8_t prod_sum = vec_mule(a, b) + vec_mulo(a, b);
+
+    // unpack both halves (vec_unpackh / vec_unpackl) and fold them into the accumulator
+    const int32x4_t widened = vec_unpackh(prod_sum) + vec_unpackl(prod_sum);
+
+    return acc + widened;
+}
+
+#endif
+
+#if defined(__loongarch_sx)
+/* float type data load instructions */
+// Returns a 128-bit vector whose four float lanes all hold val.
+static __m128 __lsx_vreplfr2vr_s(const float val) {
+    const v4f32 splat = {val, val, val, val};
+
+    return (__m128)splat;
+}
+#endif
+
+#if defined(__loongarch_asx)
+// Returns a 256-bit vector whose eight float lanes all hold val.
+static __m256 __lasx_xvreplfr2vr_s(const float val) {
+    const v8f32 splat = {val, val, val, val, val, val, val, val};
+
+    return (__m256)splat;
+}
+#endif
+
+// TODO: move to ggml-threading
+// threadpool primitives used by the CPU kernels; only declared here, implemented elsewhere
+void ggml_barrier(struct ggml_threadpool * tp);
+
+// NOTE(review): presumably set / add-and-return on a per-threadpool chunk counter - confirm at the definition
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int  ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -0,0 +1,688 @@
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-cpu.h"
+#include "repack.h"
+#include "traits.h"
+#include "ggml-impl.h"
+#include "amx/amx.h"
+
+#include <cctype>
+#include <new>
+#include <string>
+#include <vector>
+
+#ifdef GGML_USE_CPU_HBM
+#    include "hbm.h"
+#endif
+
+#ifdef GGML_USE_CPU_KLEIDIAI
+#    include "kleidiai/kleidiai.h"
+#endif
+
+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+#    include "spacemit/ime.h"
+#endif
+
+#if defined(_WIN32)
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#else
+#    include <unistd.h>
+#endif
+
+#if defined(__APPLE__)
+#    include <sys/sysctl.h>
+#    include <sys/types.h>
+#endif
+
+// ggml-backend interface
+
+// Returns the optional ("extra") CPU buffer types compiled into this build.
+// The list is assembled on the first call and contains only the types whose getter returns non-null.
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
+    static std::vector<ggml_backend_buffer_type_t> extra_types = []() {
+        std::vector<ggml_backend_buffer_type_t> found;
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+        if (ggml_backend_amx_buffer_type()) {
+            found.push_back(ggml_backend_amx_buffer_type());
+        }
+#endif
+
+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+        if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
+            found.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
+        }
+#endif
+
+#ifdef GGML_USE_CPU_KLEIDIAI
+        if (ggml_backend_cpu_kleidiai_buffer_type()) {
+            found.push_back(ggml_backend_cpu_kleidiai_buffer_type());
+        }
+#endif
+
+#ifdef GGML_USE_CPU_REPACK
+        if (ggml_backend_cpu_repack_buffer_type()) {
+            found.push_back(ggml_backend_cpu_repack_buffer_type());
+        }
+#endif
+
+        return found;
+    }();
+
+    return extra_types;
+}
+
+// Returns the extra buffer types as an array terminated by a nullptr entry.
+// The array is a function-local static built on the first call; `device` is not used.
+static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
+    GGML_UNUSED(device);
+
+    static std::vector<ggml_backend_buffer_type_t> terminated = [] {
+        std::vector<ggml_backend_buffer_type_t> list(ggml_backend_cpu_get_extra_buffer_types());
+        list.push_back(nullptr);
+        return list;
+    }();
+
+    return terminated.data();
+}
+
+// Reports whether buft is one of the extra CPU buffer types (compared by pointer identity).
+static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
+    const std::vector<ggml_backend_buffer_type_t> & known = ggml_backend_cpu_get_extra_buffer_types();
+
+    for (size_t i = 0; i < known.size(); ++i) {
+        if (known[i] == buft) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// CPU backend - backend (stream)
+
+// Per-backend state of the CPU backend.
+struct ggml_backend_cpu_context {
+    // thread count and optional threadpool passed to ggml_graph_plan
+    int                 n_threads;
+    ggml_threadpool_t   threadpool;
+
+    // work buffer reused across graph computations; grown when a plan needs more than work_size bytes
+    uint8_t *           work_data;
+    size_t              work_size;
+
+    // copied into every compute plan created by this backend
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
+};
+
+// Name of the backend: always the literal "CPU".
+static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+
+    return "CPU";
+}
+
+// Destroys a CPU backend: releases the work buffer, then the context, then the backend object itself.
+static void ggml_backend_cpu_free(ggml_backend_t backend) {
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    delete[] ctx->work_data;
+    delete ctx;
+
+    delete backend;
+}
+
+// A graph plan as handed out by the CPU backend: the compute plan plus a copy of the graph struct
+// it was created for.
+struct ggml_backend_plan_cpu {
+    struct ggml_cplan cplan;
+    struct ggml_cgraph cgraph;
+};
+
+// Creates a plan for computing cgraph with the backend's current thread settings and allocates the
+// work buffer the plan requires. The backend's abort callback is copied into the plan.
+//
+// Returns NULL if an allocation fails. Allocations use the non-throwing form of new, so the failure
+// checks are reachable and no C++ exception escapes through this callback.
+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_backend_plan_cpu * cpu_plan = new (std::nothrow) ggml_backend_plan_cpu;
+    if (cpu_plan == NULL) {
+        return NULL;
+    }
+
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+
+    if (cpu_plan->cplan.work_size > 0) {
+        cpu_plan->cplan.work_data = new (std::nothrow) uint8_t[cpu_plan->cplan.work_size];
+        if (cpu_plan->cplan.work_data == NULL) {
+            delete cpu_plan;
+            return NULL;
+        }
+    }
+
+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return cpu_plan;
+}
+
+// Releases a plan: frees its work buffer first, then the plan object. `backend` is not used.
+static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_UNUSED(backend);
+
+    struct ggml_backend_plan_cpu * owned = (struct ggml_backend_plan_cpu *)plan;
+
+    delete[] owned->cplan.work_data;
+    delete owned;
+}
+
+// Runs the graph stored in the plan with the plan's compute settings and returns the resulting status.
+// `backend` is not used.
+static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_UNUSED(backend);
+
+    struct ggml_backend_plan_cpu * stored = (struct ggml_backend_plan_cpu *)plan;
+
+    return ggml_graph_compute(&stored->cgraph, &stored->cplan);
+}
+
+// Plans and computes cgraph using the backend's cached work buffer.
+// The buffer is grown when the plan needs more than is currently allocated. `batch_size` is not used.
+//
+// Returns GGML_STATUS_ALLOC_FAILED if the work buffer cannot be grown, otherwise the status of
+// ggml_graph_compute.
+static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+
+    if (cpu_ctx->work_size < cplan.work_size) {
+        delete[] cpu_ctx->work_data;
+        // forget the old buffer before reallocating so the context never holds a dangling
+        // pointer or stale size, whatever happens next
+        cpu_ctx->work_data = NULL;
+        cpu_ctx->work_size = 0;
+
+        // non-throwing new: a plain new[] would throw instead of returning NULL
+        cpu_ctx->work_data = new (std::nothrow) uint8_t[cplan.work_size];
+        if (cpu_ctx->work_data == NULL) {
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+        cpu_ctx->work_size = cplan.work_size;
+    }
+    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
+
+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
+    return ggml_graph_compute(cgraph, &cplan);
+
+    GGML_UNUSED(batch_size);
+}
+
+// Function table of the CPU backend. Entries set to NULL are not provided by this backend.
+// The initializers are positional: their order must match the member order of ggml_backend_i.
+static const struct ggml_backend_i ggml_backend_cpu_i = {
+    /* .get_name                = */ ggml_backend_cpu_get_name,
+    /* .free                    = */ ggml_backend_cpu_free,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update       = */ NULL,
+    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
+    /* .event_record            = */ NULL,
+    /* .event_wait              = */ NULL,
+    /* .graph_optimize          = */ NULL,
+};
+
+// Returns a pointer to the fixed 16-byte GUID that identifies the CPU backend.
+static ggml_guid_t ggml_backend_cpu_guid(void) {
+    static ggml_guid cpu_guid = {
+        0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a,
+        0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89,
+    };
+
+    return &cpu_guid;
+}
+
+// Creates a CPU backend with default settings: GGML_DEFAULT_N_THREADS threads, no threadpool,
+// no work buffer and no abort callback.
+//
+// Returns NULL if an allocation fails. Allocations use the non-throwing form of new so that the
+// failure checks are reachable and no C++ exception escapes this function.
+ggml_backend_t ggml_backend_cpu_init(void) {
+    // initialize CPU backend now to avoid slowing the first graph computation
+    ggml_cpu_init();
+
+    struct ggml_backend_cpu_context * ctx = new (std::nothrow) ggml_backend_cpu_context;
+    if (ctx == NULL) {
+        return NULL;
+    }
+
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool          = NULL;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;
+
+    ggml_backend_t cpu_backend = new (std::nothrow) ggml_backend {
+        /* .guid    = */ ggml_backend_cpu_guid(),
+        /* .iface   = */ ggml_backend_cpu_i,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context = */ ctx,
+    };
+
+    if (cpu_backend == NULL) {
+        delete ctx;
+        return NULL;
+    }
+
+    return cpu_backend;
+}
+
+// True when backend is non-NULL and its GUID matches the CPU backend's GUID.
+bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return false;
+    }
+
+    return ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
+}
+
+// Sets the thread count used when this backend plans a graph. Asserts that backend_cpu is a CPU backend.
+void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    ((struct ggml_backend_cpu_context *)backend_cpu->context)->n_threads = n_threads;
+}
+
+// Installs threadpool on the backend. If a different threadpool was installed before, it is paused
+// first. Asserts that backend_cpu is a CPU backend.
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+    ggml_threadpool_t previous = cpu_ctx->threadpool;
+    if (previous != NULL && previous != threadpool) {
+        // switching pools: suspend the one being replaced
+        ggml_threadpool_pause(previous);
+    }
+
+    cpu_ctx->threadpool = threadpool;
+}
+
+// Stores the abort callback and its user data; both are copied into every plan the backend creates.
+// Asserts that backend_cpu is a CPU backend.
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+    cpu_ctx->abort_callback_data = abort_callback_data;
+    cpu_ctx->abort_callback      = abort_callback;
+}
+
+// CPU backend - device
+
+// Device-level state of the CPU backend: a human-readable description of the processor.
+// The description defaults to "CPU" and is replaced by the platform's CPU brand string when one
+// can be read (sysctl on macOS, /proc/cpuinfo on Linux, the registry on Windows).
+struct ggml_backend_cpu_device_context {
+    std::string description = "CPU";
+
+    ggml_backend_cpu_device_context() {
+#ifdef __APPLE__
+        size_t len = 0;
+        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
+            description.resize(len);
+            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
+            // cut at the first NUL so the terminator is not kept as part of the string,
+            // same as the Windows branch below
+            if (description.find('\0') != std::string::npos) {
+                description.resize(description.find('\0'));
+            }
+        }
+#elif defined(__linux__)
+        FILE * f = fopen("/proc/cpuinfo", "r");
+        if (f) {
+            char buf[1024];
+            while (fgets(buf, sizeof(buf), f)) {
+                if (strncmp(buf, "model name", 10) == 0) {
+                    char * p = strchr(buf, ':');
+                    if (p) {
+                        p++;
+                        // std::isspace requires a value representable as unsigned char
+                        while (std::isspace((unsigned char) *p)) {
+                            p++;
+                        }
+                        // strip trailing whitespace without ever indexing before p
+                        size_t n = strlen(p);
+                        while (n > 0 && std::isspace((unsigned char) p[n - 1])) {
+                            p[--n] = '\0';
+                        }
+                        description = p;
+                        break;
+                    }
+                }
+            }
+            fclose(f);
+        }
+#elif defined(_WIN32)
+        HKEY hKey;
+        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                        0,
+                        KEY_READ,
+                        &hKey) == ERROR_SUCCESS) {
+            // first query obtains the value size, second one reads the value
+            DWORD cpu_brand_size = 0;
+            if (RegQueryValueExA(hKey,
+                                "ProcessorNameString",
+                                NULL,
+                                NULL,
+                                NULL,
+                                &cpu_brand_size) == ERROR_SUCCESS) {
+                description.resize(cpu_brand_size);
+                if (RegQueryValueExA(hKey,
+                                    "ProcessorNameString",
+                                    NULL,
+                                    NULL,
+                                    (LPBYTE)&description[0], // NOLINT
+                                    &cpu_brand_size) == ERROR_SUCCESS) {
+                    if (description.find('\0') != std::string::npos) {
+                        description.resize(description.find('\0'));
+                    }
+                }
+            }
+            RegCloseKey(hKey);
+        }
+#endif
+    }
+};
+
+// Name of the device: always the literal "CPU".
+static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+    return "CPU";
+}
+
+// Returns the description string stored in the device context. The pointer refers to storage owned
+// by that context.
+static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
+    const struct ggml_backend_cpu_device_context * dev_ctx = (struct ggml_backend_cpu_device_context *)dev->context;
+
+    return dev_ctx->description.c_str();
+}
+
+// Reports system memory in bytes through *free and *total. `dev` is not used.
+// If the operating system query fails, both values are reported as 0 rather than garbage.
+static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+#ifdef _WIN32
+    MEMORYSTATUSEX status;
+    status.dwLength = sizeof(status);
+    if (GlobalMemoryStatusEx(&status)) {
+        *total = status.ullTotalPhys;
+        *free = status.ullAvailPhys;
+    } else {
+        // status is not filled in on failure
+        *total = 0;
+        *free = 0;
+    }
+#else
+    const long pages = sysconf(_SC_PHYS_PAGES);
+    const long page_size = sysconf(_SC_PAGE_SIZE);
+
+    if (pages < 0 || page_size < 0) {
+        // sysconf reports errors as -1; do not turn that into a huge unsigned size
+        *total = 0;
+    } else {
+        // multiply as size_t to avoid signed overflow of long
+        *total = (size_t) pages * (size_t) page_size;
+    }
+
+    // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
+    *free = *total;
+#endif // _WIN32
+
+    GGML_UNUSED(dev);
+}
+
+// Device type: always GGML_BACKEND_DEVICE_TYPE_CPU.
+static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
+}
+
+// Fills *props with the device's memory figures, name, description, type and capability flags.
+static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    props->type        = ggml_backend_cpu_device_get_type(dev);
+    props->name        = ggml_backend_cpu_device_get_name(dev);
+    props->description = ggml_backend_cpu_device_get_description(dev);
+
+    // of the capabilities, only creating a buffer from a host pointer is advertised
+    props->caps = {
+        /* .async                 = */ false,
+        /* .host_buffer           = */ false,
+        /* .buffer_from_host_ptr  = */ true,
+        /* .events                = */ false,
+    };
+}
+
+// Creates a new CPU backend via ggml_backend_cpu_init(); neither `dev` nor `params` is consulted.
+static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+
+    return ggml_backend_cpu_init();
+}
+
+// Buffer type of the device: whatever ggml_backend_cpu_buffer_type() returns. `dev` is not used.
+static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+
+    return ggml_backend_cpu_buffer_type();
+}
+
+// Forwards ptr and size to ggml_backend_cpu_buffer_from_ptr and returns its result.
+// `dev` and `max_tensor_size` are not used.
+static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+}
+
+// Decides whether the CPU device can execute the tensor operation `op`.
+//
+// Order of checks:
+//   1. NONE / RESHAPE / VIEW / PERMUTE / TRANSPOSE are accepted unconditionally.
+//   2. If one of the first four sources lives in an extra buffer type, that buffer type decides.
+//   3. Otherwise a per-op type check applies; ops without a specific rule are accepted.
+static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+        default:
+            break;
+    }
+
+    // check extra buffer types
+    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
+    for (int i = 0; i < 4; i++) {
+        const struct ggml_tensor * src = op->src[i];
+        if (!src || !src->buffer) {
+            continue;
+        }
+        if (!ggml_backend_cpu_is_extra_buffer_type(src->buffer->buft)) {
+            continue;
+        }
+        // the first source found in an extra buffer type has the final say
+        auto * buf_extra = (ggml::cpu::extra_buffer_type *) src->buffer->buft->context;
+        return buf_extra->supports_op(dev, op);
+    }
+
+    switch (op->op) {
+        case GGML_OP_CPY:
+        case GGML_OP_SET_ROWS:
+            switch (op->type) {
+                // missing type_traits.from_float
+                case GGML_TYPE_IQ3_XXS:
+                case GGML_TYPE_IQ3_S:
+                case GGML_TYPE_IQ2_XXS:
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_S:
+                case GGML_TYPE_IQ1_S:
+                case GGML_TYPE_IQ1_M:
+                    return false;
+                default:
+                    return true;
+            }
+        case GGML_OP_MUL_MAT:
+            if (src1->type == GGML_TYPE_F32) {
+                return true;
+            }
+            return src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
+        case GGML_OP_SOFT_MAX_BACK: {
+            if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
+                return false;
+            }
+
+            // second float slot of op_params; only a zero value is accepted
+            float max_bias = 0.0f;
+            memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
+
+            return max_bias == 0.0f;
+        }
+        case GGML_OP_IM2COL_BACK:
+            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
+        case GGML_OP_GET_ROWS_BACK:
+            return src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16;
+        case GGML_OP_OUT_PROD: {
+            // src0 is either F32, or quantized with dims 2 and 3 matching src1
+            const bool src0_ok =
+                src0->type == GGML_TYPE_F32 ||
+                (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3]);
+
+            return src0_ok && src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        }
+        default:
+            return true;
+    }
+}
+
+// A buffer type is usable by the CPU device when it is a host buffer type or one of the extra CPU
+// buffer types. `dev` is not used.
+static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(dev);
+
+    if (ggml_backend_buft_is_host(buft)) {
+        return true;
+    }
+
+    return ggml_backend_cpu_is_extra_buffer_type(buft);
+}
+
+// Function table of the CPU device. Entries set to NULL are not provided by this device.
+// The initializers are positional: their order must match the member order of ggml_backend_device_i.
+static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
+    /* .get_name             = */ ggml_backend_cpu_device_get_name,
+    /* .get_description      = */ ggml_backend_cpu_device_get_description,
+    /* .get_memory           = */ ggml_backend_cpu_device_get_memory,
+    /* .get_type             = */ ggml_backend_cpu_device_get_type,
+    /* .get_props            = */ ggml_backend_cpu_device_get_props,
+    /* .init_backend         = */ ggml_backend_cpu_device_init_backend,
+    /* .get_buffer_type      = */ ggml_backend_cpu_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
+    /* .supports_op          = */ ggml_backend_cpu_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_cpu_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+// CPU backend - backend (reg)
+
+// Name of the backend registry entry: always the literal "CPU".
+static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+
+    return "CPU";
+}
+
+// The CPU registry always exposes exactly one device.
+static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+
+    return 1;
+}
+
+// Returns the single CPU device; index must be 0 (asserted).
+// The device and its context are function-local statics, so the `reg` stored in the device is the
+// one passed on the first call.
+static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_cpu_device_context device_ctx;
+    static ggml_backend_device cpu_device = {
+        /* .iface   = */ ggml_backend_cpu_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ &device_ctx,
+    };
+
+    return &cpu_device;
+}
+
+// This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
+// and additionally to allow other backends to expose their own list of features that applications can query using the same API
+static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
+    static std::vector<ggml_backend_feature> features = []() {
+        ggml_cpu_init();
+
+        std::vector<ggml_backend_feature> features;
+        if (ggml_cpu_has_sse3()) {
+            features.push_back({ "SSE3", "1" });
+        }
+        if (ggml_cpu_has_ssse3()) {
+            features.push_back({ "SSSE3", "1" });
+        }
+        if (ggml_cpu_has_avx()) {
+            features.push_back({ "AVX", "1" });
+        }
+        if (ggml_cpu_has_avx_vnni()) {
+            features.push_back({ "AVX_VNNI", "1" });
+        }
+        if (ggml_cpu_has_avx2()) {
+            features.push_back({ "AVX2", "1" });
+        }
+        if (ggml_cpu_has_f16c()) {
+            features.push_back({ "F16C", "1" });
+        }
+        if (ggml_cpu_has_fma()) {
+            features.push_back({ "FMA", "1" });
+        }
+        if (ggml_cpu_has_bmi2()) {
+            features.push_back({ "BMI2", "1" });
+        }
+        if (ggml_cpu_has_avx512()) {
+            features.push_back({ "AVX512", "1" });
+        }
+        if (ggml_cpu_has_avx512_vbmi()) {
+            features.push_back({ "AVX512_VBMI", "1" });
+        }
+        if (ggml_cpu_has_avx512_vnni()) {
+            features.push_back({ "AVX512_VNNI", "1" });
+        }
+        if (ggml_cpu_has_avx512_bf16()) {
+            features.push_back({ "AVX512_BF16", "1" });
+        }
+        if (ggml_cpu_has_amx_int8()) {
+            features.push_back({ "AMX_INT8", "1" });
+        }
+        if (ggml_cpu_has_neon()) {
+            features.push_back({ "NEON", "1" });
+        }
+        if (ggml_cpu_has_arm_fma()) {
+            features.push_back({ "ARM_FMA", "1" });
+        }
+        if (ggml_cpu_has_fp16_va()) {
+            features.push_back({ "FP16_VA", "1" });
+        }
+        if (ggml_cpu_has_matmul_int8()) {
+            features.push_back({ "MATMUL_INT8", "1" });
+        }
+        if (ggml_cpu_has_sve()) {
+            features.push_back({ "SVE", "1" });
+        }
+        if (ggml_cpu_has_dotprod()) {
+            features.push_back({ "DOTPROD", "1" });
+        }
+        if (ggml_cpu_get_sve_cnt() > 0) {
+            static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
+            features.push_back({ "SVE_CNT", sve_cnt.c_str() });
+        }
+        if (ggml_cpu_has_sme()) {
+            features.push_back({ "SME", "1" });
+        }
+        if (ggml_cpu_has_riscv_v()) {
+            features.push_back({ "RISCV_V", "1" });
+        }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
+        if (ggml_cpu_has_vsx()) {
+            features.push_back({ "VSX", "1" });
+        }
+        if (ggml_cpu_has_vxe()) {
+            features.push_back({ "VXE", "1" });
+        }
+        if (ggml_cpu_has_wasm_simd()) {
+            features.push_back({ "WASM_SIMD", "1" });
+        }
+        if (ggml_cpu_has_llamafile()) {
+            features.push_back({ "LLAMAFILE", "1" });
+        }
+    #ifdef GGML_USE_ACCELERATE
+        features.push_back({ "ACCELERATE", "1" });
+    #endif
+    #ifdef GGML_USE_CPU_HBM
+        features.push_back({ "CPU_HBM", "1" });
+    #endif
+    #ifdef GGML_USE_OPENMP
+        features.push_back({ "OPENMP", "1" });
+    #endif
+    #ifdef GGML_USE_CPU_KLEIDIAI
+        features.push_back({ "KLEIDIAI", "1" });
+    #endif
+    #ifdef GGML_USE_CPU_REPACK
+        features.push_back({ "REPACK", "1" });
+    #endif
+
+        features.push_back({ nullptr, nullptr });
+
+        return features;
+    }();
+
+    return features.data();
+
+    GGML_UNUSED(reg);
+}
+
+// Registry callback: looks up an entry point by its exported name.
+// Returns the matching function cast to void *, or NULL when the name is not
+// one of the names handled below.
+static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+
+    const auto is = [name](const char * candidate) {
+        return strcmp(name, candidate) == 0;
+    };
+
+    if (is("ggml_backend_set_n_threads")) {
+        // going through the typedef makes the compiler check the signature
+        ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
+        return (void *)fct;
+    }
+    if (is("ggml_backend_dev_get_extra_bufts")) {
+        ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
+        return (void *)fct;
+    }
+    if (is("ggml_backend_get_features")) {
+        return (void *)ggml_backend_cpu_get_features;
+    }
+    if (is("ggml_backend_set_abort_callback")) {
+        return (void *)ggml_backend_cpu_set_abort_callback;
+    }
+    if (is("ggml_backend_cpu_numa_init")) {
+        return (void *)ggml_numa_init;
+    }
+    if (is("ggml_backend_cpu_is_numa")) {
+        return (void *)ggml_is_numa;
+    }
+
+    // threadpool - TODO:  move to ggml-base
+    if (is("ggml_threadpool_new")) {
+        return (void *)ggml_threadpool_new;
+    }
+    if (is("ggml_threadpool_free")) {
+        return (void *)ggml_threadpool_free;
+    }
+    if (is("ggml_backend_cpu_set_threadpool")) {
+        return (void *)ggml_backend_cpu_set_threadpool;
+    }
+
+    return NULL;
+}
+
+// Registration interface table for the CPU backend: wires the four registry
+// callbacks defined above (name, device count, device lookup, proc-address lookup).
+static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
+    /* .get_name         = */ ggml_backend_cpu_reg_get_name,
+    /* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_cpu_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
+};
+
+// Returns the CPU backend registration object. The object is a function-local
+// static, so every call returns the same pointer; ggml_cpu_init() is invoked on
+// each call before the object is handed out.
+ggml_backend_reg_t ggml_backend_cpu_reg(void) {
+    // init CPU feature detection
+    ggml_cpu_init();
+
+    static struct ggml_backend_reg cpu_reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_cpu_reg_i,
+        /* .context     = */ NULL,
+    };
+
+    return &cpu_reg;
+}
+
+// NOTE(review): presumably emits the entry points used when this backend is built
+// as a dynamically loaded library — confirm against the macro's definition.
+GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
--- a/ml/backend/ggml/ggml/src/ggml-cpu/hbm.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/hbm.cpp
@@ -0,0 +1,55 @@
+#ifdef GGML_USE_CPU_HBM
+
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+
+#include "hbm.h"
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+// Buffer-type callback: reports the fixed name of the HBM buffer type.
+static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);  // the name does not depend on the buffer type object
+
+    return "CPU_HBM";
+}
+
+// Buffer callback: releases the buffer's memory with hbw_free.
+// NOTE(review): buffer->context is presumably the pointer obtained from
+// hbw_posix_memalign in the allocator below — confirm against
+// ggml_backend_cpu_buffer_from_ptr.
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+// Buffer-type callback: allocates `size` bytes with hbw_posix_memalign (aligned to
+// the CPU buffer type's alignment) and wraps them in a CPU buffer. The wrapper's
+// buffer type and free callback are then overridden so the memory is reported as
+// `buft` and released through hbw_free.
+// Returns NULL, after logging an error, when the allocation fails.
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                                                           size_t                     size) {
+    const size_t alignment = ggml_backend_cpu_buffer_type_get_alignment(buft);
+
+    void * mem = nullptr;
+    if (hbw_posix_memalign(&mem, alignment, size) != 0) {
+        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t hbm_buffer = ggml_backend_cpu_buffer_from_ptr(mem, size);
+    hbm_buffer->buft              = buft;
+    hbm_buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return hbm_buffer;
+}
+
+// Returns the HBM buffer type, whose alloc callback goes through hbw_posix_memalign.
+// The descriptor is a function-local static, so every call yields the same pointer.
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
+    static struct ggml_backend_buffer_type hbm_buft = {
+        /* .iface    = */ {
+                           /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
+                           /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+                           /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+                           /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
+                           /* .get_alloc_size   = */ nullptr,  // defaults to ggml_nbytes
+                           /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+                           },
+        /* .context  = */ nullptr,
+    };
+
+    return &hbm_buft;
+}
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/hbm.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/hbm.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+// GGML CPU internal header
+
+// Returns the "CPU_HBM" buffer type. The definition in hbm.cpp is only compiled
+// when GGML_USE_CPU_HBM is defined.
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
--- a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go
@@ -0,0 +1,5 @@
+package llamafile
+
+// #cgo CXXFLAGS: -std=c++17
+// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../.. -I${SRCDIR}/../../../include
+import "C"
--- a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h
@@ -0,0 +1,333 @@
+#pragma once
+
+typedef vector unsigned char vec_t;
+typedef __vector_quad acc_t;
+
+template <typename TA>
+class tinyBLAS_Q0_PPC {
+  public:
+    tinyBLAS_Q0_PPC(int64_t k,
+                    const TA *A, int64_t lda,
+                    const block_q8_0 *B, int64_t ldb,
+                    float *C, int64_t ldc,
+                    int ith, int nth);
+
+    void matmul(int64_t m, int64_t n);
+    // Tiled quantized matmul. The m x n output is cut into (m / mc) x (n / nc) tiles,
+    // a contiguous range of tiles is assigned to this thread (ith of nth), and for
+    // each tile k is walked in steps of kc: the A and B panels are packed into
+    // scratch buffers and handed to KERNEL_Q0.
+    void matmul_tiled_q0(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) {
+        // scratch: packed A panel, packed B panel, and the per-block sums of A's quants
+        vec_t A_pack[mc*kc*2];
+        vec_t B_pack[nc*kc*2];
+        int comparray[mc*kc];
+        constexpr bool a_is_q4 = std::is_same_v<TA, block_q4_0>;
+
+        const int64_t tiles_m = m / mc;
+        const int64_t tiles_n = n / nc;
+        const int64_t n_tiles = tiles_n * tiles_m;
+
+        // ceil(n_tiles / nth) tiles per thread, clamped at the end of the tile list
+        const int64_t per_thread = (n_tiles + nth - 1) / nth;
+        const int64_t first      = per_thread * ith;
+        const int64_t last       = (first + per_thread > n_tiles) ? n_tiles : first + per_thread;
+
+        for (int64_t tile = first; tile < last; ++tile) {
+            const int64_t ii = (tile / tiles_n) * mc;
+            const int64_t jj = (tile % tiles_n) * nc;
+            for (int64_t kk = 0; kk < k; kk += kc) {
+                if constexpr(a_is_q4) {
+                    packNormalInt4_large(A + ii*lda + kk, lda, mc, 4, (int8_t*)A_pack, comparray);
+                } else {
+                    packNormal_large<int8_t, vector signed char>(A + ii*lda + kk, lda, mc, 8, (int8_t*)A_pack, false, comparray);
+                }
+                // B is packed with flip = true (each byte XORed with 0x80)
+                packNormal_large<uint8_t, vector unsigned char>(B + jj*ldb + kk, ldb, nc, 8, (uint8_t*)B_pack, true);
+                KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack, comparray);
+            }
+        }
+    }
+
+  private:
+    // Stores an RM x RN block of results into C, overwriting what was there.
+    // fin_res[idx + I] supplies the RN float lanes for row offset I; lane J is
+    // written to C[(ii + I) + (jj + J) * ldc].
+    inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
+        for (int I = 0; I < RM; I++) {
+            const float * lanes = (const float *)&fin_res[idx + I];
+            for (int J = 0; J < RN; J++) {
+                C[ii + ((jj + J) * ldc) + I] = lanes[J];
+            }
+        }
+    }
+
+    // Same addressing as save_res, but accumulates: lane J of fin_res[idx + I] is
+    // added to C[(ii + I) + (jj + J) * ldc] instead of replacing it.
+    inline void add_save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
+        for (int I = 0; I < RM; I++) {
+            const float * lanes = (const float *)&fin_res[idx + I];
+            for (int J = 0; J < RN; J++) {
+                C[ii + ((jj + J) * ldc) + I] += lanes[J];
+            }
+        }
+    }
+
+    // Folds one MMA accumulator into the running float results. Each of the four
+    // accumulator rows is converted to float, offset by comparray[c_idx + row] * -128,
+    // multiplied by the scale vector vs[s_idx + row] and added to fin_res[s_idx + row].
+    // NOTE(review): the -128 term presumably compensates for the 0x80 XOR applied
+    // to B during packing — confirm.
+    template<typename ArrayType>
+    inline void compute(acc_t* ACC, int c_idx, int s_idx, ArrayType& comparray, vector float* vs, vector float* fin_res) {
+        vector signed int acc_rows[4];
+        __builtin_mma_disassemble_acc(acc_rows, ACC);
+        for (int row = 0; row < 4; row++) {
+            const vector float correction = vec_splats((float)(((double)comparray[c_idx+row]) * -128.0));
+            const vector float corrected  = vec_add(vec_ctf(acc_rows[row], 0), correction);
+            fin_res[s_idx+row] = vec_madd(corrected, vs[s_idx+row], fin_res[s_idx+row]);
+        }
+    }
+
+    // Unpacks 16 bytes of packed 4-bit values held in c[1]: the low nibbles go to
+    // c[0], the high nibbles stay in c[1], and 8 is subtracted from every value.
+    // The sum of all 32 resulting values is written to *ca.
+    inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
+        const vector signed char   nibble_mask = vec_splats((signed char)0xF);
+        const vector unsigned char shift4      = vec_splats((unsigned char)0x4);
+        const vector signed char   bias8       = vec_splats((signed char)0x8);
+        const vector signed int    zero        = {0};
+
+        // c[0] must be derived first: it reads the still-packed c[1]
+        c[0] = vec_sub(vec_and(c[1], nibble_mask), bias8);
+        c[1] = vec_sub(vec_sr(c[1], shift4), bias8);
+
+        vector signed int total = vec_add(vec_sum4s(c[0], zero), vec_sum4s(c[1], zero));
+        *ca = total[0] + total[1] + total[2] + total[3];
+    }
+
+    // Interleaves four source vectors with two rounds of vec_perm and stores the
+    // four resulting vectors at vecOffset + 0, +16, +32 and +48 (in units of V1).
+    // When `flip` is true every output byte is XORed with 0x80 before the store.
+    template <typename V1, typename V2>
+    inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
+        // swiz1 / swiz2: bytes 0-7 / bytes 8-15 of each of the two operands
+        vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+        vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+        // swiz3 / swiz4: 4-byte groups 0 and 2 / 1 and 3 of each of the two operands
+        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
+        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
+        V2 t1, t2, t3, t4, t5, t6, t7, t8;
+        vector unsigned char xor_vector;
+        uint8_t flip_vec = 0x80;
+        xor_vector = vec_splats(flip_vec);
+        // round 1: pair up s1/s2 and s3/s4 by 8-byte halves
+        t1 = vec_perm(s1, s2, swiz1);
+        t2 = vec_perm(s1, s2, swiz2);
+        t3 = vec_perm(s3, s4, swiz1);
+        t4 = vec_perm(s3, s4, swiz2);
+        // round 2: combine the pairs by 4-byte groups, so each output holds
+        // 4 bytes from each of the four sources
+        t5 = vec_perm(t1, t3, swiz3);
+        t6 = vec_perm(t1, t3, swiz4);
+        t7 = vec_perm(t2, t4, swiz3);
+        t8 = vec_perm(t2, t4, swiz4);
+        if (flip == true) {
+            t5 = vec_xor(t5, xor_vector);
+            t6 = vec_xor(t6, xor_vector);
+            t7 = vec_xor(t7, xor_vector);
+            t8 = vec_xor(t8, xor_vector);
+        }
+        vec_xst(t5, 0, vecOffset);
+        vec_xst(t6, 0, vecOffset+16);
+        vec_xst(t7, 0, vecOffset+32);
+        vec_xst(t8, 0, vecOffset+48);
+    }
+
+    // Compile-time dispatch to the micro-kernel for an RM x RN tile. Only the
+    // 8x8, 8x4 and 4x8 shapes have a kernel; any other shape hits the assert
+    // at run time.
+    template<int RM, int RN>
+    inline void kernel(int64_t ii, int64_t jj) {
+        if constexpr(RM == 8 && RN == 8) {
+            KERNEL_8x8(ii,jj);
+        } else if constexpr(RM == 8 && RN == 4) {
+            KERNEL_8x4(ii,jj);
+        } else if constexpr(RM == 4 && RN == 8) {
+            KERNEL_4x8(ii,jj);
+        } else {
+            assert(false && "RN/RM values not supported");
+        }
+    }
+    template<int size>
+    void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray);
+    template<typename VA, typename VB>
+    void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip);
+    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n);
+    void KERNEL_4x8(int64_t ii, int64_t jj);
+    void KERNEL_8x4(int64_t ii, int64_t jj);
+    void KERNEL_8x8(int64_t ii, int64_t jj);
+    void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN);
+    template <int RM, int RN>
+    void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n);
+
+    // Fills vs[0..15] with the products of the per-block scales (the `d` fields,
+    // converted with unhalf) for an 8-row by 8-column tile at block index `blk`:
+    // lane J of vs[I] is scale(A row ii+I) * scale(B row jj+J), and lane J of
+    // vs[I+8] uses B row jj+J+4.
+    void compute_scale(int64_t ii, int64_t jj, int blk, vector float* vs){
+        for (int I = 0; I<8; I++) {
+            const float a_scale = unhalf(A[(ii+I)*lda + blk].d);
+            float * lo = (float*)&vs[I];
+            float * hi = (float*)&vs[I+8];
+            for (int J = 0; J<4; J++) {
+                lo[J] = (a_scale * unhalf(B[(jj+J)*ldb + blk].d));
+                hi[J] = (a_scale * unhalf(B[(jj+J+4)*ldb + blk].d));
+            }
+        }
+    }
+
+    // Sums the 32 signed 8-bit values starting at `qs` (loaded as two 16-byte
+    // vectors) and writes the total to *ca.
+    inline void process_q8_elements(const int8_t *qs, int *ca) {
+        const vector signed int  zero     = {0};
+        const vector signed char first16  = vec_xl(0, qs);
+        const vector signed char second16 = vec_xl(16, qs);
+
+        vector signed int total = vec_add(vec_sum4s(first16, zero), vec_sum4s(second16, zero));
+        *ca = total[0] + total[1] + total[2] + total[3];
+    }
+
+    // Packs block_q8_0 rows into `vec`, eight rows at a time (rows >> 3 groups; rows
+    // beyond the last full group of eight are not processed). For each group and
+    // each of the `kc` blocks (the class member kc), the 32 quant bytes of every
+    // row are loaded, split into two 16-byte halves, interleaved with
+    // vector_permute_store and written as 256 elements of VA.
+    // When `comparray` is non-null it also receives, per row and block, the sum of
+    // that block's quants (see process_q8_elements).
+    // `flip` is forwarded to vector_permute_store. `cols` and the local `i` are not
+    // used in this function.
+    template<typename VA, typename VB>
+    void packNormal_large(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip, int* comparray=nullptr) {
+        int64_t i, j;
+        block_q8_0 *aoffset = NULL;
+        VA *vecOffset = NULL;
+        block_q8_0* aoffsets[8];
+        __vector_pair arr[8];
+        VB c[8][2] = {0};
+        VB c1[8] = {0}; VB c2[8] = {0};
+        aoffset = const_cast<block_q8_0*>(a);
+        vecOffset = vec;
+        j = (rows >> 3);
+        int index = 0;
+        if (j > 0) {
+            do {
+                // row pointers for the current group of eight rows
+                for (int it = 0; it < 8; it++)
+                    aoffsets[it] = aoffset + it*lda;
+                aoffset += 8 * lda;
+                for (int blk = 0; blk < kc; blk++) {
+                    for (int it = 0; it < 8; it++) {
+                        // load the block's quants as a vector pair, then split into halves
+                        arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)(aoffsets[it]+blk)->qs);
+                        __builtin_vsx_disassemble_pair(c[it], &arr[it]);
+                        c1[it] = c[it][0];
+                        c2[it] = c[it][1];
+                        if (comparray){
+                            process_q8_elements((aoffsets[it]+ blk)->qs, &comparray[index + 8*blk + it]);
+                        }
+                    }
+                    // rows 0-3 (first half, second half), then rows 4-7
+                    vector_permute_store<VA, VB>(c1[0], c1[1], c1[2], c1[3], vecOffset, flip);
+                    vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip);
+                    vector_permute_store<VA, VB>(c1[4], c1[5], c1[6], c1[7], vecOffset+128, flip);
+                    vector_permute_store<VA, VB>(c2[4], c2[5], c2[6], c2[7], vecOffset+192, flip);
+                    vecOffset += 256;
+                }
+                j--;
+                // comparray holds 8 entries per block, kc blocks per group
+                index += 8*kc;
+            } while(j > 0);
+        }
+
+    }
+
+    // 4-bit counterpart of packNormal_large: packs rows of TA into `vec`, eight rows
+    // at a time (rows >> 3 groups; leftover rows are not processed). For each group
+    // and each of the `kc` blocks (the class member kc), 16 bytes of packed quants
+    // are loaded per row, expanded by process_q4_elements (which also writes the
+    // per-row sum into comparray), interleaved with vector_permute_store (flip is
+    // always false here) and written as 256 bytes.
+    // `cols` and the local `i` are not used in this function.
+    void packNormalInt4_large(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, int*comparray) {
+        int64_t i, j;
+        TA *aoffset = NULL;
+        int8_t *vecOffset = NULL;
+        TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+        TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+        vector signed char c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
+        vector signed char c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
+        aoffset = const_cast<TA*>(a);
+        vecOffset = vec;
+        int index = 0;
+        j = (rows >> 3);
+        if (j > 0) {
+            do {
+                // row pointers for the current group of eight rows
+                aoffset1 = aoffset;
+                aoffset2 = aoffset1 + lda;
+                aoffset3 = aoffset2 + lda;
+                aoffset4 = aoffset3 + lda;
+                aoffset5 = aoffset4 + lda;
+                aoffset6 = aoffset5 + lda;
+                aoffset7 = aoffset6 + lda;
+                aoffset8 = aoffset7 + lda;
+                aoffset += 8 * lda;
+                for (int blk = 0; blk < kc; blk++) {
+                    // the packed bytes go into slot [1]; process_q4_elements reads
+                    // that slot and fills both [0] and [1]
+                    c1[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset1+blk)->qs));
+                    c2[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset2+blk)->qs));
+                    c3[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset3+blk)->qs));
+                    c4[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset4+blk)->qs));
+                    c5[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset5+blk)->qs));
+                    c6[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset6+blk)->qs));
+                    c7[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset7+blk)->qs));
+                    c8[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset8+blk)->qs));
+
+                    process_q4_elements(c1, &comparray[index + 8*blk+0]);
+                    process_q4_elements(c2, &comparray[index + 8*blk+1]);
+                    process_q4_elements(c3, &comparray[index + 8*blk+2]);
+                    process_q4_elements(c4, &comparray[index + 8*blk+3]);
+                    process_q4_elements(c5, &comparray[index + 8*blk+4]);
+                    process_q4_elements(c6, &comparray[index + 8*blk+5]);
+                    process_q4_elements(c7, &comparray[index + 8*blk+6]);
+                    process_q4_elements(c8, &comparray[index + 8*blk+7]);
+                    // rows 0-3 (low nibbles, high nibbles), then rows 4-7
+                    vector_permute_store<int8_t, vector signed char>(c1[0], c2[0], c3[0], c4[0], vecOffset, false);
+                    vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false);
+                    vector_permute_store<int8_t, vector signed char>(c5[0], c6[0], c7[0], c8[0], vecOffset+128, false);
+                    vector_permute_store<int8_t, vector signed char>(c5[1], c6[1], c7[1], c8[1], vecOffset+192, false);
+                    vecOffset += 256;
+                }
+                j--;
+                // comparray holds 8 entries per block, kc blocks per group
+                index += 8*kc;
+            } while (j > 0);
+        }
+    }
+
+    // Multiplies one packed mc x kc panel of A by one packed kc x nc panel of B and
+    // writes the result into C at tile origin (ii, jj). The tile is processed in
+    // 8x8 sub-tiles; for each sub-tile the k-blocks are consumed two per loop
+    // iteration (kk uses accumulators 0-3, kk+1 uses accumulators 4-7), each block's
+    // integer accumulators being scaled and folded into fin_res by compute().
+    // `l` is the k offset of this panel: when it is 0 the results overwrite C,
+    // otherwise they are added to what C already holds.
+    // NOTE(review): the loop always processes kk and kk+1, so kc is presumably
+    // required to be even — confirm with the caller.
+    void KERNEL_Q0(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, int64_t l, vec_t *vec_A, vec_t *vec_B, int *comparray) {
+        acc_t acc[8];
+        for (int i = 0; i < mc ; i += 8) {
+            for (int j = 0; j < nc; j += 8) {
+                vector float fin_res[16] = {0};
+                vector float vs[16] = {0};
+                for (int64_t kk = 0; kk < kc; kk+=2) {
+                    // all eight accumulators are cleared for every pair of k-blocks
+                    for (int x = 0; x < 8; x++) {
+                        __builtin_mma_xxsetaccz(&acc[x]);
+                    }
+                    // each packed k-block of an 8-row group occupies 16 vec_t entries
+                    int A_block_idx = (i/8)*(16*kc) + kk*16;
+                    int B_block_idx = (j/8)*(16*kc)+ kk*16;
+                    vec_t *A_block = &vec_A[A_block_idx];
+                    vec_t *B_block = &vec_B[B_block_idx];
+                    for (int x = 0; x < 8; x++) {
+                        __builtin_mma_xvi8ger4pp(&acc[0], A_block[x],     B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[1], A_block[x + 8], B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[2], A_block[x],     B_block[x+8]);
+                        __builtin_mma_xvi8ger4pp(&acc[3], A_block[x+8],   B_block[x+8]);
+                    }
+                    compute_scale(ii+i, jj+j, l+kk, vs);
+                    // comparray holds 8 entries per k-block of an 8-row group
+                    int c_index = (i/8)*(8*kc)+ kk*8;
+                    int* c_block = &comparray[c_index];
+                    compute(&acc[0], 0,  0,  c_block, vs, fin_res);
+                    compute(&acc[1], 4,  4,  c_block, vs, fin_res);
+                    compute(&acc[2], 0,  8,  c_block, vs, fin_res);
+                    compute(&acc[3], 4, 12,  c_block, vs, fin_res);
+
+                    // second k-block of the pair (kk + 1), same layout
+                    A_block_idx = (i/8)*(16*kc) + (kk+1)*16;
+                    B_block_idx = (j/8)*(16*kc)+ (kk+1)*16;
+                    A_block = &vec_A[A_block_idx];
+                    B_block = &vec_B[B_block_idx];
+                    for (int x = 0; x < 8; x++) {
+                        __builtin_mma_xvi8ger4pp(&acc[4], A_block[x],     B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[5], A_block[x + 8], B_block[x]);
+                        __builtin_mma_xvi8ger4pp(&acc[6], A_block[x],     B_block[x+8]);
+                        __builtin_mma_xvi8ger4pp(&acc[7], A_block[x+8],   B_block[x+8]);
+                    }
+                    compute_scale(ii+i, jj+j, l+kk+1, vs);
+                    c_index = (i/8)*(8*kc)+ (kk+1)*8;
+                    c_block = &comparray[c_index];
+                    compute(&acc[4], 0,  0,  c_block, vs, fin_res);
+                    compute(&acc[5], 4,  4,  c_block, vs, fin_res);
+                    compute(&acc[6], 0,  8,  c_block, vs, fin_res);
+                    compute(&acc[7], 4, 12,  c_block, vs, fin_res);
+
+                }
+                // first k panel overwrites C, later panels accumulate into it
+                if (l == 0) {
+                    save_res(ii+i,   jj+j,    0,  fin_res);
+                    save_res(ii+i+4, jj+j,    4,  fin_res);
+                    save_res(ii+i,   jj+j+4,  8,  fin_res);
+                    save_res(ii+i+4, jj+j+4, 12,  fin_res);
+                } else {
+                    add_save_res(ii+i,   jj+j,    0,  fin_res);
+                    add_save_res(ii+i+4, jj+j,    4,  fin_res);
+                    add_save_res(ii+i,   jj+j+4,  8,  fin_res);
+                    add_save_res(ii+i+4, jj+j+4, 12,  fin_res);
+                }
+            }
+        }
+    }
+
+    const TA *const A;
+    const block_q8_0 *const B;
+    float *C;
+    const int64_t k;
+    int64_t kc;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
+    const int ith;
+    const int nth;
+};
--- a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp
--- a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.h
@@ -0,0 +1,25 @@
+#pragma once
+#include <stdint.h>
+#include <stdbool.h>
+
+#if defined(__VXE__) || defined(__VXE2__)
+#include <vecintrin.h>
+#endif
+
+#ifdef _MSC_VER
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE __attribute__((__noinline__))
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
+                     const void *, int64_t, const void *, int64_t, void *, int64_t,
+                     int, int, int);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp
--- a/ml/backend/ggml/ggml/src/ggml-cpu/ops.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/ops.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include "ggml.h"
+
+//
+// cache line
+//
+
+// CACHE_LINE_SIZE: taken from the standard library constant when its feature-test
+// macro is defined; otherwise a fixed per-architecture value is used
+// (128 for POWER9, 256 for VXE/VXE2, 64 for everything else).
+#if defined(__cpp_lib_hardware_interference_size)
+#define CACHE_LINE_SIZE std::hardware_destructive_interference_size
+#else
+#if defined(__POWER9_VECTOR__)
+#define CACHE_LINE_SIZE 128
+#elif defined(__VXE__) || defined(__VXE2__)
+#define CACHE_LINE_SIZE 256
+#else
+#define CACHE_LINE_SIZE 64
+#endif
+#endif
+
+// number of float values that fit in one cache line
+static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
+
+// Work buffer size for im2col operations in CONV2D
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_add_id(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cumsum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_repeat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_repeat_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_concat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_out_prod(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_scale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_soft_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_soft_max_ext_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rope(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rope_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_top_k(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_fill(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_flash_attn_back(
+        const struct ggml_compute_params * params,
+        const bool masked,
+        struct ggml_tensor * dst);
+void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_custom(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/quants.c
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/quants.c
--- a/ml/backend/ggml/ggml/src/ggml-cpu/quants.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/quants.h
@@ -0,0 +1,97 @@
+#pragma once
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+
+#include "ggml.h"
+
+// GGML CPU internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Quantization
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+// Dot product
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_m_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+// Generic implementations: `_generic` variants of functions declared above
+void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy,  size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/repack.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/repack.cpp
--- a/ml/backend/ggml/ggml/src/ggml-cpu/repack.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/repack.h
@@ -0,0 +1,126 @@
+#pragma once
+
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
+#include "traits.h"
+#include "ggml.h"
+
+// GGML internal header
+
+ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void);
+
+// Compile-time lookup of the block-size constant for the K-bit `_0` quant
+// formats: QK4_0 for K == 4, QK8_0 for K == 8, and -1 for any other K.
+template <int K> constexpr int QK_0() {
+    return K == 4 ? QK4_0 : (K == 8 ? QK8_0 : -1);
+}
+
+// N blocks of the K-bit `_0` quant format stored together in one struct:
+// first the N per-block deltas, then the quants of all N blocks.
+// The quant array holds QK_0<K>() * N values of K bits each, i.e.
+// (QK_0<K>() * N * K) / 8 bytes.
+template <int K, int N> struct block {
+    ggml_half d[N];                         // deltas for N qK_0 blocks
+    int8_t    qs[(QK_0<K>() * N * K) / 8];  // quants for N qK_0 blocks
+};
+
+// Size checks: each instantiation must be exactly "deltas + quants" bytes,
+// so any compiler-inserted padding is caught at compile time.
+static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
+static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
+static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
+static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
+
+// Convenience names: block_q<K>_0x<N> is block<K, N>.
+using block_q4_0x4 = block<4, 4>;
+using block_q4_0x8 = block<4, 8>;
+using block_q8_0x4 = block<8, 4>;
+using block_q8_0x8 = block<8, 8>;
+
+// Packed storage with eight entries each for d and dmin, plus combined
+// scale and quant arrays (the name suggests 8 q4_K super-blocks; the exact
+// ordering inside `scales` and `qs` is defined by the repack code, not here).
+struct block_q4_Kx8 {
+    ggml_half d[8];      // super-block scale for quantized scales
+    ggml_half dmin[8];   // super-block scale for quantized mins
+    uint8_t scales[96];  // scales and mins, quantized with 6 bits
+    uint8_t qs[1024];    // 4-bit quants
+};
+
+// The struct must be exactly the sum of its members (no padding); this also
+// ties the literal array sizes above to K_SCALE_SIZE and QK_K.
+static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
+// Packed storage with eight entries each for d and dmin, plus combined
+// scale and quant arrays (the name suggests 8 q2_K super-blocks; the exact
+// ordering inside `scales` and `qs` is defined by the repack code, not here).
+struct block_q2_Kx8 {
+    ggml_half d[8];      // super-block scale for quantized scales
+    ggml_half dmin[8];   // super-block scale for quantized mins
+    uint8_t scales[128];  // scales and mins, quantized with 4 bits
+    uint8_t qs[512];    // 2-bit quants
+};
+
+// The struct must be exactly the sum of its members (no padding); this also
+// ties the literal array sizes above to QK_K.
+static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
+// Packed storage with four deltas, QK_K * 4 quants and QK_K / 4 partial sums
+// (the name suggests 4 q8_K blocks; element ordering is defined by the code
+// that fills this struct, not here).
+struct block_q8_Kx4 {
+    float d[4];              // delta
+    int8_t qs[QK_K * 4];     // quants
+    int16_t bsums[QK_K / 4]; // sum of quants in groups of 16
+};
+
+// The struct must be exactly the sum of its members (no padding).
+static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding");
+
+// Four iq4_nl blocks stored together: 4 deltas followed by QK4_NL * 2 bytes
+// of quant data.
+struct block_iq4_nlx4 {
+    ggml_half d[4];            // deltas for 4 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 2];  // nibbles / quants for 4 iq4_nl blocks
+};
+
+// The struct must be exactly the sum of its members (no padding).
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
+// Eight iq4_nl blocks stored together: 8 deltas followed by QK4_NL * 4 bytes
+// of quant data.
+struct block_iq4_nlx8 {
+    ggml_half d[8];            // deltas for 8 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 4];  // nibbles / quants for 8 iq4_nl blocks
+};
+
+// The struct must be exactly the sum of its members (no padding).
+static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+// Generic implementations (`_generic` variants of the functions declared above)
+void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/simd-mappings.h
--- a/ml/backend/ggml/ggml/src/ggml-cpu/traits.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/traits.cpp
@@ -0,0 +1,36 @@
+#include "traits.h"
+
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+
+namespace ggml::cpu {
+// Out-of-line definitions of the virtual destructors declared in traits.h.
+tensor_traits::~tensor_traits() = default;
+
+extra_buffer_type::~extra_buffer_type() = default;
+}  // namespace ggml::cpu
+
+// Offers `op` to every registered extra CPU buffer type and reports whether
+// one of them computed it.
+//
+// Entries that are null or carry no context are skipped. For the others the
+// context is treated as a ggml::cpu::extra_buffer_type; if it provides tensor
+// traits for `op` and their compute_forward() returns true, the search stops
+// and true is returned. Returns false when nobody handled the op.
+bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
+    for (auto buft : ggml_backend_cpu_get_extra_buffer_types()) {
+        if (!buft || !buft->context) {
+            continue;
+        }
+        auto * handler = (ggml::cpu::extra_buffer_type *) buft->context;
+        auto * traits  = handler->get_tensor_traits(op);
+        if (traits && traits->compute_forward(params, op)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Asks every registered extra CPU buffer type for the work size of `op`.
+//
+// Entries that are null or carry no context are skipped. For the others the
+// context is treated as a ggml::cpu::extra_buffer_type; if it provides tensor
+// traits for `op` and their work_size() returns true, the search stops and
+// true is returned. `*size` is passed to work_size() by reference, so `size`
+// must point to a valid size_t. Returns false when no entry answered.
+bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
+    for (auto buft : ggml_backend_cpu_get_extra_buffer_types()) {
+        if (!buft || !buft->context) {
+            continue;
+        }
+        auto * handler = (ggml::cpu::extra_buffer_type *) buft->context;
+        auto * traits  = handler->get_tensor_traits(op);
+        if (traits && traits->work_size(n_threads, op, *size)) {
+            return true;
+        }
+    }
+    return false;
+}
--- a/ml/backend/ggml/ggml/src/ggml-cpu/traits.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/traits.h
@@ -0,0 +1,38 @@
+#pragma once
+#include "ggml-backend-impl.h"
+#include "ggml-cpu-impl.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+#    include <vector>
+extern "C" {
+#endif
+
+// return true if the op is part of an extra "accelerator"
+bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
+bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
+
+#ifdef __cplusplus
+}
+
+namespace ggml::cpu {
+// Per-tensor hooks consulted by ggml_cpu_extra_work_size() and
+// ggml_cpu_extra_compute_forward().
+// register in tensor->extra
+class tensor_traits {
+  public:
+    virtual ~tensor_traits();
+    // Returns true when these traits answer the work-size query for `op`;
+    // `size` is an in/out reference (presumably set to the required work
+    // buffer size -- confirm against the implementations).
+    virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size)        = 0;
+    // Returns true when these traits computed `op`; a false return lets the
+    // caller continue with the next extra buffer type.
+    virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
+};
+
+// Interface reached through the `context` pointer of an extra CPU buffer type.
+class extra_buffer_type {
+  public:
+    virtual ~extra_buffer_type();
+    virtual bool            supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
+    // Traits to use for `op`; callers check the result for null before using it.
+    virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op)                   = 0;
+};
+}  // namespace ggml::cpu
+
+// implemented in ggml-cpu.cpp.
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
+
+#endif
--- a/ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.cpp
@@ -0,0 +1,337 @@
+#include "unary-ops.h"
+
+// Scalar f32 kernels: each maps one input value to one output value.
+
+// Absolute value.
+static inline float op_abs(float x) {
+    return fabsf(x);
+}
+
+// Sign: 1 for x > 0, -1 for x < 0, otherwise 0.
+static inline float op_sgn(float x) {
+    if (x > 0.f) {
+        return 1.f;
+    }
+    if (x < 0.f) {
+        return -1.f;
+    }
+    return 0.f;
+}
+
+// Negation.
+static inline float op_neg(float x) {
+    return -x;
+}
+
+// Step: 1 for x > 0, otherwise 0.
+static inline float op_step(float x) {
+    if (x > 0.f) {
+        return 1.f;
+    }
+    return 0.f;
+}
+
+// Hyperbolic tangent.
+static inline float op_tanh(float x) {
+    return tanhf(x);
+}
+
+// ELU: x for x > 0, otherwise exp(x) - 1.
+static inline float op_elu(float x) {
+    if (x > 0.f) {
+        return x;
+    }
+    return expm1f(x);
+}
+
+// ReLU: x for x > 0, otherwise 0.
+static inline float op_relu(float x) {
+    if (x > 0.f) {
+        return x;
+    }
+    return 0.f;
+}
+
+// Logistic sigmoid: 1 / (1 + exp(-x)).
+static inline float op_sigmoid(float x) {
+    const float denom = 1.f + expf(-x);
+    return 1.f / denom;
+}
+
+// Hard sigmoid: (x + 3) / 6 clamped to [0, 1].
+static inline float op_hardsigmoid(float x) {
+    const float ramp = (x + 3.0f) / 6.0f;
+    return fminf(1.0f, fmaxf(0.0f, ramp));
+}
+
+// Exponential.
+static inline float op_exp(float x) {
+    return expf(x);
+}
+
+// Hard swish: x scaled by the hard sigmoid of x.
+static inline float op_hardswish(float x) {
+    return x * op_hardsigmoid(x);
+}
+
+// Square.
+static inline float op_sqr(float x) {
+    return x * x;
+}
+
+// Square root.
+static inline float op_sqrt(float x) {
+    return sqrtf(x);
+}
+
+// xIELU activation.
+//   x > 0     : alpha_p * x^2 + beta * x
+//   otherwise : (expm1(min(x, eps)) - x) * alpha_n + beta * x
+static inline float op_xielu(float x, float alpha_n, float alpha_p, float beta, float eps) {
+    if (!(x > 0.0f)) {
+        // Non-positive branch (also taken when x is NaN).
+        const float capped = fminf(x, eps);
+        return (expm1f(capped) - x) * alpha_n + beta * x;
+    }
+    return alpha_p * x * x + beta * x;
+}
+
+// Sine of x (single-precision libm call).
+static inline float op_sin(float x) {
+    return sinf(x);
+}
+
+// Cosine of x (single-precision libm call).
+static inline float op_cos(float x) {
+    return cosf(x);
+}
+
+// Natural logarithm of x (single-precision libm call).
+static inline float op_log(float x) {
+    return logf(x);
+}
+
+// Computes exp(x) - 1.
+// Uses expm1f rather than expf(x) - 1.0f: for |x| close to 0 the subtraction
+// cancels (expf(x) rounds to 1.0f) and the result would collapse to 0.
+// This also matches op_elu, which already relies on expm1f.
+static inline float op_expm1(float x) {
+    return expm1f(x);
+}
+
+// Softplus: log(1 + exp(x)).
+// For x > 20 the result equals x to float precision, so x is returned directly.
+// log1pf keeps the result accurate for strongly negative x, where 1.0f + expf(x)
+// would round to 1.0f and logf would return exactly 0.
+static inline float op_softplus(float x) {
+    if (x > 20.0f) {
+        return x;
+    }
+    return log1pf(expf(x));
+}
+
+// floorf(x) (single-precision libm call).
+static inline float op_floor(float x) {
+    return floorf(x);
+}
+
+// ceilf(x) (single-precision libm call).
+static inline float op_ceil(float x) {
+    return ceilf(x);
+}
+
+// roundf(x) (single-precision libm call).
+static inline float op_round(float x) {
+    return roundf(x);
+}
+
+// truncf(x) (single-precision libm call).
+static inline float op_trunc(float x) {
+    return truncf(x);
+}
+
+// Applies the scalar kernel `op` to `n` consecutive elements.
+// Each x[i] is converted to f32 with type_conversion_table<src0_t>::to_f32,
+// passed through `op`, and stored in y[i] after conversion with
+// type_conversion_table<dst_t>::from_f32.
+template <float (*op)(float), typename src0_t, typename dst_t>
+static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    // The counter is 64-bit like `n`; an int counter would overflow before
+    // reaching n when n exceeds INT_MAX.
+    for (int64_t i = 0; i < n; i++) {
+        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+    }
+}
+
+// Runs `op` over the rows of dst->src[0] assigned by get_thread_range(),
+// writing the results to the matching rows of dst.
+// Requires src0 and dst to have the same shape, to satisfy
+// ggml_is_contiguous_1, and to have element strides equal to sizeof(src0_t)
+// and sizeof(dst_t) respectively.
+template <float (*op)(float), typename src0_t, typename dst_t>
+static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [row_begin, row_end] = get_thread_range(params, src0);
+
+    // Number of rows in one slice along dimension 3.
+    const int64_t rows_per_i03 = ne02*ne01;
+
+    for (int64_t row = row_begin; row < row_end; ++row) {
+        // Split the flat row number into (i03, i02, i01) coordinates.
+        const int64_t i03  = row/rows_per_i03;
+        const int64_t rest = row - i03*rows_per_i03;
+        const int64_t i02  = rest/ne01;
+        const int64_t i01  = rest - i02*ne01;
+
+        const char * src_row = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
+        char       * dst_row = (char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1;
+
+        vec_unary_op<op>(ne0, (dst_t *) dst_row, (const src0_t *) src_row);
+    }
+}
+
+// Picks the apply_unary_op instantiation for the (src0, dst) type pair.
+// Handled pairs: f32->f32, f16->f16, bf16->bf16, bf16->f32 and f16->f32.
+// Any other pair is reported on stderr and aborts.
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float)>
+static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    const auto src_type = src0->type;
+    const auto dst_type = dst->type;
+
+    if (src_type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) { // all f32
+        apply_unary_op<op, float, float>(params, dst);
+        return;
+    }
+    if (src_type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) { // all f16
+        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+        return;
+    }
+    if (src_type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+        return;
+    }
+    if (src_type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+        return;
+    }
+    if (src_type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+        return;
+    }
+
+    fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+        ggml_type_name(dst_type), ggml_type_name(src_type));
+    GGML_ABORT("fatal error");
+}
+
+// Type dispatcher for kernels of the form float(float, ggml_tensor *).
+// It mirrors unary_op: the same five (src0, dst) type pairs are handled and
+// any other pair is reported on stderr and aborts.
+// NOTE(review): nothing in this file instantiates this template, and the
+// apply_unary_op it forwards to takes a `float (*)(float)` kernel, so an
+// instantiation would not compile as written. Confirm whether it is still
+// needed (the functor-based dispatcher covers parameterized ops).
+template <float (*op)(float, ggml_tensor *)>
+static void unary_op_params(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op<op, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+    } else {
+        // Unsupported combination: report both types, then abort.
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+// Functor counterpart of vec_unary_op: applies the callable `op` to `n`
+// consecutive elements. Each x[i] is converted to f32 with
+// type_conversion_table<src0_t>::to_f32, passed through `op`, and stored in
+// y[i] after conversion with type_conversion_table<dst_t>::from_f32.
+template <typename Op, typename src0_t, typename dst_t>
+static inline void vec_unary_op_functor(int64_t n, dst_t * y, const src0_t * x, Op op) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    // The counter is 64-bit like `n`; an int counter would overflow before
+    // reaching n when n exceeds INT_MAX.
+    for (int64_t i = 0; i < n; i++) {
+        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+    }
+}
+
+// Functor counterpart of apply_unary_op: runs the callable `op` over the rows
+// of dst->src[0] assigned by get_thread_range(), writing the results to the
+// matching rows of dst. The shape, contiguity and element-stride requirements
+// are asserted below.
+template <typename Op, typename src0_t, typename dst_t>
+static void apply_unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [row_begin, row_end] = get_thread_range(params, src0);
+
+    // Number of rows in one slice along dimension 3.
+    const int64_t rows_per_i03 = ne02*ne01;
+
+    for (int64_t row = row_begin; row < row_end; ++row) {
+        // Split the flat row number into (i03, i02, i01) coordinates.
+        const int64_t i03  = row/rows_per_i03;
+        const int64_t rest = row - i03*rows_per_i03;
+        const int64_t i02  = rest/ne01;
+        const int64_t i01  = rest - i02*ne01;
+
+        const char * src_row = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
+        char       * dst_row = (char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1;
+
+        vec_unary_op_functor(ne0, (dst_t *) dst_row, (const src0_t *) src_row, op);
+    }
+}
+
+// Type dispatcher for functor kernels: picks the apply_unary_op_functor
+// instantiation for the (src0, dst) type pair.
+// Handled pairs: f32->f32, f16->f16, bf16->bf16, bf16->f32 and f16->f32.
+// Any other pair is reported on stderr and aborts.
+template <typename Op>
+static void unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    const auto src_type = src0->type;
+    const auto dst_type = dst->type;
+
+    if (src_type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) { // all f32
+        apply_unary_op_functor<Op, float, float>(params, dst, op);
+        return;
+    }
+    if (src_type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) { // all f16
+        apply_unary_op_functor<Op, ggml_fp16_t, ggml_fp16_t>(params, dst, op);
+        return;
+    }
+    if (src_type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op_functor<Op, ggml_bf16_t, ggml_bf16_t>(params, dst, op);
+        return;
+    }
+    if (src_type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_F32) {
+        apply_unary_op_functor<Op, ggml_bf16_t, float>(params, dst, op);
+        return;
+    }
+    if (src_type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F32) {
+        apply_unary_op_functor<Op, ggml_fp16_t, float>(params, dst, op);
+        return;
+    }
+
+    fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+        ggml_type_name(dst_type), ggml_type_name(src_type));
+    GGML_ABORT("fatal error");
+}
+
+// Public entry points for the element-wise unary ops.
+// Each one forwards to unary_op<> with the matching scalar kernel, which
+// applies that kernel to dst->src[0] and writes the result into dst.
+
+// dst = op_abs(src0)
+void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_abs>(params, dst);
+}
+
+// dst = op_sgn(src0)
+void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sgn>(params, dst);
+}
+
+// dst = op_neg(src0)
+void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_neg>(params, dst);
+}
+
+// dst = op_step(src0)
+void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_step>(params, dst);
+}
+
+// dst = op_tanh(src0)
+void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_tanh>(params, dst);
+}
+
+// dst = op_elu(src0)
+void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_elu>(params, dst);
+}
+
+// dst = op_relu(src0)
+void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_relu>(params, dst);
+}
+
+// dst = op_sigmoid(src0)
+void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sigmoid>(params, dst);
+}
+
+// dst = op_hardsigmoid(src0)
+void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_hardsigmoid>(params, dst);
+}
+
+// dst = op_exp(src0)
+void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_exp>(params, dst);
+}
+
+// dst = op_hardswish(src0)
+void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_hardswish>(params, dst);
+}
+
+// dst = op_sqr(src0)
+void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sqr>(params, dst);
+}
+
+// dst = op_sqrt(src0)
+void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sqrt>(params, dst);
+}
+
+// dst = op_sin(src0)
+void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sin>(params, dst);
+}
+
+// dst = op_cos(src0)
+void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_cos>(params, dst);
+}
+
+// dst = op_log(src0)
+void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_log>(params, dst);
+}
+
+// dst = op_expm1(src0)
+void ggml_compute_forward_expm1(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_expm1>(params, dst);
+}
+
+// dst = op_softplus(src0)
+void ggml_compute_forward_softplus(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_softplus>(params, dst);
+}
+
+// dst = op_floor(src0)
+void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_floor>(params, dst);
+}
+
+// dst = op_ceil(src0)
+void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_ceil>(params, dst);
+}
+
+// dst = op_round(src0)
+void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_round>(params, dst);
+}
+
+// dst = op_trunc(src0)
+void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_trunc>(params, dst);
+}
+
+// Entry point for the xIELU op.
+// Reads alpha_n, alpha_p, beta and eps from dst's op-parameter slots 1..4
+// (slot 0 is not read here), binds them into a callable, and dispatches it
+// through unary_op_functor.
+void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
+    const float neg_alpha = ggml_get_op_params_f32(dst, 1);
+    const float pos_alpha = ggml_get_op_params_f32(dst, 2);
+    const float lin_beta  = ggml_get_op_params_f32(dst, 3);
+    const float epsilon   = ggml_get_op_params_f32(dst, 4);
+
+    unary_op_functor(params, dst, [=](float v) {
+        return op_xielu(v, neg_alpha, pos_alpha, lin_beta, epsilon);
+    });
+}
+
--- a/ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/unary-ops.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" { // declarations below get C linkage when compiled as C++
+#endif
+// One forward-compute entry point per unary op; each takes the compute params and the destination tensor.
+void ggml_compute_forward_abs(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sgn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_neg(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_step(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_tanh(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_elu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_hardsigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_exp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_hardswish(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sqr(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_expm1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_softplus(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_trunc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
--- a/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/vec.cpp
@@ -0,0 +1,572 @@
+#include "vec.h"
+
+#include <cassert>
+
+// precomputed gelu table for f16: 2^16 entries (128 KB, assuming a 2-byte ggml_fp16_t); filled elsewhere
+ggml_fp16_t ggml_table_gelu_f16[1 << 16];
+
+// precomputed quick gelu table for f16: 2^16 entries (128 KB, assuming a 2-byte ggml_fp16_t); filled elsewhere
+ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
+
+void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
+   assert(nrc == 1);
+   GGML_UNUSED(nrc);
+   GGML_UNUSED(bx);
+   GGML_UNUSED(by);
+   GGML_UNUSED(bs);
+   // f32 dot product: stores sum of x[i]*y[i] for i in [0, n) into *s. bs/bx/by are ignored and nrc must be 1.
+#if defined(GGML_SIMD)
+    float sumf = 0.0f;
+
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
+        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
+        const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
+
+        const int np = (n & ~(ggml_f32_step - 1));
+        svfloat32_t sum1 = svdup_n_f32(0.0f);
+        svfloat32_t sum2 = svdup_n_f32(0.0f);
+        svfloat32_t sum3 = svdup_n_f32(0.0f);
+        svfloat32_t sum4 = svdup_n_f32(0.0f);
+        svfloat32_t sum5 = svdup_n_f32(0.0f);
+        svfloat32_t sum6 = svdup_n_f32(0.0f);
+        svfloat32_t sum7 = svdup_n_f32(0.0f);
+        svfloat32_t sum8 = svdup_n_f32(0.0f);
+        svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
+        svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
+        for (int i = 0; i < np; i += ggml_f32_step) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
+
+            ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
+            ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
+            sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
+
+            ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
+            ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
+            sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
+
+            ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
+            ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
+            sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
+
+            ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
+            ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
+            sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
+
+            ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
+            ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
+            sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
+
+            ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
+            ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
+            sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
+
+            ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
+            ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
+            sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
+        }
+        // leftovers
+        // After the 8x-unrolled loop fewer than ggml_f32_step elements remain; consume them one full vector at a time
+        const int np2 = (n & ~(ggml_f32_epr - 1));
+        for (int i = np; i < np2; i += ggml_f32_epr) {
+            ax1 = GGML_F32_VEC_LOAD(x + i);
+            ay1 = GGML_F32_VEC_LOAD(y + i);
+            sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
+        }
+        // fewer than ggml_f32_epr elements are left now. Apply predicated svmad on the available elements only
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b32(np2, n);
+            ax1 = svld1_f32(pg, x + np2);
+            ay1 = svld1_f32(pg, y + np2);
+            sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
+        }
+        // fold the eight accumulators sum1..sum8 into the scalar sumf
+        GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+    #elif defined(__riscv_v_intrinsic)
+        int vl = __riscv_vsetvlmax_e32m8();
+        vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+        vfloat32m8_t vsum;
+        vfloat32m8_t ax;
+        vfloat32m8_t ay;
+        vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
+        for (int i = 0; i < n; i += vl) {
+            vl = __riscv_vsetvl_e32m8(n - i); // vector length for this pass, requested for the n - i remaining elements
+            ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
+            ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
+            vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
+        }
+        vl = __riscv_vsetvlmax_e32m8(); // reduce over the full register group, not just the last pass's vl
+        vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
+        sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
+
+        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+
+        GGML_F32_VEC ax[GGML_F32_ARR];
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+                sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+            }
+        }
+
+        // reduce the GGML_F32_ARR accumulators in sum[] into the scalar sumf
+        GGML_F32_VEC_REDUCE(sumf, sum);
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            sumf += x[i]*y[i];
+        }
+    #endif
+#else
+    // scalar
+    ggml_float sumf = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float)(x[i]*y[i]);
+    }
+#endif
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) { // bf16 dot product; result stored to *s
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+    int i = 0; // shared cursor: the compiled-in SIMD loop advances it and the scalar loop at the end finishes the rest
+    ggml_float sumf = 0;
+
+#if defined(__AVX512BF16__)
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 64 <= n; i += 64) { // 2 x 32 bf16 values per iteration
+        c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
+                             m512bh(_mm512_loadu_si512((y + i))));
+        c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
+                             m512bh(_mm512_loadu_si512((y + i + 32))));
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#elif defined(__AVX512F__) // LOAD widens 16 bf16 values to f32: zero-extend to 32 bits, then shift left by 16
+#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 32 <= n; i += 32) { // 2 x 16 bf16 values per iteration
+        c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#undef LOAD
+#elif defined(__AVX2__) || defined(__AVX__) // LOAD widens 8 bf16 values to f32; the AVX-only variant builds the two 128-bit halves separately
+#if defined(__AVX2__)
+#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+#else
+#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
+#endif
+    __m256 c1 = _mm256_setzero_ps();
+    __m256 c2 = _mm256_setzero_ps();
+    __m256 c3 = _mm256_setzero_ps();
+    __m256 c4 = _mm256_setzero_ps();
+    for (; i + 32 <= n; i += 32) { // 4 x 8 bf16 values per iteration
+        c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
+        c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
+        c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
+    }
+    __m128 g; // horizontal sum: 4 accumulators -> 1, 256 -> 128 bits, then pairwise down to lane 0
+    c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
+                       _mm256_add_ps(c2, c4));
+    g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
+                   _mm256_castps256_ps128(c1));
+    g = _mm_add_ps(g, _mm_movehl_ps(g, g));
+    g = _mm_add_ss(g, _mm_movehdup_ps(g));
+    sumf += (ggml_float)_mm_cvtss_f32(g);
+
+#undef LOAD
+#endif
+    // scalar loop: handles the tail, or every element when no SIMD branch above was compiled in
+    for (; i < n; ++i) {
+        sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
+                             GGML_BF16_TO_FP32(y[i]));
+    }
+    *s = sumf;
+}
+
+void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+    // fp16 dot product: accumulates x[i]*y[i] for i in [0, n) into sumf and stores it to *s.
+    ggml_float sumf = 0.0;
+
+
+#if defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = svcntb() * 8; // SVE vector length in bits (svcntb() counts bytes)
+        const int ggml_f16_epr = sve_register_length / 16; // fp16 elements per SVE register
+        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
+
+        const int np= (n & ~(ggml_f16_step - 1));
+        svfloat16_t sum1 = svdup_n_f16(0.0f);
+        svfloat16_t sum2 = svdup_n_f16(0.0f);
+        svfloat16_t sum3 = svdup_n_f16(0.0f);
+        svfloat16_t sum4 = svdup_n_f16(0.0f);
+
+        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f16_step) {
+            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+            sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
+
+            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+            sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
+
+            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+            sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
+
+            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+            sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
+
+            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+            sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5); // second half of the unroll reuses sum1..sum4
+
+            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+            sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
+
+            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+            sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
+
+            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+            sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
+        }
+
+        const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to a multiple of ggml_f16_epr
+        for (int k = np; k < np2; k += ggml_f16_epr) {
+            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+            sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
+        }
+
+        if (np2 < n) { // fewer than ggml_f16_epr elements left: predicated load + multiply-add
+            svbool_t pg = svwhilelt_b16(np2, n);
+            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+
+            sum1 = svmad_f16_x(pg, hx, hy, sum1);
+        }
+        GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
+    #elif defined(__riscv_v_intrinsic)
+        #if defined(__riscv_zvfh)
+            int vl = __riscv_vsetvlmax_e32m2();
+            vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+            vfloat32m2_t vsum;
+            vfloat16m1_t ax;
+            vfloat16m1_t ay;
+            vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
+            for (int i = 0; i < n; i += vl) {
+                vl = __riscv_vsetvl_e16m1(n - i);
+                ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
+                ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
+                vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl); // widening multiply-accumulate: f16 inputs, f32 accumulator
+            }
+            vl = __riscv_vsetvlmax_e32m1();
+            vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
+            vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
+            sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
+        #else
+            for (int i = 0; i < n; ++i) {
+                sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+            }
+        #endif // __riscv_zvfh
+    #else
+        const int np = (n & ~(GGML_F16_STEP - 1));
+
+        GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
+
+        GGML_F16_VEC ax[GGML_F16_ARR];
+        GGML_F16_VEC ay[GGML_F16_ARR];
+
+        for (int i = 0; i < np; i += GGML_F16_STEP) {
+            for (int j = 0; j < GGML_F16_ARR; j++) {
+                ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+                sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+            }
+        }
+
+        // reduce the GGML_F16_ARR accumulators in sum[] into the scalar sumf
+        GGML_F16_VEC_REDUCE(sumf, sum);
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+        // if you hit this, you are likely running outside the FP range
+        assert(!isnan(sumf) && !isinf(sumf));
+    #endif
+#else
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+    }
+#endif // GGML_SIMD
+
+    *s = sumf;
+}
+
+void ggml_vec_silu_f32(const int n, float * y, const float * x) { // y[i] = silu(x[i]) for i in [0, n)
+    int i = 0; // shared cursor: the compiled-in vector loop advances it, the scalar loop below finishes
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) { // predicated: the final partial vector is handled here, leaving no scalar tail
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
+    }
+#elif defined(__riscv_v_intrinsic)
+    for (int vl; i < n; i += vl) { // runs until i reaches n, leaving no scalar tail
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat32m2_t vy = ggml_v_silu_m2(vx, vl);
+        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+    }
+#endif
+    for (; i < n; ++i) { // scalar tail, or the whole range when no vector branch was compiled in
+        y[i] = ggml_silu_f32(x[i]);
+    }
+}
+
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) { // y[i] = silu(x[i]) * g[i] for i in [0, n)
+    int i = 0; // shared cursor: the compiled-in vector loop advances it, the scalar loop below finishes
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
+    }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) { // predicated: the final partial vector is handled here, leaving no scalar tail
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
+    }
+#elif defined(__riscv_v_intrinsic)
+    for (int vl; i < n; i += vl) { // runs until i reaches n, leaving no scalar tail
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
+        vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
+        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+    }
+#endif
+    for (; i < n; ++i) { // scalar tail, or the whole range when no vector branch was compiled in
+        y[i] = ggml_silu_f32(x[i]) * g[i];
+    }
+}
+
+ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) { // y[i] = x[i] - mean; returns sum(y[i]^2) / n
+    int i = 0;
+    ggml_float sum = 0;
+// TODO: optimize to process the remaining elements in groups using the smaller vector sizes from AVX2 and SSE
+// ref: https://github.com/ggml-org/llama.cpp/pull/15953#pullrequestreview-3310928344
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                   _mm512_set1_ps(mean));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                   _mm256_set1_ps(mean));
+        _mm256_storeu_ps(y + i, val);
+        val = _mm256_mul_ps(val,val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val)); // horizontal sum: 8 lanes -> 4 -> 2 -> 1
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = _mm_sub_ps(_mm_loadu_ps(x + i),
+                                _mm_set1_ps(mean));
+        _mm_storeu_ps(y + i, val);
+        val = _mm_mul_ps(val, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); // horizontal sum without _mm_movehdup_ps
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif  // __AVX__ || __AVX2__ || __AVX512F__
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = vsubq_f32(vld1q_f32(x + i),
+                                    vdupq_n_f32(mean));
+        vst1q_f32(y + i, val);
+        val = vmulq_f32(val, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
+#elif defined(__VXE__) || defined(__VXE2__)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = vec_sub(vec_xl(0, x + i), vec_splats(mean));
+        vec_xst(val, 0, y + i);
+        val = vec_mul(val, val);
+        sum += (ggml_float)vec_hsum_f32x4(val);
+    }
+#elif defined(__riscv_v_intrinsic)
+    vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
+    for (int vl; i < n; i += vl) { // runs until i reaches n, so the scalar loop below does nothing
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t val = __riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], vl), mean, vl);
+        __riscv_vse32_v_f32m2(&y[i], val, vl);
+        val = __riscv_vfmul_vv_f32m2(val, val, vl);
+        vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, vl); // widening reduction: f32 squares summed into an f64 scalar
+    }
+    sum = (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
+#endif
+    for (; i < n; ++i) { // scalar tail, or the whole range when no vector branch was compiled in
+        float val = x[i] - mean;
+        y[i] = val;
+        val *= val;
+        sum += (ggml_float)val;
+    }
+    return sum/n; // NOTE(review): no guard for n == 0; presumably callers always pass n > 0
+}
+
+ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { // y[i] = exp(x[i] - max); returns the sum of all y[i]
+    int i = 0;
+    ggml_float sum = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                               _mm512_set1_ps(max)));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(val);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                               _mm256_set1_ps(max)));
+        _mm256_storeu_ps(y + i, val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val)); // horizontal sum: 8 lanes -> 4 -> 2 -> 1
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+                                            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); // horizontal sum without _mm_movehdup_ps
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) { // predicated: the final partial vector is handled here, leaving no scalar tail
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
+                                                svdup_n_f32_x(pg, max)));
+        svst1_f32(pg, y + i, val);
+        sum += (ggml_float)svaddv_f32(pg, val);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+                                                vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
+#elif defined(__riscv_v_intrinsic)
+    vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
+    for (int avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t val = ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
+        __riscv_vse32_v_f32m2(&y[i], val, avl);
+        vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
+    }
+    return (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum); // early return: the loop above ran to n, the code below is unreachable in this branch
+#endif
+    for (; i < n; ++i) { // scalar tail, or the whole range when no vector branch was compiled in
+        float val = expf(x[i] - max);
+        sum += (ggml_float)val;
+        y[i] = val;
+    }
+    return sum;
+}
+
+ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
+    // log(soft_max_i) = log(exp(x_i - max) / sum_exp) = (x_i - max) - log(sum_exp), where sum_exp = sum_j exp(x_j - max)
+    // Stores the shifted logits (x_i - max) in y and returns log(sum_exp); y is NOT yet reduced by that term.
+    ggml_float sum = 0;
+    for (int i = 0; i < n; ++i) {
+        const float val = x[i] - max;
+        y[i] = val;
+        sum += (ggml_float)expf(val);
+    }
+    // logf (single precision) is kept on purpose so the returned value is bit-identical to the previous code
+    return (ggml_float)logf(sum);
+}
--- a/ml/backend/ggml/ggml/src/ggml-cpu/vec.h
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/vec.h
--- a/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt
@@ -0,0 +1,191 @@
+cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
+
+find_package(CUDAToolkit)
+
+if (CUDAToolkit_FOUND)
+    message(STATUS "CUDA Toolkit found")
+
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        # native == GPUs available at build time
+        # 50     == Maxwell, lowest CUDA 12 standard
+        # 60     == P100, FP16 CUDA intrinsics
+        # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
+        # 70     == V100, FP16 tensor cores
+        # 75     == Turing, int8 tensor cores
+        # 80     == Ampere, asynchronous data loading, faster tensor core instructions
+        # 86     == RTX 3000, needs CUDA v11.1
+        # 89     == RTX 4000, needs CUDA v11.8
+        #
+        # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
+        # XX-real    == compile CUDA code as device code for this specific architecture
+        # no suffix  == compile as both PTX and device code
+        #
+        # The default behavior for a non-native build is to build virtual architectures as needed to cover all
+        #     features needed for best performance, and to also build real architectures for the most commonly used GPUs.
+        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
+            set(CMAKE_CUDA_ARCHITECTURES "native")
+        else()
+            if (CUDAToolkit_VERSION VERSION_LESS "13")
+                list(APPEND CMAKE_CUDA_ARCHITECTURES 50-virtual 61-virtual 70-virtual)
+            endif ()
+
+            list(APPEND CMAKE_CUDA_ARCHITECTURES 75-virtual 80-virtual 86-real)
+
+            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
+                list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
+            endif()
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
+    enable_language(CUDA)
+    # Collect headers and sources; the fattn-tile/fattn-mma/mmq/mmf template instances are always added.
+    file(GLOB   GGML_HEADERS_CUDA "*.cuh")
+    list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
+
+    file(GLOB   GGML_SOURCES_CUDA "*.cu")
+    file(GLOB   SRCS "template-instances/fattn-tile*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB   SRCS "template-instances/fattn-mma*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB   SRCS "template-instances/mmq*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB   SRCS "template-instances/mmf*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
+
+    if (GGML_CUDA_FA_ALL_QUANTS)
+        file(GLOB   SRCS "template-instances/fattn-vec*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+    else()
+        file(GLOB   SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB   SRCS "template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    endif()
+
+    ggml_add_backend_library(ggml-cuda
+                             ${GGML_HEADERS_CUDA}
+                             ${GGML_SOURCES_CUDA}
+                            )
+
+    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
+
+    if (GGML_CUDA_GRAPHS)
+        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
+    endif()
+
+    if (GGML_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
+    if (GGML_CUDA_FORCE_CUBLAS)
+        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        add_compile_definitions(GGML_CUDA_NO_VMM)
+    endif()
+
+    if (NOT GGML_CUDA_FA)
+        add_compile_definitions(GGML_CUDA_NO_FA)
+    endif()
+
+    if (GGML_CUDA_NO_PEER_COPY)
+        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+    endif()
+
+    if (GGML_STATIC)
+        if (WIN32)
+            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
+            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas)
+        else ()
+            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.1")
+                target_link_libraries(ggml-cuda PRIVATE  CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            else()
+                target_link_libraries(ggml-cuda PRIVATE  CUDA::cudart_static CUDA::cublas_static)
+            endif()
+        endif()
+    else()
+        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas)
+    endif()
+
+    if (GGML_CUDA_NO_VMM)
+        # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+    else()
+        target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
+    endif()
+    # CUDA_CXX_FLAGS collects host-compiler flags (joined and passed via -Xcompiler below); CUDA_FLAGS collects nvcc flags.
+    set(CUDA_CXX_FLAGS "")
+
+    set(CUDA_FLAGS -use_fast_math -extended-lambda)
+
+    if (GGML_CUDA_DEBUG)
+        list(APPEND CUDA_FLAGS -lineinfo)
+        add_compile_definitions(GGML_CUDA_DEBUG)
+    endif()
+
+    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
+        # Options are:
+        # - none (not recommended)
+        # - speed (nvcc's default)
+        # - balance
+        # - size
+        list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
+    endif()
+
+    if (GGML_FATAL_WARNINGS)
+        list(APPEND CUDA_FLAGS -Werror all-warnings)
+    endif()
+
+    if (GGML_ALL_WARNINGS AND NOT MSVC)
+        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)  # NOTE(review): ".c" is presumably a dummy input so nvcc forwards the query to the host compiler
+        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
+            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
+        endif()
+
+        execute_process(
+            COMMAND ${NVCC_CMD} -Xcompiler --version
+            OUTPUT_VARIABLE CUDA_CCFULLVER
+            ERROR_QUIET
+        )
+
+        if (NOT CUDA_CCFULLVER MATCHES clang)
+            set(CUDA_CCID "GNU")
+            execute_process(
+                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
+                OUTPUT_VARIABLE CUDA_CCVER
+                ERROR_QUIET
+                OUTPUT_STRIP_TRAILING_WHITESPACE
+            )
+        else()
+            if (CUDA_CCFULLVER MATCHES Apple)
+                set(CUDA_CCID "AppleClang")
+            else()
+                set(CUDA_CCID "Clang")
+            endif()
+            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
+        endif()
+
+        message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
+
+        ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
+        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
+    endif()
+
+    if (NOT MSVC)
+        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
+    endif()
+
+    list(JOIN   CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
+
+    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
+        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
+    endif()
+
+    target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")  # applied to CUDA sources only
+else()
+    message(FATAL_ERROR "CUDA Toolkit not found")
+endif()
--- a/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu
@@ -0,0 +1,61 @@
+#include "acc.cuh"
+
+// Accumulate kernel: every thread handles one element i of dst.
+// dst[i] = x[i], plus y[...] when (i - offset), unravelled with the strides
+// s13/s12/s11, lands inside the ne10 x ne11 x ne12 x ne13 extents of y.
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
+    const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < ne) {
+        // position relative to the start of the accumulated window (may be negative)
+        const int64_t rel = i - offset;
+
+        // peel off one coordinate per stride, outermost first
+        const int64_t i13  = rel / s13;
+        const int64_t rem3 = rel - i13 * s13;
+        const int64_t i12  = rem3 / s12;
+        const int64_t rem2 = rem3 - i12 * s12;
+        const int64_t i11  = rem2 / s11;
+        const int64_t i10  = rem2 - i11 * s11;
+
+        const bool inside = rel >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13;
+
+        // y is indexed as a dense ne10 x ne11 x ne12 x ne13 array
+        dst[i] = inside ? x[i] + y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10] : x[i];
+    }
+}
+
+// Host launcher for acc_f32: one thread per dst element, CUDA_ACC_BLOCK_SIZE threads per block.
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
+    // number of blocks needed to cover n_elements, rounded up
+    const int grid_x = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+
+    acc_f32<<<grid_x, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(
+        x, y, dst, n_elements,
+        ne10, ne11, ne12, ne13,
+        s1, s2, s3, offset);
+}
+
+// Backend entry point for the accumulate op: validates the operands attached to
+// dst, decodes the strides/offset stored in dst->op_params and launches the kernel
+// on the context's stream.
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    // only f32 operands are handled
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    // layout requirements of the kernel
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
+    GGML_ASSERT(ggml_is_contiguously_allocated(dst));
+
+    // op_params[0..3] are divided by sizeof(float) to obtain values in float elements
+    const int64_t stride1     = dst->op_params[0] / sizeof(float);
+    const int64_t stride2     = dst->op_params[1] / sizeof(float);
+    const int64_t stride3     = dst->op_params[2] / sizeof(float);
+    const int64_t elem_offset = dst->op_params[3] / sizeof(float);
+
+    acc_f32_cuda(
+        (const float *) src0->data, (const float *) src1->data, (float *) dst->data,
+        ggml_nelements(dst),
+        src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+        stride1, stride2, stride3, elem_offset,
+        ctx.stream());
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/acc.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_ACC_BLOCK_SIZE 256
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/add-id.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/add-id.cu
@@ -0,0 +1,58 @@
+#include "add-id.cuh"
+
+// Adds a selected row of src1 to a row of src0. The block at (blockIdx.x, blockIdx.y)
+// reads one int32 from src2 and uses it as the src1 row index; the threads of the
+// block then cover the ne0 floats of that row.
+// dst is addressed as a dense float array (row length ne0, ne1 rows per plane);
+// src0/src1/src2 are addressed through the byte strides nb01/nb02, nb11 and nb21.
+static __global__ void add_id_kernel(
+        const float * src0, const float * src1, const int32_t * src2, float * dst,
+        int64_t ne0, int64_t ne1,
+        size_t nb01, size_t nb02,
+        size_t nb11,
+        size_t nb21
+    ) {
+
+    const int64_t i1 = blockIdx.x;
+    const int64_t i2 = blockIdx.y;
+
+    // byte strides of dst, derived from its extents
+    const size_t nb1 = ne0 * sizeof(float);
+    const size_t nb2 = ne1 * nb1;
+
+    // row selector for this (i1, i2): int32 entries packed along i1, nb21 bytes apart along i2
+    const char * id_ptr = (const char *) src2 + i1*sizeof(int32_t) + i2*nb21;
+    const int    i11    = *(const int32_t *) id_ptr;
+
+    const float * a_row   = (const float *)((const char *)src0 +  i1*nb01 + i2*nb02);
+    const float * b_row   = (const float *)((const char *)src1 + i11*nb11);
+    float       * out_row = (float *)((char *)dst + i1*nb1 + i2*nb2);
+
+    // threads of the block stride across the row
+    int64_t i0 = threadIdx.x;
+    while (i0 < ne0) {
+        out_row[i0] = a_row[i0] + b_row[i0];
+        i0 += blockDim.x;
+    }
+}
+
+// Backend entry point for add-id: checks operand types and innermost strides,
+// then launches add_id_kernel with one block per (ne01, ne02) row of src0.
+void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * src2 = dst->src[2];
+
+    GGML_TENSOR_TERNARY_OP_LOCALS
+
+    // f32 data, i32 row ids
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(src2->type == GGML_TYPE_I32);
+
+    // innermost dimension of every input must be densely packed
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+    GGML_ASSERT(nb20 == sizeof(int32_t));
+
+    const int  n_threads = std::min((int)ne00, 768); // cols
+    const dim3 grid(ne01, ne02); // n_experts_used, n_tokens
+
+    add_id_kernel<<<grid, n_threads, 0, ctx.stream()>>>(
+        (const float *)src0->data, (const float *)src1->data, (const int32_t *)src2->data, (float *)dst->data,
+        ne0, ne1,
+        nb01, nb02,
+        nb11,
+        nb21
+    );
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/add-id.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/add-id.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/arange.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/arange.cu
@@ -0,0 +1,34 @@
+#include "arange.cuh"
+
+// Writes dst[k] = start + step * k for k in [0, ne0); one thread per element.
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // global element index of this thread
+    const int k = threadIdx.x + blockIdx.x * blockDim.x;
+    if (k < ne0) {
+        dst[k] = start + step * k;
+    }
+}
+
+// Host launcher for arange_f32 using CUDA_ARANGE_BLOCK_SIZE threads per block.
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    // enough blocks to cover ne0 elements, rounded up
+    const int grid_x = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+
+    arange_f32<<<grid_x, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
+}
+
+// Backend entry point for arange: reads start/stop/step from the first three
+// float slots of dst->op_params, checks that dst has exactly
+// ceil((stop - start) / step) elements and fills it on the context's stream.
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    // op_params is copied bytewise into floats rather than dereferenced through a cast pointer
+    float range[3];
+    memcpy(range, dst->op_params, sizeof(range));
+
+    const float start = range[0];
+    const float stop  = range[1];
+    const float step  = range[2];
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda((float *)dst->data, dst->ne[0], start, step, ctx.stream());
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/arange.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/arange.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_ARANGE_BLOCK_SIZE 256
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cu
@@ -0,0 +1,91 @@
+#include <algorithm>
+#include <cstdint>
+
+#include "argmax.cuh"
+#include "common.cuh"
+#include "sum.cuh"
+
+// Per-row argmax. Block blockIdx.x processes row blockIdx.x of x (ncols floats per
+// row, rows stored back to back) and thread 0 of the block writes the winning
+// column index to dst[row].
+// The reduction runs in up to three stages: a per-thread strided scan, a
+// shuffle-based reduction inside each warp, and (when the block has more than
+// one warp) a second shuffle reduction in warp 0 over per-warp results staged in
+// shared memory.
+// NOTE(review): the shuffle loops start at offset 16, which presumably relies on
+// WARP_SIZE being 32 — confirm against common.cuh.
+static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __restrict__ dst, const int64_t ncols) {
+    const int64_t row = blockIdx.x;
+
+    // running best of this thread; argmax stays -1 if the thread never sees a value > -FLT_MAX
+    float maxval = -FLT_MAX;
+    int   argmax = -1;
+    const float * rowx = x + row * ncols;
+
+    // stage 1: each thread visits columns threadIdx.x, threadIdx.x + blockDim.x, ...
+    // the strict '>' means an equal value seen later does not replace the current best
+    for (int32_t col = threadIdx.x; col < ncols; col += blockDim.x) {
+        const float val = rowx[col];
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
+        }
+    }
+
+    // stage 2: exchange (value, index) with the lane at lane_id ^ offset and keep the larger value
+#pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+        const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+        if (val > maxval) {
+            maxval = val;
+            argmax = col;
+        }
+    }
+
+    const int n_warps = blockDim.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    if (n_warps > 1) {
+        // stage 3: combine the per-warp results; the arrays are sized for a 1024-thread block
+        constexpr int    max_warps = 1024 / WARP_SIZE;
+        __shared__ float shared_maxval[max_warps];
+        __shared__ int   shared_argmax[max_warps];
+        // lane 0 of every warp publishes its warp's candidate
+        if (lane_id == 0) {
+            shared_maxval[warp_id] = maxval;
+            shared_argmax[warp_id] = argmax;
+        }
+
+        // make the published candidates visible to warp 0
+        __syncthreads();
+
+        if (warp_id == 0) {
+            // lanes below n_warps pick up one warp's candidate each;
+            // the remaining lanes keep the candidate they already hold
+            if (lane_id < n_warps) {
+                maxval = shared_maxval[lane_id];
+                argmax = shared_argmax[lane_id];
+            }
+#pragma unroll
+            for (int offset = 16; offset > 0; offset >>= 1) {
+                const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, offset, WARP_SIZE);
+                const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, offset, WARP_SIZE);
+                if (val > maxval) {
+                    maxval = val;
+                    argmax = col;
+                }
+            }
+        }
+    }
+
+    // a single thread per block stores the row's result
+    if (warp_id == 0 && lane_id == 0) {
+        dst[row] = argmax;
+    }
+}
+
+// Backend entry point for argmax over the rows of a contiguous f32 tensor.
+// Launches one block per row; the result for each row is an int32 written to dst.
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ne00 = src0->ne[0];
+
+    // block width: row length rounded up to a multiple of WARP_SIZE, capped at 1024 threads
+    const int64_t ne00_rounded = (ne00 + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
+    const dim3 block_shape(std::min<int64_t>(1024, ne00_rounded), 1, 1);
+
+    // grid: one block for each row of src0
+    const dim3 grid_shape(ggml_nrows(src0), 1, 1);
+
+    argmax_f32<<<grid_shape, block_shape, 0, ctx.stream()>>>(
+        (const float *) src0->data, (int32_t *) dst->data, ne00);
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu
@@ -0,0 +1,298 @@
+#include "argsort.cuh"
+
+#ifdef GGML_CUDA_USE_CUB
+#    include <cub/cub.cuh>
+using namespace cub;
+#endif  // GGML_CUDA_USE_CUB
+
+// Fills an nrows x ncols index matrix so that every row reads 0, 1, ..., ncols - 1.
+// Grid x covers the columns, grid y selects the row.
+static __global__ void init_indices(int * indices, const int ncols, const int nrows) {
+    const int c = blockIdx.x * blockDim.x + threadIdx.x;
+    const int r = blockIdx.y;
+
+    // threads past the end of the matrix have nothing to write
+    if (c >= ncols || r >= nrows) {
+        return;
+    }
+
+    indices[r * ncols + c] = c;
+}
+
+// Writes offsets[s] = s * ncols for every launched thread index s in [0, nrows],
+// i.e. up to nrows + 1 entries (the start of each row plus the end of the last one).
+static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
+    const int s = blockIdx.x * blockDim.x + threadIdx.x;
+    if (s > nrows) {
+        return;
+    }
+    offsets[s] = s * ncols;
+}
+
+#ifdef GGML_CUDA_USE_CUB
+// Row-wise argsort of f32 data using CUB's segmented radix sort.
+//
+//   pool   - allocator for the temporary device buffers used here
+//   x      - nrows rows of ncols floats, stored back to back
+//   dst    - receives ncols indices per row, in sorted order of that row's values
+//   order  - GGML_SORT_ORDER_ASC sorts ascending, anything else sorts descending
+//   stream - stream on which all kernels and copies are enqueued
+//
+// Each row is one segment; segment i spans [d_offsets[i], d_offsets[i + 1]), so
+// nrows + 1 offsets are required.
+static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
+                                     const float *    x,
+                                     int *            dst,
+                                     const int        ncols,
+                                     const int        nrows,
+                                     ggml_sort_order  order,
+                                     cudaStream_t     stream) {
+    // number of segment boundaries: one start per row plus the end of the last row
+    const int n_offsets = nrows + 1;
+
+    ggml_cuda_pool_alloc<int>   temp_indices_alloc(pool, ncols * nrows);
+    ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
+    ggml_cuda_pool_alloc<int>   offsets_alloc(pool, n_offsets);
+
+    int *   temp_indices = temp_indices_alloc.get();
+    float * temp_keys    = temp_keys_alloc.get();
+    int *   d_offsets    = offsets_alloc.get();
+
+    static const int block_size = 256;
+    const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
+    init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);
+
+    // The grid must cover all n_offsets entries. Sizing it from nrows alone leaves
+    // d_offsets[nrows] unwritten whenever nrows is a multiple of block_size, and
+    // that entry is the end offset of the last segment.
+    const dim3 offset_grid((n_offsets + block_size - 1) / block_size);
+    init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);
+
+    // the sort runs in place on a copy of the keys so that x itself is left untouched
+    CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+
+    size_t temp_storage_bytes = 0;
+
+    // first call with a null workspace: only reports the required size in temp_storage_bytes
+    if (order == GGML_SORT_ORDER_ASC) {
+        DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys,  // keys (in-place)
+                                            temp_indices, dst,                                  // values (indices)
+                                            ncols * nrows, nrows,                            // num items, num segments
+                                            d_offsets, d_offsets + 1, 0, sizeof(float) * 8,  // all bits
+                                            stream);
+    } else {
+        DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
+                                                      dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, 0,
+                                                      sizeof(float) * 8, stream);
+    }
+
+    ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
+    void *                        d_temp_storage = temp_storage_alloc.get();
+
+    // second call with the workspace: performs the sort, writing the permuted indices to dst
+    if (order == GGML_SORT_ORDER_ASC) {
+        DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
+                                            ncols * nrows, nrows, d_offsets, d_offsets + 1, 0, sizeof(float) * 8,
+                                            stream);
+    } else {
+        DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
+                                                      temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
+                                                      0, sizeof(float) * 8, stream);
+    }
+}
+#endif  // GGML_CUDA_USE_CUB
+
+// Bitonic sort implementation
+// Exchanges the values of a and b through a temporary copy.
+template<typename T>
+static inline __device__ void ggml_cuda_swap(T & a, T & b) {
+    T held = a;
+    a      = b;
+    b      = held;
+}
+
+// Bitonic argsort of one f32 row per block.
+//   x         - input rows of ncols floats, row blockIdx.x starts at x + row * ncols
+//   dst       - output indices, ncols ints per row
+//   ncols     - real row length
+//   ncols_pad - padded row length used for the sorting network (the host launcher
+//               in this file passes the next power of two >= ncols)
+// Thread threadIdx.x owns slot `col` of a permutation kept in dynamic shared memory
+// (ncols_pad ints). Slots start out as the identity, so values >= ncols mark
+// padding entries that have no element in x.
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.x;
+
+    // NOTE(review): threads leaving here would skip the __syncthreads() calls below;
+    // the launcher in this file uses exactly ncols_pad threads, so none should take this path.
+    if (col >= ncols_pad) {
+        return;
+    }
+
+    const float * x_row = x + row * ncols;
+    extern __shared__ int dst_row[];
+
+    // initialize indices
+    dst_row[col] = col;
+
+    __syncthreads();
+
+    // k: size of the sequences being merged, j: distance between compared slots
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            // only the lower slot of each pair does the compare-exchange
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    // swap when slot col holds padding, or when both slots hold real
+                    // indices and their values are out of order for `order`
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    // mirrored rule: swap when slot ixj holds padding, or when both
+                    // slots hold real indices and the comparison is the reverse of the one above
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            // all exchanges of this step must finish before the next step reads the slots
+            __syncthreads();
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+// Returns the smallest power of two that is >= x (1 for any x <= 1).
+static int next_power_of_2(int x) {
+    int pow2 = 1;
+    for (; pow2 < x; pow2 *= 2) {
+        // keep doubling until x is reached
+    }
+    return pow2;
+}
+
+// Host launcher for the shared-memory bitonic argsort of f32 rows.
+// Launches one block per row with one thread per padded column and
+// ncols_pad ints of dynamic shared memory; aborts on an unknown sort order.
+static void argsort_f32_i32_cuda_bitonic(const float *   x,
+                                         int *           dst,
+                                         const int       ncols,
+                                         const int       nrows,
+                                         ggml_sort_order order,
+                                         cudaStream_t    stream) {
+    // bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
+    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
+
+    const dim3 grid(nrows, 1, 1);
+    const dim3 block(ncols_pad, 1, 1);
+
+    switch (order) {
+        case GGML_SORT_ORDER_ASC:
+            k_argsort_f32_i32<GGML_SORT_ORDER_ASC>
+                <<<grid, block, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+            break;
+        case GGML_SORT_ORDER_DESC:
+            k_argsort_f32_i32<GGML_SORT_ORDER_DESC>
+                <<<grid, block, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+
+// Bitonic argsort of one int32 row per block (row = blockIdx.y).
+//   x         - input rows of ncols int32 values, stored back to back
+//   dst       - output indices, ncols ints per row
+//   ncols     - real row length
+//   ncols_pad - padded length of the sorting network; the launcher in this file
+//               passes the next power of two >= ncols
+// The permutation lives in dynamic shared memory (ncols_pad ints). The block may
+// have fewer threads than ncols_pad, so every phase is a strided loop over slots.
+//
+// Padding: slots start as the identity permutation, so an index >= ncols marks a
+// padding entry with no value in x. The compare-exchange treats padding as
+// belonging at the upper end of an "ascending" sub-sequence and at the lower end
+// of a "descending" one. Simply skipping pairs that touch padding slots is not
+// enough: it removes exchanges the network needs and leaves rows unsorted when
+// ncols is not a power of two (e.g. {2, 3, 1} ascending came out as {1, 3, 2}).
+template<ggml_sort_order order>
+static __global__ void k_argsort_i32_i32(const int32_t * x, int * dst, const int ncols, const int ncols_pad) {
+    extern __shared__ int shared_mem[];
+    int * indices = shared_mem;
+
+    const int tid = threadIdx.x;
+    const int row = blockIdx.y;
+
+    const int32_t * x_row = x + row * ncols;
+
+    // identity permutation over the padded range; entries >= ncols act as padding markers
+    for (int i = tid; i < ncols_pad; i += blockDim.x) {
+        indices[i] = i;
+    }
+    __syncthreads();
+
+    // k: size of the sequences being merged, j: distance between compared slots
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k/2; j > 0; j /= 2) {
+            for (int i = tid; i < ncols_pad; i += blockDim.x) {
+                const int ij = i ^ j;
+                // each pair is handled once, by its lower slot
+                if (ij <= i) {
+                    continue;
+                }
+
+                const int a = indices[i];
+                const int b = indices[ij];
+
+                // x_row is only read for real indices (< ncols)
+                bool do_swap;
+                if ((i & k) == 0) {
+                    do_swap = a >= ncols ||
+                        (b < ncols && (order == GGML_SORT_ORDER_ASC ? x_row[a] > x_row[b] : x_row[a] < x_row[b]));
+                } else {
+                    do_swap = b >= ncols ||
+                        (a < ncols && (order == GGML_SORT_ORDER_ASC ? x_row[a] < x_row[b] : x_row[a] > x_row[b]));
+                }
+
+                if (do_swap) {
+                    indices[i]  = b;
+                    indices[ij] = a;
+                }
+            }
+            // all exchanges of this step must be visible before the next step starts
+            __syncthreads();
+        }
+    }
+
+    // after the final merge the real indices occupy the first ncols slots
+    for (int i = tid; i < ncols; i += blockDim.x) {
+        dst[row * ncols + i] = indices[i];
+    }
+}
+
+// Host launcher for the int32 bitonic argsort: one block per row (grid y = nrows),
+// at most 1024 threads per block and ncols_pad ints of dynamic shared memory.
+// Aborts on an unknown sort order.
+static void argsort_i32_i32_cuda(const int32_t * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // the sorting network works on a power-of-two length
+    const int ncols_pad = next_power_of_2(ncols);
+
+    // cap the block size; the kernel loops when it has fewer threads than slots
+    const int max_threads = 1024;
+    int threads_per_block = ncols_pad;
+    if (threads_per_block > max_threads) {
+        threads_per_block = max_threads;
+    }
+
+    const size_t shared_mem     = ncols_pad * sizeof(int);
+    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+    // the whole padded permutation has to fit into the per-block shared memory limit
+    GGML_ASSERT(shared_mem <= max_shared_mem && "argsort: required shared memory exceeds device limit");
+
+    const dim3 grid(1, nrows, 1);
+    const dim3 block(threads_per_block, 1, 1);
+
+    switch (order) {
+        case GGML_SORT_ORDER_ASC:
+            k_argsort_i32_i32<GGML_SORT_ORDER_ASC><<<grid, block, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+            break;
+        case GGML_SORT_ORDER_DESC:
+            k_argsort_i32_i32<GGML_SORT_ORDER_DESC><<<grid, block, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+            break;
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+
+// Backend entry point for argsort along the first dimension of a contiguous
+// f32 or i32 tensor; the sort order is read from dst->op_params[0].
+// i32 input always takes the int32 bitonic kernel. f32 input takes the
+// shared-memory bitonic kernel, except that builds with GGML_CUDA_USE_CUB switch
+// to the CUB implementation when the padded row would not fit into shared
+// memory or the row has more than 1024 columns.
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    int * out = (int *) dst->data;
+
+    if (src0->type == GGML_TYPE_I32) {
+        argsort_i32_i32_cuda((const int32_t *) src0->data, out, ncols, nrows, order, stream);
+        return;
+    }
+
+    const float * keys = (const float *) src0->data;
+
+#ifdef GGML_CUDA_USE_CUB
+    const int    ncols_pad      = next_power_of_2(ncols);
+    const size_t shared_mem     = ncols_pad * sizeof(int);
+    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
+
+    const bool needs_cub = shared_mem > max_shared_mem || ncols > 1024;
+    if (needs_cub) {
+        ggml_cuda_pool & pool = ctx.pool();
+        argsort_f32_i32_cuda_cub(pool, keys, out, ncols, nrows, order, stream);
+        return;
+    }
+#endif
+
+    argsort_f32_i32_cuda_bitonic(keys, out, ncols, nrows, order, stream);
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cu
@@ -0,0 +1,502 @@
+#include "binbcast.cuh"
+#include <cstdint>
+#include <utility>
+
+// Element-wise binary operators on floats. Each matches the
+// float (*)(const float, const float) signature that the broadcast kernels in
+// this file take as their `bin_op` template argument; `a` is the value read
+// from src0 (or the running result) and `b` the value read from src1.
+
+// Repeat: ignores a and passes b through unchanged.
+static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+    return b;
+    // placed after the return; presumably only there to mark `a` as used
+    GGML_UNUSED(a);
+}
+
+// Addition: a + b.
+static __device__ __forceinline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+// Subtraction: a - b.
+static __device__ __forceinline__ float op_sub(const float a, const float b) {
+    return a - b;
+}
+
+// Multiplication: a * b.
+static __device__ __forceinline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+// Division: a / b (plain float division, no check for b == 0).
+static __device__ __forceinline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+// Broadcasting binary-op kernel using a 3D launch grid.
+//
+// Template parameters:
+//   bin_op                 - the float operator applied per element
+//   src0_t, src1_t, dst_t  - element types; values are converted to float for the
+//                            operation and the result is cast to dst_t
+//   src1_ptrs...           - optional pack of extra src1 pointers; when non-empty,
+//                            bin_op is applied once per pointer in the pack and the
+//                            plain `src1` argument is not read
+//
+// Arguments:
+//   ne0..ne2               - dst extents of dimensions 0..2
+//   ne3, ne10..ne13        - dst extent of dimension 3 and src1 extents, passed as
+//                            uint3 for the fastdiv/fastmodulo helpers; the .z member
+//                            is used below as the plain extent
+//   s1..s3, s01..s03, s11..s13 - element strides of dst, src0 and src1 for
+//                            dimensions 1..3 (dimension 0 is indexed directly)
+//
+// NOTE(review): fastdiv/fastmodulo are defined outside this file; presumably they
+// compute n / d and n % d from precomputed values — confirm in common.cuh.
+template <float (*bin_op)(const float, const float),
+          typename src0_t,
+          typename src1_t,
+          typename dst_t,
+          typename... src1_ptrs>
+static __global__ void k_bin_bcast(const src0_t *         src0,
+                                   const src1_t *         src1,
+                                   dst_t *                dst,
+                                   const int              ne0,
+                                   const int              ne1,
+                                   const int              ne2,
+                                   const uint3            ne3,
+                                   const uint3            ne10,
+                                   const uint3            ne11,
+                                   const uint3            ne12,
+                                   const uint3            ne13,
+                                   /*int s0, */ const int s1,
+                                   const int              s2,
+                                   const int              s3,
+                                   /*int s00,*/ const int s01,
+                                   const int              s02,
+                                   const int              s03,
+                                   /*int s10,*/ const int s11,
+                                   const int              s12,
+                                   const int              s13,
+                                   src1_ptrs... src1s) {
+    // x covers dimension 0, y covers dimension 1, z carries dimensions 2 and 3 combined
+    const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
+    const uint32_t i1  = (blockDim.y * blockIdx.y + threadIdx.y);
+    // split the z index by ne3: i2 from fastdiv, i3 is what remains after removing i2 * ne3
+    const uint32_t i2  = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
+    const uint32_t i3  = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
+
+    if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
+        return;
+    }
+
+    // broadcast: dst coordinates are wrapped into the src1 extents
+    const uint32_t i11 = fastmodulo(i1, ne11);
+    const uint32_t i12 = fastmodulo(i2, ne12);
+    const uint32_t i13 = fastmodulo(i3, ne13);
+
+    // element offsets of the row start in each tensor
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    // src0 may be null, in which case 0.0f is used as the left operand below
+    const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
+    dst_t * dst_row = dst + i_dst;
+
+    // each thread walks dimension 0 in steps of the total number of threads along x
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
+        const uint32_t i10 = fastmodulo(i0, ne10);
+
+        float result = src0_row ? (float) src0_row[i0] : 0.0f;
+        if constexpr (sizeof...(src1_ptrs) > 0) {
+            // fold over the pack: result = bin_op(result, p[...]) for every pointer p in src1s
+            result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10])));
+        } else {
+            result = bin_op(result, (float)src1[i_src1 + i10]);
+        }
+
+        dst_row[i0] = (dst_t) result;
+    }
+}
+
+// Broadcasting binary-op kernel using a 1D launch: each thread takes one flat
+// element index and unravels it into the four dst coordinates itself.
+//
+// Template parameters are the same as for k_bin_bcast: bin_op is the float
+// operator, src0_t/src1_t/dst_t are the element types, and a non-empty
+// src1_ptrs pack replaces the plain `src1` argument with several src1 pointers
+// that are folded in one after another.
+//
+// Arguments:
+//   ne0..ne2, ne10..ne13   - dst extents of dimensions 0..2 and src1 extents, as
+//                            uint3 for fastdiv/fastmodulo (.z used as the plain value)
+//   ne3                    - dst extent of dimension 3
+//   prod_012, prod_01      - uint3 divisors used to peel off i3 and i2 from the flat
+//                            index; presumably ne0*ne1*ne2 and ne0*ne1 — confirm at the caller
+//   s1..s3, s01..s03, s11..s13 - element strides of dst, src0 and src1 for dimensions 1..3
+//
+// NOTE(review): fastdiv/fastmodulo are defined outside this file; presumably they
+// compute n / d and n % d from precomputed values — confirm in common.cuh.
+template <float (*bin_op)(const float, const float),
+          typename src0_t,
+          typename src1_t,
+          typename dst_t,
+          typename... src1_ptrs>
+static __global__ void k_bin_bcast_unravel(const src0_t *         src0,
+                                           const src1_t *         src1,
+                                           dst_t *                dst,
+                                           const uint3            ne0,
+                                           const uint3            ne1,
+                                           const uint3            ne2,
+                                           const uint32_t         ne3,
+                                           const uint3            prod_012,
+                                           const uint3            prod_01,
+                                           const uint3            ne10,
+                                           const uint3            ne11,
+                                           const uint3            ne12,
+                                           const uint3            ne13,
+                                           /*int s0, */ const int s1,
+                                           const int              s2,
+                                           const int              s3,
+                                           /*int s00,*/ const int s01,
+                                           const int              s02,
+                                           const int              s03,
+                                           /*int s10,*/ const int s11,
+                                           const int              s12,
+                                           const int              s13,
+                                           src1_ptrs... src1s) {
+    // flat element index of this thread
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    // unravel outermost first, subtracting what has already been accounted for
+    const uint32_t i3 = fastdiv(i, prod_012);
+    const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
+    const uint32_t i1 = fastdiv(i - i3 * prod_012.z - i2 * prod_01.z, ne0);
+    const uint32_t i0 = i - i3 * prod_012.z - i2 * prod_01.z - i1 * ne0.z;
+
+    // threads whose coordinates fall outside dst do nothing
+    if (i0 >= ne0.z || i1 >= ne1.z || i2 >= ne2.z || i3 >= ne3) {
+        return;
+    }
+
+    // broadcast: dst coordinates are wrapped into the src1 extents
+    const int i11 = fastmodulo(i1, ne11);
+    const int i12 = fastmodulo(i2, ne12);
+    const int i13 = fastmodulo(i3, ne13);
+
+    // element offsets of the row start in each tensor
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    // src0 may be null, in which case 0.0f is used as the left operand below
+    const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = fastmodulo(i0, ne10);
+
+    float result = src0_row ? (float) src0_row[i0] : 0.0f;
+    if constexpr (sizeof...(src1_ptrs) > 0) {
+        // fold over the pack: result = bin_op(result, p[...]) for every pointer p in src1s
+        result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10])));
+    } else {
+        result = bin_op(result, (float)src1[i_src1 + i10]);
+    }
+
+    dst_row[i0] = (dst_t) result;
+}
+
+template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t, size_t... I>
+static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                  const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+                                  cudaStream_t stream, std::index_sequence<I...>) {
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    int nr0 = ne10 / ne0;
+    int nr1 = ne11 / ne1;
+    int nr2 = ne12 / ne2;
+    int nr3 = ne13 / ne3;
+
+    int nr[4] = { nr0, nr1, nr2, nr3 };
+
+    int64_t cne[]  = { ne0, ne1, ne2, ne3 };
+    int64_t cne0[] = { ne00, ne01, ne02, ne03 };
+    int64_t cne1[] = { ne10, ne11, ne12, ne13 };
+
+    size_t cnb[]  = { nb0, nb1, nb2, nb3 };
+    size_t cnb0[] = { nb00, nb01, nb02, nb03 };
+    size_t cnb1[] = { nb10, nb11, nb12, nb13 };
+
+    auto collapse = [](int64_t cne[]) {
+        cne[0] *= cne[1];
+        cne[1] = cne[2];
+        cne[2] = cne[3];
+        cne[3] = 1;
+    };
+
+    auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
+        cnb[1] *= cne[1];
+        cnb[2] *= cne[2];
+        cnb[3] *= cne[3];
+    };
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb, cne);
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+    }
+
+    {
+        int64_t ne0 = cne[0];
+        int64_t ne1 = cne[1];
+        int64_t ne2 = cne[2];
+        int64_t ne3 = cne[3];
+
+        //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
+        //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
+        //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
+        //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
+
+        size_t nb0 = cnb[0];
+        size_t nb1 = cnb[1];
+        size_t nb2 = cnb[2];
+        size_t nb3 = cnb[3];
+
+        size_t nb00 = cnb0[0];
+        size_t nb01 = cnb0[1];
+        size_t nb02 = cnb0[2];
+        size_t nb03 = cnb0[3];
+
+        size_t nb10 = cnb1[0];
+        size_t nb11 = cnb1[1];
+        size_t nb12 = cnb1[2];
+        size_t nb13 = cnb1[3];
+
+        size_t s0 = nb0 / sizeof(dst_t);
+        size_t s1 = nb1 / sizeof(dst_t);
+        size_t s2 = nb2 / sizeof(dst_t);
+        size_t s3 = nb3 / sizeof(dst_t);
+
+        size_t s10 = nb10 / sizeof(src1_t);
+        size_t s11 = nb11 / sizeof(src1_t);
+        size_t s12 = nb12 / sizeof(src1_t);
+        size_t s13 = nb13 / sizeof(src1_t);
+
+        size_t s00 = nb00 / sizeof(src0_t);
+        size_t s01 = nb01 / sizeof(src0_t);
+        size_t s02 = nb02 / sizeof(src0_t);
+        size_t s03 = nb03 / sizeof(src0_t);
+
+        GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+        GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+        GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+        GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+
+        GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+        GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+        GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+        GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
+
+        GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+        GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+        GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+        GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
+
+        GGML_ASSERT(s0 == 1);
+        GGML_ASSERT(s00 == 1);
+        GGML_ASSERT(s10 == 1);
+
+        const int block_size = 128;
+
+        int64_t hne0 = std::max(ne0 / 2LL, 1LL);
+
+        dim3 block_dims;
+        block_dims.x = std::min<unsigned int>(hne0, block_size);
+        block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+        block_dims.z = std::min(std::min<unsigned int>(ne2 * ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+        dim3 block_nums((hne0 + block_dims.x - 1) / block_dims.x, (ne1 + block_dims.y - 1) / block_dims.y,
+                        (ne2 * ne3 + block_dims.z - 1) / block_dims.z);
+
+        const uint3 ne10 = init_fastdiv_values((uint32_t) cne1[0]);
+        const uint3 ne11 = init_fastdiv_values((uint32_t) cne1[1]);
+        const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
+        const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
+
+        if (block_nums.z > 65535 || block_nums.y > 65535) {
+            int         block_num  = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
+            const uint3 prod_012    = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
+            const uint3 prod_01     = init_fastdiv_values((uint32_t) (ne0 * ne1));
+            const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
+            const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1);
+            const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2);
+
+            if constexpr (sizeof...(I) > 0) {
+                k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t><<<block_num, block_size, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11,
+                    ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00,*/ s01, s02, s03,
+                    /* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
+            } else {
+                k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t>
+                    <<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv,
+                                                           ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13,
+                                                           /* s0, */ s1, s2, s3,
+                                                           /* s00,*/ s01, s02, s03,
+                                                           /* s10,*/ s11, s12, s13);
+            }
+        } else {
+            const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
+            if constexpr (sizeof...(I) > 0) {
+                k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00,*/ s01, s02, s03,
+                    /* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
+            } else {
+                k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00,*/ s01, s02, s03,
+                    /* s10,*/ s11, s12, s13);
+            }
+        }
+    }
+}
+
+// Backward pass of repeat: each dst element receives the sum of all src elements whose
+// index, in every dimension, equals the dst index plus a multiple of the dst extent.
+//
+//   src         input, addressed through the element strides s00..s03
+//   dst         output, written as a dense ne0 x ne1 x ne2 x ne3 array
+//   ne00..ne03  src extents (upper bounds of the accumulation loops)
+//   ne0..ne3    dst extents (step of the accumulation loops)
+//
+// Thread mapping: x -> dim 0, y -> dim 1, z -> dims 2 and 3 flattened together.
+// Only the dim 0 index is bounds-checked inside the kernel.
+template <typename T>
+static __global__ void k_repeat_back(
+    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
+
+    const int64_t col = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
+    if (col >= ne0) {
+        return;
+    }
+
+    const int64_t row   = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
+    const int64_t plane = int64_t(blockIdx.z)*blockDim.z + threadIdx.z;
+
+    // split the flattened z index back into its dim 2 / dim 3 components
+    const int64_t d2 = plane % ne2;
+    const int64_t d3 = plane / ne2;
+
+    T acc = 0;
+    for (int64_t j3 = d3; j3 < ne03; j3 += ne3) {
+        const size_t off3 = j3*s03;
+        for (int64_t j2 = d2; j2 < ne02; j2 += ne2) {
+            const size_t off2 = off3 + j2*s02;
+            for (int64_t j1 = row; j1 < ne01; j1 += ne1) {
+                const size_t off1 = off2 + j1*s01;
+                for (int64_t j0 = col; j0 < ne00; j0 += ne0) {
+                    acc += src[off1 + j0*s00];
+                }
+            }
+        }
+    }
+
+    // dst is indexed as a dense row-major-in-dim-0 array
+    dst[((d3*ne2 + d2)*ne1 + row)*ne0 + col] = acc;
+}
+
+// Functor that forwards a broadcasted binary operation to launch_bin_bcast_pack.
+//   bin_op  element-wise operation, passed on as a template argument
+//   n_fuse  length of the index sequence handed to the launcher (defaults to 1)
+template <float (*bin_op)(const float, const float), int n_fuse = 1>
+struct bin_bcast_cuda {
+    template<typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+            cudaStream_t stream) {
+        using fuse_indices = std::make_index_sequence<n_fuse>;
+
+        launch_bin_bcast_pack<bin_op, src0_t, src1_t, dst_t>(
+            src0, src1, dst,
+            src0_dd, src1_dd, dst_dd,
+            stream, fuse_indices{});
+    }
+};
+
+// Host-side launcher for k_repeat_back.
+// Blocks are WARP_SIZE x 1 x 1 threads; the grid is
+//   x: ceil(ne0 / WARP_SIZE), y: ne1, z: ne2*ne3
+// and the kernel is enqueued on `stream`. All other arguments are forwarded unchanged.
+template <typename T>
+static void repeat_back_cuda(
+    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+    const size_t s00, const size_t s01, const size_t s02, const size_t s03,
+    const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+
+    const int64_t blocks_x = (ne0 + WARP_SIZE - 1) / WARP_SIZE;
+
+    const dim3 threads(WARP_SIZE, 1, 1);
+    const dim3 grid(blocks_x, ne1, ne2*ne3);
+
+    k_repeat_back<T><<<grid, threads, 0, stream>>>(
+        src, dst,
+        ne00, ne01, ne02, ne03,
+        s00, s01, s02, s03,
+        ne0, ne1, ne2, ne3);
+}
+
+// Type dispatcher for broadcasted binary ops: selects the element types from the ggml
+// types of src0/src1/dst and invokes the functor `op` with correspondingly typed pointers.
+//
+// src1 must be F32 or F16 (asserted). Handled (src0, src1, dst) combinations:
+//   F32, *,   F32   - src1 data is passed as float
+//   F16, F16, F16
+//   F16, F32, F16
+//   F16, *,   F32   - src1 data is passed as float
+// Anything else prints the three type names to stderr and aborts.
+template<class op>
+static void ggml_cuda_op_bin_bcast(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
+
+    const auto t0 = src0->type;
+    const auto t1 = src1->type;
+    const auto td = dst->type;
+
+    if (t0 == GGML_TYPE_F32) {
+        if (td == GGML_TYPE_F32) {
+            op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+            return;
+        }
+    } else if (t0 == GGML_TYPE_F16) {
+        if (td == GGML_TYPE_F16) {
+            if (t1 == GGML_TYPE_F16) {
+                op()(src0, src1, dst, (const half *) src0_dd, (const half *)src1_dd, (half *) dst_dd, stream);
+                return;
+            }
+            if (t1 == GGML_TYPE_F32) {
+                op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
+                return;
+            }
+        } else if (td == GGML_TYPE_F32) {
+            op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+            return;
+        }
+    }
+
+    // no supported combination matched
+    fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+        ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+    GGML_ABORT("fatal error");
+}
+
+// REPEAT: runs the broadcast machinery with op_repeat and a fuse count of 0.
+// dst itself is passed in the src0 slot (with a null data pointer) and dst->src[0]
+// in the src1 slot.
+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    using repeat_op = bin_bcast_cuda<op_repeat, 0>;
+
+    const ggml_tensor * src = dst->src[0];
+    ggml_cuda_op_bin_bcast<repeat_op>(dst, src, dst, nullptr, src->data, dst->data, ctx.stream());
+}
+
+// ADD: applies op_add to dst->src[0] and dst->src[1] with broadcasting, writing into dst.
+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * lhs = dst->src[0];
+    const ggml_tensor * rhs = dst->src[1];
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(lhs, rhs, dst, lhs->data, rhs->data, dst->data, ctx.stream());
+}
+
+// SUB: applies op_sub to dst->src[0] and dst->src[1] with broadcasting, writing into dst.
+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * lhs = dst->src[0];
+    const ggml_tensor * rhs = dst->src[1];
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(lhs, rhs, dst, lhs->data, rhs->data, dst->data, ctx.stream());
+}
+
+// MUL: applies op_mul to dst->src[0] and dst->src[1] with broadcasting, writing into dst.
+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * lhs = dst->src[0];
+    const ggml_tensor * rhs = dst->src[1];
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(lhs, rhs, dst, lhs->data, rhs->data, dst->data, ctx.stream());
+}
+
+// DIV: applies op_div to dst->src[0] and dst->src[1] with broadcasting, writing into dst.
+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * lhs = dst->src[0];
+    const ggml_tensor * rhs = dst->src[1];
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(lhs, rhs, dst, lhs->data, rhs->data, dst->data, ctx.stream());
+}
+
+// Fused variant of the broadcast dispatcher: launches launch_bin_bcast_pack directly with
+// an index sequence of length n_fuse, using dst->src[0] / dst->src[1] as the first two
+// operands.
+//
+// Handled (src0, src1, dst) type combinations:
+//   F32, *,   F32   - src1 data is passed as float
+//   F16, F16, F16
+//   F16, F32, F16
+//   F16, *,   F32   - src1 data is passed as float
+// Anything else prints the three type names to stderr and aborts.
+template <float (*op)(const float, const float), int n_fuse>
+static void ggml_cuda_op_fused_binbcast_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    using fuse_indices = std::make_index_sequence<n_fuse>;
+
+    cudaStream_t stream = ctx.stream();
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    const auto t0 = src0->type;
+    const auto t1 = src1->type;
+    const auto td = dst->type;
+
+    if (t0 == GGML_TYPE_F32) {
+        if (td == GGML_TYPE_F32) {
+            launch_bin_bcast_pack<op, float, float, float>(src0, src1, dst,
+                (const float *) src0->data, (const float *) src1->data, (float *) dst->data,
+                stream, fuse_indices{});
+            return;
+        }
+    } else if (t0 == GGML_TYPE_F16) {
+        if (td == GGML_TYPE_F16) {
+            if (t1 == GGML_TYPE_F16) {
+                launch_bin_bcast_pack<op, half, half, half>(src0, src1, dst,
+                    (const half *) src0->data, (const half *) src1->data, (half *) dst->data,
+                    stream, fuse_indices{});
+                return;
+            }
+            if (t1 == GGML_TYPE_F32) {
+                launch_bin_bcast_pack<op, half, float, half>(src0, src1, dst,
+                    (const half *) src0->data, (const float *) src1->data, (half *) dst->data,
+                    stream, fuse_indices{});
+                return;
+            }
+        } else if (td == GGML_TYPE_F32) {
+            launch_bin_bcast_pack<op, half, float, float>(src0, src1, dst,
+                (const half *) src0->data, (const float *) src1->data, (float *) dst->data,
+                stream, fuse_indices{});
+            return;
+        }
+    }
+
+    // no supported combination matched
+    fprintf(stderr,
+            "%s: unsupported types for fusion: dst: %s, src0: %s, src1: %s\n",
+            __func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+    GGML_ABORT("fatal error");
+}
+
+
+// Entry point for fused additions. n_fuse is a runtime value in [2, 8] (asserted) that is
+// mapped onto the matching compile-time instantiation of ggml_cuda_op_fused_binbcast_impl.
+void ggml_cuda_op_fused_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst, int n_fuse) {
+    GGML_ASSERT(2 <= n_fuse && n_fuse <= 8);
+
+    switch (n_fuse) {
+        case 2: return ggml_cuda_op_fused_binbcast_impl<op_add, 2>(ctx, dst);
+        case 3: return ggml_cuda_op_fused_binbcast_impl<op_add, 3>(ctx, dst);
+        case 4: return ggml_cuda_op_fused_binbcast_impl<op_add, 4>(ctx, dst);
+        case 5: return ggml_cuda_op_fused_binbcast_impl<op_add, 5>(ctx, dst);
+        case 6: return ggml_cuda_op_fused_binbcast_impl<op_add, 6>(ctx, dst);
+        case 7: return ggml_cuda_op_fused_binbcast_impl<op_add, 7>(ctx, dst);
+        case 8: return ggml_cuda_op_fused_binbcast_impl<op_add, 8>(ctx, dst);
+        default:
+            GGML_ASSERT(false && "Unsupported n_fuse value");
+    }
+}
+
+// REPEAT_BACK: reduces dst->src[0] into dst via repeat_back_cuda.
+// Requirements (all asserted): src and dst share a type, dst is contiguous, dst can be
+// repeated to the shape of src, and ne2*ne3 of dst is at most 1 << 15.
+// Only F32 is implemented; any other type hits the final assertion.
+void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_can_repeat(dst, src0));
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    GGML_ASSERT(ne2*ne3 <= (1 << 15));
+
+    // convert the src byte strides into element strides
+    const size_t elem_size = ggml_type_size(src0->type);
+    const size_t stride[4] = { nb00 / elem_size, nb01 / elem_size, nb02 / elem_size, nb03 / elem_size };
+
+    if (dst->type == GGML_TYPE_F32) {
+        repeat_back_cuda(
+            (const float *) src0->data, (float *) dst->data,
+            ne00, ne01, ne02, ne03,
+            stride[0], stride[1], stride[2], stride[3],
+            ne0, ne1, ne2, ne3, stream);
+    } else {
+        GGML_ASSERT(false);
+    }
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cuh
@@ -0,0 +1,11 @@
+#include "common.cuh"
+
+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_fused_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst, int n_fuse);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cu
@@ -0,0 +1,45 @@
+#include "clamp.cuh"
+
+// Clamps x to the range given by min and max: the larger of x and min is taken first,
+// then the smaller of that value and max.
+static __device__ __forceinline__ float op_clamp(float x, float min, float max) {
+    const float raised = fmaxf(x, min);
+    return fminf(raised, max);
+}
+
+// Element-wise clamp kernel: dst[i] = clamp(x[i], min, max) for every i in [0, k).
+// The comparison is carried out in float and the result is converted back to T.
+//
+// k and the element index are 64-bit: the caller obtains the element count as a 64-bit
+// value, and narrowing it to int would silently drop elements for very large tensors.
+template <class T>
+static __global__ void op_clamp_kernel(const T * x, T * dst, const T min, const T max, const int64_t k) {
+    // widen before multiplying so the product cannot wrap in 32 bits
+    const int64_t i = (int64_t) blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = (T)op_clamp((float)x[i], (float)min, (float)max);
+}
+
+// Launches op_clamp_kernel over k elements on `stream`, using one thread per element
+// and CUDA_CLAMP_BLOCK_SIZE threads per block.
+template <class T>
+static void clamp_cuda(const T * x, T * dst, const T min, const T max, const int64_t k, cudaStream_t stream) {
+    const int64_t num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    op_clamp_kernel<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+}
+
+
+// CLAMP: clamps every element of dst->src[0] into dst.
+// The two bounds are read as consecutive floats from the start of dst->op_params
+// (lower bound first). src and dst must have the same type, either F32 or F16.
+void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    // bounds[0] = min, bounds[1] = max
+    float bounds[2];
+    memcpy(bounds, dst->op_params, sizeof(bounds));
+
+    const void * in  = src0->data;
+    void       * out = dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    if (src0->type != GGML_TYPE_F16) {
+        clamp_cuda((const float *)in, (float *)out, bounds[0], bounds[1], ggml_nelements(src0), stream);
+        return;
+    }
+
+    clamp_cuda((const half *)in, (half *)out, (half)bounds[0], (half)bounds[1], ggml_nelements(src0), stream);
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_CLAMP_BLOCK_SIZE 256
+
+void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
--- a/ml/backend/ggml/ggml/src/ggml-cuda/concat.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/concat.cu
@@ -0,0 +1,221 @@
+#include "concat.cuh"
+
+// contiguous kernels
+// Concatenation along dim 0 for dense float data.
+//   ne0   length of a dst row
+//   ne00  length of an x row; the remaining ne0 - ne00 elements of each row come from y
+// blockIdx.y / blockIdx.z select the row, gridDim.y is the number of rows per z slice.
+static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
+    const int col = threadIdx.x + blockIdx.x * blockDim.x;
+    if (col >= ne0) {
+        return;
+    }
+
+    // linear row number, shared by dst and both inputs
+    const unsigned int row = blockIdx.y + blockIdx.z * gridDim.y;
+
+    const int dst_idx = col + row * ne0;
+
+    const bool from_x = col < ne00;
+    if (from_x) {
+        const int src_idx = col + row * ne00;
+        dst[dst_idx] = x[src_idx];
+        return;
+    }
+
+    const int src_idx = (col - ne00) + row * (ne0 - ne00);
+    dst[dst_idx] = y[src_idx];
+}
+
+// Concatenation along dim 1 for dense float data.
+//   ne0   row length (identical for x, y and dst)
+//   ne01  number of rows per slice taken from x; rows with blockIdx.y >= ne01 come from y
+// gridDim.y is the number of dst rows per z slice.
+static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
+    const int col = threadIdx.x + blockIdx.x * blockDim.x;
+    if (col >= ne0) {
+        return;
+    }
+
+    const int dst_idx = col + (blockIdx.y + blockIdx.z * gridDim.y) * ne0;
+
+    const bool from_x = blockIdx.y < (unsigned)ne01;
+
+    // row inside the chosen source and that source's row count per slice
+    const unsigned int src_row  = from_x ? blockIdx.y    : blockIdx.y - ne01;
+    const unsigned int src_rows = from_x ? (unsigned)ne01 : gridDim.y - ne01;
+
+    const int src_idx = col + src_row * ne0 + blockIdx.z * ne0 * src_rows;
+
+    dst[dst_idx] = from_x ? x[src_idx] : y[src_idx];
+}
+
+// Concatenation along dim 2 for dense float data.
+//   ne0   row length (identical for x, y and dst)
+//   ne02  number of z slices taken from x; slices with blockIdx.z >= ne02 come from y
+// gridDim.y is the number of rows per slice.
+static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
+    const int col = threadIdx.x + blockIdx.x * blockDim.x;
+    if (col >= ne0) {
+        return;
+    }
+
+    // position inside a slice, shared by dst and both inputs
+    const unsigned int in_slice = col + blockIdx.y * ne0;
+    const unsigned int slice_sz = ne0 * gridDim.y;
+
+    const int dst_idx = in_slice + blockIdx.z * slice_sz;
+
+    const bool from_x = blockIdx.z < (unsigned)ne02;
+    if (from_x) {
+        // x occupies the leading slices, so its offset equals the dst offset
+        const int src_idx = in_slice + blockIdx.z * slice_sz;
+        dst[dst_idx] = x[src_idx];
+        return;
+    }
+
+    const int src_idx = in_slice + (blockIdx.z - ne02) * slice_sz;
+    dst[dst_idx] = y[src_idx];
+}
+
+// Launches the dense concat kernel for one 3D slab.
+// The grid is (ceil(ne0 / CUDA_CONCAT_BLOCK_SIZE), ne1, ne2) over the dst extents.
+// dim selects the kernel: 0 -> dim0, 1 -> dim1, any other value -> dim2.
+static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
+    const int blocks_x = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    const dim3 grid(blocks_x, ne1, ne2);
+
+    switch (dim) {
+        case 0:
+            concat_f32_dim0<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
+            break;
+        case 1:
+            concat_f32_dim1<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
+            break;
+        default:
+            concat_f32_dim2<<<grid, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+            break;
+    }
+}
+
+// non-contiguous kernel (slow)
+// Stride-based concat of float elements along `dim`.
+// Each block handles one (i1, i2, i3) coordinate taken from blockIdx.x/y/z; its threads
+// step over i0 in increments of blockDim.x. An element whose coordinates all lie inside
+// the src0 extents is read from src0, every other element is read from src1 with the
+// src0 extent subtracted from the index along `dim`. All nb* values are byte strides.
+// The src1 extents and dst extents other than ne0 are accepted but unused.
+template <int dim>
+static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
+    concat_f32_non_cont(
+        const char * src0,
+        const char * src1,
+              char * dst,
+           int64_t   ne00,
+           int64_t   ne01,
+           int64_t   ne02,
+           int64_t   ne03,
+          uint64_t   nb00,
+          uint64_t   nb01,
+          uint64_t   nb02,
+          uint64_t   nb03,
+           int64_t /*ne10*/,
+           int64_t /*ne11*/,
+           int64_t /*ne12*/,
+           int64_t /*ne13*/,
+          uint64_t   nb10,
+          uint64_t   nb11,
+          uint64_t   nb12,
+          uint64_t   nb13,
+           int64_t   ne0,
+           int64_t /*ne1*/,
+           int64_t /*ne2*/,
+           int64_t /*ne3*/,
+          uint64_t   nb0,
+          uint64_t   nb1,
+          uint64_t   nb2,
+          uint64_t   nb3){
+    static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");
+
+    const int64_t i1 = blockIdx.x;
+    const int64_t i2 = blockIdx.y;
+    const int64_t i3 = blockIdx.z;
+
+    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
+        const bool in_src0 = i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03;
+
+        const char * from;
+        if (in_src0) {
+            from = src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00;
+        } else {
+            // shift the index along the concat dimension into src1's own coordinates
+            int64_t j0 = i0;
+            int64_t j1 = i1;
+            int64_t j2 = i2;
+            int64_t j3 = i3;
+            if constexpr (dim == 0) { j0 -= ne00; }
+            if constexpr (dim == 1) { j1 -= ne01; }
+            if constexpr (dim == 2) { j2 -= ne02; }
+            if constexpr (dim == 3) { j3 -= ne03; }
+
+            from = src1 + j3*nb13 + j2*nb12 + j1*nb11 + j0*nb10;
+        }
+
+        char * to = dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0;
+
+        *(float *) to = *(const float *) from;
+    }
+}
+
+
+// CONCAT: joins dst->src[0] and dst->src[1] along the dimension stored in
+// dst->op_params[0]. All three tensors must be F32.
+//
+// Paths:
+//   - both inputs contiguous, dim 0..2: one dense kernel launch per index of dst dim 3
+//   - both inputs contiguous, dim 3:    two device-to-device copies, src0 then src1
+//   - otherwise:                        the stride-based kernel, one block per dst row
+//
+// dim is validated once up front so that an out-of-range value aborts on every path;
+// previously only the non-contiguous path rejected it and the contiguous path would
+// have fallen through to the dim 2 kernel.
+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    cudaStream_t stream = ctx.stream();
+
+    const int32_t dim = ((int32_t *) dst->op_params)[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    GGML_ASSERT(dim >= 0 && dim <= 3);
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        const float * src0_d = (const float *)src0->data;
+        const float * src1_d = (const float *)src1->data;
+
+        float * dst_d = (float *)dst->data;
+
+        if (dim != 3) {
+            // nb[3] is a byte stride; convert it to a float element stride per slab
+            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+                concat_f32_cuda(
+                        src0_d + i3 * (src0->nb[3] / sizeof(float)),
+                        src1_d + i3 * (src1->nb[3] / sizeof(float)),
+                        dst_d + i3 * ( dst->nb[3] / sizeof(float)),
+                        src0->ne[0], src0->ne[1], src0->ne[2],
+                        dst->ne[0],  dst->ne[1],  dst->ne[2], dim, stream);
+            }
+        } else {
+            // along the outermost dimension the result is src0 followed by src1
+            const size_t size0 = ggml_nbytes(src0);
+            const size_t size1 = ggml_nbytes(src1);
+
+            CUDA_CHECK(cudaMemcpyAsync(dst_d,                       src0_d, size0, cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/sizeof(float), src1_d, size1, cudaMemcpyDeviceToDevice, stream));
+        }
+    } else {
+        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
+        // the lambda parameter shadows the runtime `dim` with a compile-time constant
+        // so it can be used as the kernel's template argument
+        auto launch_kernel = [&](auto dim) {
+            concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+                (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
+                src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+                src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
+                dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+                dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
+        };
+        switch (dim) {
+            case 0:
+                launch_kernel(std::integral_constant<int, 0>{});
+                break;
+            case 1:
+                launch_kernel(std::integral_constant<int, 1>{});
+                break;
+            case 2:
+                launch_kernel(std::integral_constant<int, 2>{});
+                break;
+            case 3:
+                launch_kernel(std::integral_constant<int, 3>{});
+                break;
+            default:
+                GGML_ABORT("Invalid dim: %d", dim);
+                break;
+        }
+    }
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/concat.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/concat.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_CONCAT_BLOCK_SIZE 256
+
+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cu
@@ -0,0 +1,86 @@
+#include "conv-transpose-1d.cuh"
+
+// Transposed 1D convolution, one thread per dst element (output_size elements in total).
+// For the dst element at row `global_index / dst_ne0`, position `global_index % dst_ne0`,
+// the kernel sums over all src0_ne2 channels and all src1_ne0 input positions i the
+// product src0[weight] * src1[input] for which the position falls inside the window
+// [i*s0, i*s0 + src0_ne0).
+// p0, d0 and the remaining extents are part of the signature but not used.
+static  __global__ void conv_transpose_1d_kernel(
+        const int s0, const int p0, const int d0, const int output_size,
+        const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
+        const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
+        const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
+        const float * src0, const float * src1,  float * dst) {
+    const int global_index = threadIdx.x + blockIdx.x * blockDim.x;
+    if (global_index >= output_size) {
+        return;
+    }
+
+    const int out_row = global_index / dst_ne0;
+    const int out_pos = global_index % dst_ne0;
+
+    float sum = 0;
+
+    for (int c = 0; c < src0_ne2; c++) {
+        const int kernel_base = (src0_ne0 * src0_ne1 * c) + (out_row * src0_ne0);
+        const int input_base  = src1_ne0 * c;
+
+        for (int i = 0; i < src1_ne0; i++) {
+            const int window_start = i*s0;
+
+            // skip inputs whose kernel window does not cover this output position
+            if (out_pos < window_start || out_pos >= window_start + src0_ne0) {
+                continue;
+            }
+
+            const float w = src0[kernel_base + (out_pos - window_start)];
+            const float v = src1[input_base + i];
+
+            sum += w * v;
+        }
+    }
+
+    dst[global_index] = sum;
+    GGML_UNUSED_VARS(p0, d0, src0_ne3, src1_ne3, dst_ne3, src1_ne1, dst_ne1, src1_ne2, dst_ne2);
+}
+
+// Launches conv_transpose_1d_kernel on `stream` with one thread per output element,
+// grouped in blocks of CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE threads. Every other argument is
+// forwarded to the kernel unchanged.
+static void conv_transpose_1d_f32_f32_cuda(
+        const int s0, const int p0, const int d0, const int output_size,
+        const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
+        const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
+        const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
+        const float * src0, const float * src1,  float * dst,
+        cudaStream_t stream) {
+
+    const int threads = CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
+    const int blocks  = (output_size + threads - 1) / threads;
+
+    conv_transpose_1d_kernel<<<blocks, threads, 0, stream>>>(
+        s0, p0, d0, output_size,
+        src0_ne0, src0_ne1, src0_ne2, src0_ne3,
+        src1_ne0, src1_ne1, src1_ne2, src1_ne3,
+        dst_ne0, dst_ne1, dst_ne2, dst_ne3,
+        src0, src1, dst);
+}
+
+// CONV_TRANSPOSE_1D: dst = transposed 1D convolution of dst->src[1] (input) with
+// dst->src[0] (kernel).
+//
+// Requirements (asserted): src0, src1 and dst are F32; src0 and src1 are contiguous.
+// The stride is read from dst->op_params[0]; padding and dilation are fixed to 0 and 1.
+//
+// src1's type is asserted as well because its data pointer is reinterpreted as float
+// just like src0's; without the check a non-F32 input would be read as garbage.
+void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src1_d = (const float *)src1->data;
+
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+
+    const int s0 = opts[0];
+    const int p0 = 0;//opts[3];
+    const int d0 = 1;//opts[4];
+
+    // one thread per dst element
+    const int64_t output_size = ggml_nelements(dst);
+
+    conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+        src0_d, src1_d, dst_d, stream);
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
+
+void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ml/backend/ggml/ggml/src/ggml-cuda/conv2d-dw.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/conv2d-dw.cu
@@ -0,0 +1,161 @@
+#include "conv2d-dw.cuh"
+
+// Geometry of a depthwise 2D convolution, bundled so the layout helpers can share it.
+// The member order is relied upon by aggregate initialisation and must not change.
+struct conv_params {
+    int in_w;        // input width
+    int in_h;        // input height
+    int out_w;       // output width
+    int out_h;       // output height
+    int kernel_w;    // kernel width
+    int kernel_h;    // kernel height
+    int stride_x;    // horizontal stride
+    int stride_y;    // vertical stride
+    int padding_x;   // horizontal padding
+    int padding_y;   // vertical padding
+    int dilation_x;  // horizontal dilation
+    int dilation_y;  // vertical dilation
+    int channels;    // number of channels
+    int batches;     // number of batches
+};
+
+// Range of kernel taps to visit for one output element: [y_min, y_max) x [x_min, x_max).
+struct kernel_bounds {
+    int y_min;
+    int y_max;
+    int x_min;
+    int x_max;
+};
+
+// Computes the half-open range of kernel taps to visit for output position (out_x, out_y).
+// Per axis, with origin = padding - out * stride:
+//   min = max(0,           (origin          + dilation - 1) / dilation)
+//   max = min(kernel size, (origin + in size + dilation - 1) / dilation)
+// The intent appears to be to restrict the taps to those that read inside the input.
+__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) {
+    const int origin_y = params.padding_y - out_y * params.stride_y;
+    const int origin_x = params.padding_x - out_x * params.stride_x;
+
+    const int round_y = params.dilation_y - 1;
+    const int round_x = params.dilation_x - 1;
+
+    kernel_bounds result;
+
+    result.y_min = max(0, (origin_y + round_y) / params.dilation_y);
+    result.y_max = min(params.kernel_h, (params.in_h + origin_y + round_y) / params.dilation_y);
+
+    result.x_min = max(0, (origin_x + round_x) / params.dilation_x);
+    result.x_max = min(params.kernel_w, (params.in_w + origin_x + round_x) / params.dilation_x);
+
+    return result;
+}
+
+// Maps an output coordinate and a kernel tap to the input coordinate it reads:
+// out_coord * stride + kern_coord * dilation - padding.
+__device__ __forceinline__ int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
+    const int window_origin = out_coord * stride;
+    const int tap_offset    = kern_coord * dilation;
+    return window_origin + tap_offset - padding;
+}
+
+// Index arithmetic for data stored with x varying fastest, then y, then channel, then batch.
+struct whcn_layout {
+    // Linear offset of input element (n, c, y, x).
+    __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
+        const int plane = params.in_w * params.in_h;
+        return n * (params.channels * plane) + c * plane + y * params.in_w + x;
+    }
+
+    // Linear offset of kernel tap (ky, kx) of channel c.
+    __device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
+        const int taps = params.kernel_h * params.kernel_w;
+        return c * taps + ky * params.kernel_w + kx;
+    }
+
+    // Linear offset of output element (n, c, y, x).
+    __device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
+        const int plane = params.out_w * params.out_h;
+        return n * (params.channels * plane) + c * plane + y * params.out_w + x;
+    }
+
+    // Splits a linear output index into (n, c, out_y, out_x) by peeling off one
+    // dimension at a time, fastest-varying first.
+    __device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
+                                          int & out_x) {
+        int rest = global_idx;
+
+        out_x = rest % params.out_w;
+        rest /= params.out_w;
+
+        out_y = rest % params.out_h;
+        rest /= params.out_h;
+
+        c = rest % params.channels;
+        n = rest / params.channels;
+    }
+};
+
+// Index arithmetic for data stored with channel varying fastest, then x, then y, then batch.
+struct cwhn_layout {
+    // Linear offset of input element (n, c, y, x).
+    __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
+        const int pixel = y * params.in_w + x;
+        return n * (params.channels * params.in_w * params.in_h) + pixel * params.channels + c;
+    }
+
+    // Linear offset of kernel tap (ky, kx) of channel c.
+    __device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
+        const int tap = ky * params.kernel_w + kx;
+        return tap * params.channels + c;
+    }
+
+    // Linear offset of output element (n, c, y, x).
+    __device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
+        const int row_stride = params.out_w * params.channels;
+        return n * (params.channels * params.out_w * params.out_h) + y * row_stride +
+               x * params.channels + c;
+    }
+
+    // Splits a linear output index into (n, c, out_y, out_x) by peeling off one
+    // dimension at a time, fastest-varying first.
+    __device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
+                                          int & out_x) {
+        int rest = global_idx;
+
+        c = rest % params.channels;
+        rest /= params.channels;
+
+        out_x = rest % params.out_w;
+        rest /= params.out_w;
+
+        out_y = rest % params.out_h;
+        n     = rest / params.out_h;
+    }
+};
+
+// Depthwise 2D convolution, one thread per output element.
+//   T       element type of input, kernel and output
+//   Layout  provides unpack_indices / input_index / kernel_index / output_index
+// Each thread decodes its (batch, channel, y, x) position, visits the kernel taps
+// returned by calculate_kernel_bounds and accumulates input * kernel in T.
+template <typename T, typename Layout>
+__global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restrict__ kernel, T * __restrict__ output,
+                                 const int in_w, const int in_h, const int out_w, const int out_h,
+                                 const int kernel_w, const int kernel_h, const int stride_x, const int stride_y,
+                                 const int padding_x, const int padding_y, const int dilation_x, const int dilation_y,
+                                 const int channels, const int batches) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int n_outputs = batches * channels * out_h * out_w;
+
+    if (tid >= n_outputs) {
+        return;
+    }
+
+    // member order must match the declaration of conv_params
+    conv_params params = { in_w,     in_h,      out_w,     out_h,      kernel_w,   kernel_h, stride_x,
+                           stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches };
+
+    int n, c, oy, ox;
+    Layout::unpack_indices(tid, params, n, c, oy, ox);
+
+    const kernel_bounds taps = calculate_kernel_bounds(ox, oy, params);
+
+    T sum = 0;
+    for (int ky = taps.y_min; ky < taps.y_max; ++ky) {
+        const int iy = calculate_input_coord(oy, ky, params.stride_y, params.dilation_y, params.padding_y);
+
+        for (int kx = taps.x_min; kx < taps.x_max; ++kx) {
+            const int ix = calculate_input_coord(ox, kx, params.stride_x, params.dilation_x, params.padding_x);
+
+            const T in_val = input[Layout::input_index(n, c, iy, ix, params)];
+            const T k_val  = kernel[Layout::kernel_index(c, ky, kx, params)];
+
+            sum += in_val * k_val;
+        }
+    }
+
+    output[Layout::output_index(n, c, oy, ox, params)] = sum;
+}
+
+// CONV_2D_DW: depthwise 2D convolution of dst->src[1] (input) with dst->src[0] (kernel).
+// All three tensors must be F32. dst->op_params holds, in order: stride x/y,
+// padding x/y, dilation x/y. One thread is launched per dst element.
+// The index layout is chosen from the input: whcn_layout when it is contiguous,
+// cwhn_layout when it is channel-contiguous; any other layout aborts.
+void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * input  = dst->src[1];
+
+    GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+
+    const float * weights = (const float *) kernel->data;
+    const float * in_data = (const float *) input->data;
+    float *       out     = (float *) dst->data;
+
+    const int32_t * opts = (const int32_t *) dst->op_params;
+
+    const int stride_x   = opts[0];
+    const int stride_y   = opts[1];
+    const int padding_x  = opts[2];
+    const int padding_y  = opts[3];
+    const int dilation_x = opts[4];
+    const int dilation_y = opts[5];
+
+    const int in_w     = input->ne[0];
+    const int in_h     = input->ne[1];
+    const int kernel_w = kernel->ne[0];
+    const int kernel_h = kernel->ne[1];
+    const int out_w    = dst->ne[0];
+    const int out_h    = dst->ne[1];
+    const int channels = dst->ne[2];
+    const int batches  = dst->ne[3];
+
+    cudaStream_t stream = ctx.stream();
+
+    const int n_outputs = batches * channels * out_h * out_w;
+    const int n_blocks  = (n_outputs + CUDA_CONV2D_DW_BLOCK_SIZE - 1) / CUDA_CONV2D_DW_BLOCK_SIZE;
+
+    // the tag argument only carries the layout type
+    auto launch = [&](auto layout_tag) {
+        using layout_t = decltype(layout_tag);
+        conv2d_dw_kernel<float, layout_t><<<n_blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, stream>>>(
+            in_data, weights, out, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y,
+            padding_x, padding_y, dilation_x, dilation_y, channels, batches);
+    };
+
+    if (ggml_is_contiguous(input)) {
+        launch(whcn_layout{});
+    } else if (ggml_is_contiguous_channels(input)) {
+        launch(cwhn_layout{});
+    } else {
+        GGML_ABORT("Unsupported memory layout for conv_2d_dw");
+    }
+}
--- a/ml/backend/ggml/ggml/src/ggml-cuda/conv2d-dw.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/conv2d-dw.cuh
@@ -0,0 +1,5 @@
+#pragma once
+#include "common.cuh"
+
+#define CUDA_CONV2D_DW_BLOCK_SIZE 256
+void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/Show More
+++ b/Show More