201 lines
8.7 KiB
Go
201 lines
8.7 KiB
Go
package create
|
|
|
|
import (
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
|
|
st "github.com/ollama/ollama/x/safetensors"
|
|
)
|
|
|
|
func TestCreateSafetensorsModel_LagunaHFFP8RespectsSourceTensorPrecision(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
requested string
|
|
wantFP8Gate string
|
|
wantFP8Up string
|
|
wantFP8Down string
|
|
wantBF16QProj string
|
|
}{
|
|
{
|
|
name: "default mxfp8 import keeps source bf16 tensors",
|
|
requested: "",
|
|
wantFP8Gate: "mxfp8",
|
|
wantFP8Up: "mxfp8",
|
|
wantFP8Down: "mxfp8",
|
|
wantBF16QProj: "",
|
|
},
|
|
{
|
|
name: "nvfp4 import keeps source bf16 tensors and preserves down_proj at mxfp8",
|
|
requested: "nvfp4",
|
|
wantFP8Gate: "nvfp4",
|
|
wantFP8Up: "nvfp4",
|
|
wantFP8Down: "mxfp8",
|
|
wantBF16QProj: "",
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
dir := t.TempDir()
|
|
configJSON := `{
|
|
"model_type": "laguna",
|
|
"architectures": ["LagunaForCausalLM"],
|
|
"quantization_config": {"quant_method": "fp8", "weight_block_size": [128, 128]}
|
|
}`
|
|
if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
|
|
t.Fatalf("failed to write config.json: %v", err)
|
|
}
|
|
|
|
createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.gate_proj.weight", "F8_E4M3", []int32{128, 128}, make([]byte, 128*128)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.gate_proj.weight_scale_inv", "BF16", []int32{1, 1}, make([]byte, 2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.up_proj.weight", "F8_E4M3", []int32{128, 128}, make([]byte, 128*128)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.up_proj.weight_scale_inv", "BF16", []int32{1, 1}, make([]byte, 2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.down_proj.weight", "F8_E4M3", []int32{128, 128}, make([]byte, 128*128)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.down_proj.weight_scale_inv", "BF16", []int32{1, 1}, make([]byte, 2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.self_attn.q_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.embed_tokens.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("lm_head.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.gate.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
})
|
|
|
|
quantizeByName := make(map[string]string)
|
|
|
|
createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
|
|
if _, err := io.ReadAll(r); err != nil {
|
|
return LayerInfo{}, err
|
|
}
|
|
return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
|
|
}
|
|
createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
|
|
if _, err := io.ReadAll(r); err != nil {
|
|
return nil, err
|
|
}
|
|
quantizeByName[name] = quantize
|
|
return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
|
|
}
|
|
writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
|
|
|
|
if err := CreateSafetensorsModel("test-model", dir, tt.requested, createLayer, createTensorLayer, writeManifest, func(string) {}); err != nil {
|
|
t.Fatalf("CreateSafetensorsModel failed: %v", err)
|
|
}
|
|
|
|
if got := quantizeByName["model.layers.0.mlp.experts.0.gate_proj.weight"]; got != tt.wantFP8Gate {
|
|
t.Fatalf("gate_proj quantization = %q, want %q", got, tt.wantFP8Gate)
|
|
}
|
|
if got := quantizeByName["model.layers.0.mlp.experts.0.up_proj.weight"]; got != tt.wantFP8Up {
|
|
t.Fatalf("up_proj quantization = %q, want %q", got, tt.wantFP8Up)
|
|
}
|
|
if got := quantizeByName["model.layers.0.mlp.experts.0.down_proj.weight"]; got != tt.wantFP8Down {
|
|
t.Fatalf("down_proj quantization = %q, want %q", got, tt.wantFP8Down)
|
|
}
|
|
for _, name := range []string{
|
|
"model.layers.0.self_attn.q_proj.weight",
|
|
"model.embed_tokens.weight",
|
|
"lm_head.weight",
|
|
"model.layers.0.mlp.gate.weight",
|
|
} {
|
|
if got := quantizeByName[name]; got != tt.wantBF16QProj {
|
|
t.Fatalf("%s quantization = %q, want %q", name, got, tt.wantBF16QProj)
|
|
}
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCreateSafetensorsModel_LagunaBF16QuantizesOnlyRoutedExperts(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
requested string
|
|
want map[string]string
|
|
}{
|
|
{
|
|
name: "int8 quantizes only routed experts",
|
|
requested: "int8",
|
|
want: map[string]string{
|
|
"model.layers.0.mlp.experts.0.gate_proj.weight": "int8",
|
|
"model.layers.0.mlp.experts.0.up_proj.weight": "int8",
|
|
"model.layers.0.mlp.experts.0.down_proj.weight": "int8",
|
|
"model.layers.0.mlp.shared_experts.gate_proj.weight": "",
|
|
"model.layers.0.mlp.shared_experts.down_proj.weight": "",
|
|
"model.layers.0.self_attn.q_proj.weight": "",
|
|
"model.layers.0.mlp.down_proj.weight": "",
|
|
"model.embed_tokens.weight": "",
|
|
"lm_head.weight": "",
|
|
"model.layers.0.mlp.gate.weight": "",
|
|
},
|
|
},
|
|
{
|
|
name: "int4 keeps routed down_proj at int8 and leaves others bf16",
|
|
requested: "int4",
|
|
want: map[string]string{
|
|
"model.layers.0.mlp.experts.0.gate_proj.weight": "int4",
|
|
"model.layers.0.mlp.experts.0.up_proj.weight": "int4",
|
|
"model.layers.0.mlp.experts.0.down_proj.weight": "int8",
|
|
"model.layers.0.mlp.shared_experts.gate_proj.weight": "",
|
|
"model.layers.0.mlp.shared_experts.down_proj.weight": "",
|
|
"model.layers.0.self_attn.q_proj.weight": "",
|
|
"model.layers.0.mlp.down_proj.weight": "",
|
|
"model.embed_tokens.weight": "",
|
|
"lm_head.weight": "",
|
|
"model.layers.0.mlp.gate.weight": "",
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
dir := t.TempDir()
|
|
configJSON := `{
|
|
"model_type": "laguna",
|
|
"architectures": ["LagunaForCausalLM"]
|
|
}`
|
|
if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
|
|
t.Fatalf("failed to write config.json: %v", err)
|
|
}
|
|
|
|
createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.gate_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.up_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.down_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.shared_experts.gate_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.shared_experts.down_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.self_attn.q_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.down_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.embed_tokens.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("lm_head.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
st.NewTensorDataFromBytes("model.layers.0.mlp.gate.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
|
|
})
|
|
|
|
quantizeByName := make(map[string]string)
|
|
|
|
createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
|
|
if _, err := io.ReadAll(r); err != nil {
|
|
return LayerInfo{}, err
|
|
}
|
|
return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
|
|
}
|
|
createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
|
|
if _, err := io.ReadAll(r); err != nil {
|
|
return nil, err
|
|
}
|
|
quantizeByName[name] = quantize
|
|
return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
|
|
}
|
|
writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
|
|
|
|
if err := CreateSafetensorsModel("test-model", dir, tt.requested, createLayer, createTensorLayer, writeManifest, func(string) {}); err != nil {
|
|
t.Fatalf("CreateSafetensorsModel failed: %v", err)
|
|
}
|
|
|
|
for name, want := range tt.want {
|
|
if got := quantizeByName[name]; got != want {
|
|
t.Fatalf("%s quantization = %q, want %q", name, got, want)
|
|
}
|
|
}
|
|
})
|
|
}
|
|
}
|