llama.cpp verification source 2026-05-22
Some checks are pending
Copilot Setup Steps / copilot-setup-steps (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
Python Type-Check / python type-check (push) Waiting to run
Update Operations Documentation / update-ops-docs (push) Waiting to run
Some checks are pending
Copilot Setup Steps / copilot-setup-steps (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
Python Type-Check / python type-check (push) Waiting to run
Update Operations Documentation / update-ops-docs (push) Waiting to run
This commit is contained in:
98
tools/mtmd/models/cogvlm.cpp
Normal file
98
tools/mtmd/models/cogvlm.cpp
Normal file
@@ -0,0 +1,98 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the CogVLM vision graph.
//
// Patch embeddings and the class embedding are concatenated, the learned
// position embeddings are added, and the result runs through n_layer
// transformer blocks in which the norm is applied to the attention / FFN
// output *before* the residual add. Afterwards the class token is dropped and
// the patch tokens go through the projector (linear -> layernorm -> GELU ->
// gated MLP) and are wrapped in the begin/end-of-image embeddings.
ggml_cgraph * clip_graph_cogvlm::build() {
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    // all patches plus the [CLS] token
    const int n_pos = n_patches + 1;

    // Byte geometry of one row of the fused QKV projection. The views below
    // address the data with sizeof(float) strides.
    const auto head_bytes = d_head * sizeof(float);
    const auto part_bytes = n_embd * sizeof(float);

    // input patches followed by the class embedding (appended along dim 1)
    ggml_tensor * hidden = ggml_concat(ctx0, build_inp(), model.class_embedding, 1);

    hidden = ggml_add(ctx0, hidden, model.position_embeddings);
    cb(hidden, "inp_pos", -1);

    for (int il = 0; il < n_layer; ++il) {
        auto & blk = model.layers[il];

        // fused QKV projection with bias
        ggml_tensor * qkv = ggml_add(ctx0, build_mm(blk.qkv_w, hidden), blk.qkv_b);

        // Q, K and V are three consecutive n_embd-wide slices of every row
        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos, head_bytes, qkv->nb[1], 0);
        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos, head_bytes, qkv->nb[1], part_bytes);
        ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, d_head, n_head, n_pos, head_bytes, qkv->nb[1], 2 * part_bytes);

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        // attention (no mask), then norm, then residual
        ggml_tensor * attn = build_attn(blk.o_w, blk.o_b, Qcur, Kcur, Vcur, nullptr, kq_scale, il);
        cb(attn, "attn_out", il);

        attn = build_norm(attn, blk.ln_1_w, blk.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        cb(attn, "attn_post_norm", il);

        hidden = ggml_add(ctx0, attn, hidden);

        // feed-forward, then norm, then residual
        ggml_tensor * ffn = build_ffn(hidden,
                                      blk.ff_up_w, blk.ff_up_b,
                                      blk.ff_gate_w, blk.ff_gate_b,
                                      blk.ff_down_w, blk.ff_down_b,
                                      hparams.ffn_op, il);
        cb(ffn, "ffn_out", il);

        ffn = build_norm(ffn, blk.ln_2_w, blk.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cb(ffn, "ffn_post_norm", il);

        hidden = ggml_add(ctx0, ffn, hidden);
        cb(hidden, "layer_out", il);
    }

    // Keep only the first n_patches rows; the class token was appended after
    // the patches, so this drops it (same idea as build_llama4).
    ggml_tensor * cur = ggml_view_2d(ctx0, hidden, n_embd, n_patches, ggml_row_size(hidden->type, n_embd), 0);

    // projector: linear -> layernorm (weight + bias) -> GELU
    cur = build_mm(model.mm_model_proj, cur);
    cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
    cur = ggml_gelu_inplace(ctx0, cur);

    // gated MLP: the two branches read the same input and are merged by
    // ggml_swiglu_split before the down projection
    ggml_tensor * h_to_4h = build_mm(model.mm_h_to_4h_w, cur);
    ggml_tensor * gate    = build_mm(model.mm_gate_w, cur);
    gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
    cur  = build_mm(model.mm_4h_to_h_w, gate);

    // surround the image tokens with the boi / eoi embeddings
    cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
    cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);

    ggml_build_forward_expand(gf, cur);

    return gf;
}
|
||||
216
tools/mtmd/models/conformer.cpp
Normal file
216
tools/mtmd/models/conformer.cpp
Normal file
@@ -0,0 +1,216 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the conformer audio encoder graph.
//
// Stages: convolutional sub-sampling of the raw input, then hparams.n_layer
// conformer blocks (half-weighted FFN -> self-attention with a relative
// position term -> convolution module -> half-weighted FFN -> output norm),
// and finally the audio adapter (layernorm + 2-layer MLP).
ggml_cgraph * clip_graph_conformer::build() {
    // (v + 1) / 2, applied three times below
    const auto half_up = [](int v) { return (v + 1) / 2; };

    const int n_frames   = img.nx;
    const int n_pos      = n_frames / 2;
    const int n_pos_embd = half_up(half_up(half_up(n_frames))) * 2 - 1;
    GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);

    // position embedding is a graph input (512 x n_pos_embd, F32)
    ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd);
    ggml_set_name(pos_emb, "pos_emb");
    ggml_set_input(pos_emb);
    ggml_build_forward_expand(gf, pos_emb);

    ggml_tensor * state = ggml_cont(ctx0, ggml_transpose(ctx0, build_inp_raw(1)));

    // pre encode, conv subsampling
    {
        // adds the bias of pre-encode layer `idx` and reports the tensor
        const auto bias_and_tag = [&](ggml_tensor * t, int idx) {
            t = ggml_add(ctx0, t, model.pre_encode_conv_X_b[idx]);
            cb(t, "conformer.pre_encode.conv.{}", idx);
            return t;
        };

        // layer.0 conv2d (stride 2), layer.1 relu
        state = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], state, 2, 2, 1, 1, 1, 1);
        state = bias_and_tag(state, 0);
        state = ggml_relu_inplace(ctx0, state);

        // layer.2 depthwise conv2d (stride 2), layer.3 1x1 conv2d, layer.4 relu
        state = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], state, 2, 2, 1, 1, 1, 1);
        state = bias_and_tag(state, 2);
        state = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], state, 1, 1, 0, 0, 1, 1);
        state = bias_and_tag(state, 3);
        state = ggml_relu_inplace(ctx0, state);

        // layer.5 depthwise conv2d (stride 2), layer.6 1x1 conv2d, layer.7 relu
        state = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], state, 2, 2, 1, 1, 1, 1);
        state = bias_and_tag(state, 5);
        state = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], state, 1, 1, 0, 0, 1, 1);
        state = bias_and_tag(state, 6);
        state = ggml_relu_inplace(ctx0, state);

        // flatten channel and frequency axis
        state = ggml_cont(ctx0, ggml_permute(ctx0, state, 0, 2, 1, 3));
        state = ggml_reshape_2d(ctx0, state, state->ne[0] * state->ne[1], state->ne[2]);

        // output projection of the sub-sampling stack
        state = build_mm(model.pre_encode_out_w, state);
        state = ggml_add(ctx0, state, model.pre_encode_out_b);
        cb(state, "conformer.pre_encode.out", -1);
    }

    cb(pos_emb, "pos_emb", -1);

    for (int il = 0; il < hparams.n_layer; il++) {
        const auto & layer = model.layers[il];

        // weight of both feed-forward branches in the residual sum
        const auto fc_factor = 0.5f;

        ggml_tensor * skip = state;
        cb(state, "layer.in", il);

        // feed_forward1
        state = build_norm(state, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(state, "conformer.layers.{}.norm_feed_forward1", il);

        state = build_ffn(state, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
                          FFN_SILU, il);
        cb(state, "conformer.layers.{}.feed_forward1.linear2", il);

        skip = ggml_add(ctx0, skip, ggml_scale(ctx0, state, fc_factor));

        // self-attention
        {
            ggml_tensor * attn_in = build_norm(skip, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
            cb(attn_in, "conformer.layers.{}.norm_self_att", il);

            // Q gets two different learned biases: one for the content term
            // (pos_bias_u) and one for the positional term (pos_bias_v)
            ggml_tensor * Qcur = ggml_add(ctx0, build_mm(layer.q_w, attn_in), layer.q_b);
            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
            ggml_tensor * q_with_u = ggml_permute(ctx0, ggml_add(ctx0, Qcur, layer.pos_bias_u), 0, 2, 1, 3);
            ggml_tensor * q_with_v = ggml_permute(ctx0, ggml_add(ctx0, Qcur, layer.pos_bias_v), 0, 2, 1, 3);

            // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
            ggml_tensor * Kcur = ggml_add(ctx0, build_mm(layer.k_w, attn_in), layer.k_b);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));

            ggml_tensor * Vcur = ggml_add(ctx0, build_mm(layer.v_w, attn_in), layer.v_b);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));

            // build_attn won't fit due to matrix_ac and matrix_bd separation
            ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, q_with_u, Kcur);
            matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
            cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);

            // project the position embedding and split it into heads
            ggml_tensor * pos_proj = build_mm(layer.linear_pos_w, pos_emb);
            cb(pos_proj, "conformer.layers.{}.self_attn.linear_pos", il);
            pos_proj = ggml_reshape_3d(ctx0, pos_proj, d_head, n_head, pos_proj->ne[1]);
            pos_proj = ggml_permute(ctx0, pos_proj, 0, 2, 1, 3);

            ggml_tensor * matrix_bd = ggml_mul_mat(ctx0, q_with_v, pos_proj);
            matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));

            // rel shift: pad one column, roll it to the front, reinterpret
            // the buffer with swapped extents and skip the first q_len
            // elements
            {
                const auto pos_len = matrix_bd->ne[0];
                const auto q_len   = matrix_bd->ne[1];
                const auto n_h     = matrix_bd->ne[2];

                ggml_tensor * shifted = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
                shifted = ggml_roll(ctx0, shifted, 1, 0, 0, 0);
                shifted = ggml_reshape_3d(ctx0, shifted, q_len, pos_len + 1, n_h);
                shifted = ggml_view_3d(ctx0, shifted, q_len, pos_len, n_h, shifted->nb[1],
                                       shifted->nb[2], shifted->nb[0] * q_len);
                matrix_bd = ggml_cont_3d(ctx0, shifted, pos_len, q_len, n_h);
            }

            // trim the positional term to the width of the content term
            matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
                                     matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0);

            ggml_tensor * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
            scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
            cb(scores, "conformer.layers.{}.self_attn.id0", il);

            ggml_tensor * probs  = ggml_soft_max(ctx0, scores);
            ggml_tensor * merged = ggml_mul_mat(ctx0, probs, Vcur);
            merged = ggml_permute(ctx0, merged, 2, 0, 1, 3);
            merged = ggml_cont_2d(ctx0, merged, merged->ne[0] * merged->ne[1], merged->ne[2]);

            ggml_tensor * out = ggml_add(ctx0, build_mm(layer.o_w, merged), layer.o_b);
            cb(out, "conformer.layers.{}.self_attn.linear_out", il);

            state = out;
        }

        skip  = ggml_add(ctx0, skip, state);
        state = build_norm(skip, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(state, "conformer.layers.{}.norm_conv", il);

        // conv
        {
            ggml_tensor * conv = ggml_add(ctx0, build_mm(layer.conv_pw1_w, state), layer.conv_pw1_b);
            cb(conv, "conformer.layers.{}.conv.pointwise_conv1", il);

            // Gated linear unit with a sigmoid gate: first half of each row
            // is the value, second half the gate.
            // ggml_glu doesn't support sigmoid
            // TODO @ngxson : support this ops in ggml
            {
                const int64_t half = conv->ne[0] / 2;

                ggml_tensor * gate =
                    ggml_sigmoid(ctx0, ggml_view_2d(ctx0, conv, half, conv->ne[1], conv->nb[1], half * conv->nb[0]));
                ggml_tensor * value = ggml_view_2d(ctx0, conv, half, conv->ne[1], conv->nb[1], 0);

                conv = ggml_mul(ctx0, value, gate);
                conv = ggml_cont(ctx0, ggml_transpose(ctx0, conv));
            }

            // use ggml_ssm_conv for f32 precision
            // pad + roll + pad leaves 4 padded elements on each side of dim 0
            conv = ggml_pad(ctx0, conv, 4, 0, 0, 0);
            conv = ggml_roll(ctx0, conv, 4, 0, 0, 0);
            conv = ggml_pad(ctx0, conv, 4, 0, 0, 0);
            conv = ggml_ssm_conv(ctx0, conv, layer.conv_dw_w);
            conv = ggml_add(ctx0, conv, layer.conv_dw_b);

            // element-wise scale and shift with the conv norm parameters, then SiLU
            conv = ggml_mul(ctx0, conv, layer.conv_norm_w);
            conv = ggml_add(ctx0, conv, layer.conv_norm_b);
            conv = ggml_silu(ctx0, conv);

            // pointwise_conv2
            conv = build_mm(layer.conv_pw2_w, conv);
            conv = ggml_add(ctx0, conv, layer.conv_pw2_b);

            state = conv;
        }

        skip = ggml_add(ctx0, skip, state);

        // feed_forward2
        state = build_norm(skip, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(state, "conformer.layers.{}.norm_feed_forward2", il);

        state = build_ffn(state, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w,
                          layer.ff_down_1_b,
                          FFN_SILU, il);  // TODO(tarek): read activation for ffn from hparams
        cb(state, "conformer.layers.{}.feed_forward2.linear2", il);

        skip = ggml_add(ctx0, skip, ggml_scale(ctx0, state, fc_factor));
        cb(skip, "conformer.layers.{}.conv.id", il);

        // block output norm; its result is the next block's input
        state = build_norm(skip, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il);
        cb(state, "conformer.layers.{}.norm_out", il);
    }

    // audio adapter: layernorm followed by a 2-layer MLP (erf GELU, no gate)
    state = build_norm(state, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
    cb(state, "audio_adapter.model.{}", 0);
    state = build_ffn(state, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);

    cb(state, "projected", -1);

    ggml_build_forward_expand(gf, state);

    return gf;
}
|
||||
324
tools/mtmd/models/deepseekocr.cpp
Normal file
324
tools/mtmd/models/deepseekocr.cpp
Normal file
@@ -0,0 +1,324 @@
|
||||
#include "models.h"
|
||||
|
||||
// Implementation based on approach suggested by Acly
|
||||
// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
|
||||
// Cuts `x` (extents read as [c, w, h, b]) into window x window tiles.
//
// w and h are first padded up to the next multiple of `window`; the result
// has extents [c, window, window, npw * nph * b], where npw / nph are the
// number of tiles along w / h.
//
// Per the original author's note this replaces
//   x = ggml_win_part(m, x, window);
//   x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]);
static ggml_tensor * window_partition(ggml_context * ctx0, ggml_tensor * x, const int window) {
    const int64_t c = x->ne[0];
    const int64_t w = x->ne[1];
    const int64_t h = x->ne[2];
    const int64_t b = x->ne[3];

    // amount needed to reach the next multiple of `window` (0 if already aligned)
    const auto pad_for = [window](int64_t extent) -> int64_t {
        return (window - extent % window) % window;
    };

    const int64_t px  = pad_for(w);
    const int64_t py  = pad_for(h);
    const int64_t npw = (w + px) / window;
    const int64_t nph = (h + py) / window;

    ggml_tensor * padded =
        (px > 0 || py > 0) ? ggml_pad(ctx0, x, 0, static_cast<int>(px), static_cast<int>(py), 0) : x;

    // group each tile row together, swap the tile-column and in-tile-row
    // axes, then expose one tile per index of the last dimension
    ggml_tensor * tiles = ggml_reshape_4d(ctx0, padded, c * window, npw, window, nph * b);
    tiles = ggml_cont(ctx0, ggml_permute(ctx0, tiles, 0, 2, 1, 3));
    return ggml_reshape_4d(ctx0, tiles, c, window, window, npw * nph * b);
}
|
||||
|
||||
// Implementation based on approach suggested by Acly
|
||||
// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
|
||||
// Reassembles window x window tiles into a [c, w, h, b] tensor and crops away
// the padding up to the next multiple of `window`.
//
// `w` and `h` are the un-padded extents of the tensor that was partitioned;
// the batch size is recovered from x->ne[3] and the tile counts.
//
// Per the original author's note this replaces
//   x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]);
//   x = ggml_win_unpart(m, x, w, h, window);
static ggml_tensor * window_unpartition(ggml_context * ctx0,
                                        ggml_tensor *  x,
                                        const int      w,
                                        const int      h,
                                        const int      window) {
    // amount needed to reach the next multiple of `window` (0 if already aligned)
    const auto pad_for = [window](int extent) -> int64_t {
        return (window - extent % window) % window;
    };

    const int64_t c   = x->ne[0];
    const int64_t px  = pad_for(w);
    const int64_t py  = pad_for(h);
    const int64_t npw = (w + px) / window;
    const int64_t nph = (h + py) / window;
    const int64_t b   = x->ne[3] / (npw * nph);

    // undo the tile grouping: split the tile index, swap the axes back and
    // merge into the padded image extents
    ggml_tensor * merged = ggml_reshape_4d(ctx0, x, c * window, window, npw, nph * b);
    merged = ggml_cont(ctx0, ggml_permute(ctx0, merged, 0, 2, 1, 3));
    merged = ggml_reshape_4d(ctx0, merged, c, w + px, h + py, b);

    // crop to w x h while keeping the padded strides, then make it contiguous
    ggml_tensor * cropped =
        ggml_view_4d(ctx0, merged, merged->ne[0], w, h, merged->ne[3], merged->nb[1], merged->nb[2], merged->nb[3], 0);
    return ggml_cont(ctx0, cropped);
}
|
||||
|
||||
// Looks up relative position embeddings for every (query, key) pair.
//
// rel_pos : table with ne[0] = channels (C) and ne[1] = length (L)
// indices : I32 tensor with ne[0] == k_size and ne[1] == q_size (asserted),
//           each entry selecting one row of the table
//
// If L differs from 2 * max(q_size, k_size) - 1 the table is first resized
// along its length with bilinear interpolation. The gathered rows are
// returned reshaped to [C, k_size, q_size].
static ggml_tensor * get_rel_pos(ggml_context * ctx0,
                                 ggml_tensor *  rel_pos,
                                 ggml_tensor *  indices,
                                 const int      q_size,
                                 const int      k_size) {
    GGML_ASSERT(indices != nullptr);
    GGML_ASSERT(indices->type == GGML_TYPE_I32);
    GGML_ASSERT(indices->ne[0] == k_size);
    GGML_ASSERT(indices->ne[1] == q_size);

    const int64_t C = rel_pos->ne[0];  // channels
    const int64_t L = rel_pos->ne[1];  // length

    const auto max_rel_dist = 2 * std::max(q_size, k_size) - 1;

    ggml_tensor * table = rel_pos;

    if (max_rel_dist != L) {
        // Linear interpolation along the length axis: move the length to
        // ne[0], fold all other extents into ne[2], resize, and restore the
        // original axis order.
        const int64_t ne2    = table->ne[2];
        const int64_t ne3    = table->ne[3];
        const int64_t n_rows = C * ne2 * ne3;

        ggml_tensor * by_len = ggml_cont(ctx0, ggml_permute(ctx0, table, 1, 0, 2, 3));
        by_len = ggml_reshape_3d(ctx0, by_len, L, 1, n_rows);

        ggml_tensor * resized = ggml_interpolate(ctx0, by_len, max_rel_dist, 1, n_rows, 1, GGML_SCALE_MODE_BILINEAR);
        resized = ggml_reshape_4d(ctx0, resized, max_rel_dist, C, ne2, ne3);

        table = ggml_cont(ctx0, ggml_permute(ctx0, resized, 1, 0, 2, 3));
    }

    // Flatten indices to 1D for ggml_get_rows
    const int qk = q_size * k_size;

    ggml_tensor * flat_idx = ggml_reshape_1d(ctx0, indices, qk);
    ggml_tensor * gathered = ggml_get_rows(ctx0, table, flat_idx);

    return ggml_reshape_3d(ctx0, gathered, C, k_size, q_size);  // [C, k_size, q_size]
}
|
||||
|
||||
// Builds the DeepSeek-OCR vision graph.
//
// Two encoders are chained: a SAM-style encoder (windowed / global attention
// with decomposed relative position bias, followed by a convolutional neck)
// and a ViT that consumes the SAM output. The patch tokens of both are
// concatenated channel-wise, projected by mm_fc, laid out as an h x w grid
// with one image_newline embedding after every row and terminated by the
// view_seperator embedding.
ggml_cgraph * clip_graph_deepseekocr::build() {
    ggml_tensor * inp_raw = build_inp_raw();

    ggml_tensor * sam_out;
    // Building SAM
    {
        // NOTE: n_embd / n_layer are also used unqualified outside this
        // scope; inside it these locals take precedence and hold the SAM
        // hyper-parameters.
        const int n_embd  = hparams.sam_n_embd;
        const int n_layer = hparams.sam_n_layer;
        const int n_heads = hparams.sam_n_head;
        const int d_heads = n_embd / n_heads;
        const int window  = hparams.attn_window_size;

        // patch embedding: conv + bias, then move channels to ne[0]
        ggml_tensor * patches = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw);
        patches = ggml_add(ctx0, patches, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd));
        patches = ggml_cont(ctx0, ggml_permute(ctx0, patches, 1, 2, 0, 3));

        // index tables for the relative position lookup (graph inputs);
        // one sized for a single window, one for the whole patch grid
        ggml_tensor * rel_pos_indices_local  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window);
        ggml_tensor * rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, patches->ne[1], patches->ne[2]);
        ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local");
        ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global");
        ggml_set_input(rel_pos_indices_local);
        ggml_set_input(rel_pos_indices_global);

        // absolute position embedding, bicubically resized when its grid
        // size differs from the patch grid
        const auto tgt_size = patches->ne[1];
        const auto str_size = model.pos_embed->ne[1];

        ggml_tensor * abs_pos = model.pos_embed;
        if (str_size != tgt_size) {
            ggml_tensor * resized = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3));
            resized = ggml_interpolate(ctx0, resized, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC);
            abs_pos = ggml_cont(ctx0, ggml_permute(ctx0, resized, 1, 2, 0, 3));
        }

        ggml_tensor * cur = ggml_add(ctx0, patches, abs_pos);

        // loop over layers
        for (int il = 0; il < n_layer; il++) {
            auto &        layer     = model.sam_layers[il];
            const bool    is_global = hparams.is_global_attn(il);
            ggml_tensor * skip      = cur;

            // layernorm1
            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);

            // grid extents before any window partitioning (needed to undo it)
            const int64_t w0 = cur->ne[1];
            const int64_t h0 = cur->ne[2];

            ggml_tensor * indices = rel_pos_indices_global;
            if (!is_global) {
                // local attention layer - apply window partition
                cur     = window_partition(ctx0, cur, window);
                indices = rel_pos_indices_local;
            }

            const int64_t W = cur->ne[1];
            const int64_t H = cur->ne[2];

            // self-attention
            {
                const int B = cur->ne[3];

                // fused QKV projection, viewed as [n_embd, 3, W*H, B]
                cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
                cur = ggml_add(ctx0, cur, layer.qkv_b);
                cur = ggml_cont(ctx0, cur);  // Ensure tensor is contiguous before reshape
                cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);

                // slice index 0 / 1 / 2 of the second dimension -> Q / K / V
                ggml_tensor * Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]);
                Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W * H, B);

                ggml_tensor * K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]);
                K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B);

                ggml_tensor * V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]);
                V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B);

                // Decomposed relative position bias: a width term and a
                // height term are computed from Q separately and summed
                // (broadcast) into one additive attention mask.
                ggml_tensor * rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W);  // [W, W, C]
                ggml_tensor * rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H);  // [H, H, C]

                ggml_tensor * qr = ggml_permute(ctx0, Q, 0, 2, 1, 3);
                qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads);

                rw = ggml_mul_mat(ctx0, rw,
                                  ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3)));  // [B*n_heads, W, H, W]
                rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3));               // [B*n_heads, H, W, W]
                rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B);
                rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B);

                rh = ggml_mul_mat(ctx0, rh, qr);  // [B*n_heads, H, W, H]
                rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B);

                ggml_tensor * mask = ggml_add(ctx0, rw, rh);  // [B*n_heads, H*W, H, W]
                mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B);
                mask = ggml_cast(ctx0, mask, GGML_TYPE_F16);

                const float scale = 1.0f / sqrtf(static_cast<float>(d_heads));

                cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale,
                                 il);  // [B, H*W, n_embd]
                cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B);
            }

            if (!is_global) {
                // local attention layer - reverse window partition
                cur = window_unpartition(ctx0, cur, w0, h0, window);
            }

            // residual 1: re-add the layer input
            cur = ggml_add(ctx0, cur, skip);

            ggml_tensor * ffn_in = cur;

            // layernorm2
            cur = build_norm(ffn_in, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);

            // ffn
            cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b,
                            hparams.ffn_op, il);

            // residual 2
            cur = ggml_add(ctx0, cur, ffn_in);
            cb(cur, "sam_layer_out", il);
        }

        // Neck: two conv + layernorm pairs. The permutes move the channel
        // axis out of ne[0] for the convolution and back for the norm.
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));

        cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
        cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));

        cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
        cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));

        // two stride-2 convolutions
        cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1);
        cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1);
        cb(cur, "sam_output", -1);

        ggml_build_forward_expand(gf, cur);
        sam_out = cur;
    }

    ggml_tensor * clip_out;
    // Building DS-OCR CLIP
    {
        // work on a copy of the SAM output, flattened to one row per patch
        ggml_tensor * inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out));
        inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

        ggml_tensor * new_pos_embd =
            ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings));

        // number of learned positions, including the one for [CLS]
        int n_pos = new_pos_embd->ne[1];

        const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
        const auto src_size = static_cast<int>(std::sqrt(n_pos - 1));

        if (tgt_size != src_size) {
            // resize the patch position embeddings to the target grid and
            // re-attach the class position
            const auto row_bytes = ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]);

            ggml_tensor * old_pos_embd =
                ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], src_size * src_size, row_bytes, 0);
            // NOTE(review): the last argument is passed as a row count here,
            // while the other views in this file pass byte offsets (e.g.
            // clip_out->nb[1]) - confirm whether it should be scaled by
            // row_bytes.
            ggml_tensor * cls_tok =
                ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], 1, row_bytes, src_size * src_size);

            new_pos_embd = ggml_interpolate(ctx0, old_pos_embd, tgt_size, tgt_size, new_pos_embd->ne[0], 1,
                                            GGML_SCALE_MODE_BICUBIC);
            new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1);
            // NOTE(review): the class position is appended last here, but the
            // class embedding is placed first in the input below - confirm
            // the intended ordering.
            new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1);
            n_pos        = tgt_size * tgt_size + 1;
        }

        // add CLS token (in front of the patches)
        inp = ggml_concat(ctx0, model.class_embedding, inp, 1);

        // for selecting learned pos embd, used by ViT
        ggml_tensor * positions        = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32);
        ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions);

        ggml_tensor * vit_out = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr);

        ggml_build_forward_expand(gf, vit_out);
        clip_out = vit_out;
    }

    const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];

    // one row per patch for the SAM features
    sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
    sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
    // start one row into the ViT output, i.e. skip the CLS row at the front
    clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);

    // channel-wise concat of both feature sets, then the linear projector
    ggml_tensor * fused = ggml_concat(ctx0, clip_out, sam_out, 0);
    fused = ggml_reshape_2d(ctx0, fused, 2 * n_embd, clip_n_patches);
    fused = ggml_cont(ctx0, fused);
    fused = ggml_mul_mat(ctx0, model.mm_fc_w, fused);
    fused = ggml_add(ctx0, fused, model.mm_fc_b);

    // the tokens are treated as a square h x w grid
    const auto h     = static_cast<int>(std::sqrt(static_cast<float>(fused->ne[1])));
    const auto w     = h;
    const auto n_dim = fused->ne[0];

    ggml_tensor * imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
    ggml_tensor * vs    = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1);  // (n_dim, 1)

    // append one newline embedding to every grid row, flatten, then add the
    // view separator at the very end
    fused = ggml_reshape_3d(ctx0, fused, n_dim, w, h);
    fused = ggml_reshape_2d(ctx0, ggml_concat(ctx0, fused, imgnl, 1), n_dim, (w + 1) * h);
    fused = ggml_concat(ctx0, fused, vs, 1);  // (n_dim, h*(w+1) + 1)

    cb(fused, "dsocr_output", -1);

    ggml_build_forward_expand(gf, fused);
    return gf;
}
|
||||
49
tools/mtmd/models/dotsocr.cpp
Normal file
49
tools/mtmd/models/dotsocr.cpp
Normal file
@@ -0,0 +1,49 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the dots.ocr vision graph: a ViT with RMS norm and multi-section
// rotary position encoding, followed by the patch merger (layernorm +
// merge of n_merge patches) and a 2-layer GELU projector.
ggml_cgraph * clip_graph_dotsocr::build() {
    const int n_pos            = n_patches;
    const int num_position_ids = n_pos * 4;  // m-rope requires 4 dim per position

    // note: similar to PaddleOCR - four equally sized rope sections
    const int section          = d_head / 4;
    int       mrope_sections[4] = { section, section, section, section };

    // position ids are a graph input
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // position hook handed to build_vit; the layer argument is not needed
    auto apply_rope = [&](ggml_tensor * cur, const clip_layer &) {
        return ggml_rope_multi(ctx0, cur, positions, nullptr,
                               d_head / 2, mrope_sections, GGML_ROPE_TYPE_VISION,
                               /* n_ctx_orig  */ 32768,
                               /* freq_base   */ 10000,
                               /* freq_scale  */ 1,
                               /* ext_factor  */ 0,
                               /* attn_factor */ 1,
                               /* beta_fast   */ 32,
                               /* beta_slow   */ 1);
    };

    ggml_tensor * cur = build_vit(build_inp(), n_patches, NORM_TYPE_RMS, hparams.ffn_op, nullptr, apply_rope);

    cb(cur, "vit_out", -1);

    // dots.ocr patch merger + projector
    {
        GGML_ASSERT(hparams.n_merge > 0);

        cur = build_norm(cur, model.mm_input_norm_w, model.mm_input_norm_b, NORM_TYPE_NORMAL, 1e-6, -1);
        cur = build_patch_merge_permute(cur, hparams.n_merge);
        cb(cur, "after_patch_merger", -1);

        // up projection -> GELU -> down projection, no gate branch
        cur = build_ffn(cur,
                        model.mm_0_w, model.mm_0_b,
                        nullptr, nullptr,
                        model.mm_2_w, model.mm_2_b,
                        FFN_GELU_ERF, -1);  // nn.GELU() defaults to exact erf-based GELU
        cb(cur, "after_projector", -1);
    }

    ggml_build_forward_expand(gf, cur);

    return gf;
}
|
||||
288
tools/mtmd/models/gemma4a.cpp
Normal file
288
tools/mtmd/models/gemma4a.cpp
Normal file
@@ -0,0 +1,288 @@
|
||||
/**
|
||||
* Gemma 4 Audio Conformer Encoder (clip_graph_gemma4a)
|
||||
*
|
||||
* Architecture: Conformer with dual half-step FFN, full self-attention
|
||||
* with sinusoidal RPE, depthwise light conv, and output projection.
|
||||
*/
|
||||
|
||||
#include "models.h"
|
||||
#include <cmath>
|
||||
|
||||
ggml_cgraph * clip_graph_gemma4a::build() {
|
||||
const float res_weight = 0.5f;
|
||||
const float norm_eps = 1e-6f;
|
||||
|
||||
// 1. Input
|
||||
ggml_tensor * inp = build_inp_raw(1);
|
||||
auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
|
||||
|
||||
// 2. Subsampling Conv2D (symmetric padding=1, matching PyTorch)
|
||||
{
|
||||
for (int i = 0; i < 2; i++) {
|
||||
cur = ggml_conv_2d(ctx0, model.sscp_conv_w[i], cur, 2, 2, 1, 1, 1, 1);
|
||||
if (model.sscp_conv_b[i]) {
|
||||
cur = ggml_add(ctx0, cur, model.sscp_conv_b[i]);
|
||||
}
|
||||
// nn.LayerNorm(channels): permute ch to ne[0], normalize, permute back
|
||||
if (model.sscp_norm_w[i]) {
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||
cur = ggml_norm(ctx0, cur, norm_eps);
|
||||
cur = ggml_mul(ctx0, cur, model.sscp_norm_w[i]);
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
|
||||
}
|
||||
cur = ggml_relu(ctx0, cur);
|
||||
}
|
||||
// Flatten [freq, time, ch, 1] -> [ch*freq, time]
|
||||
cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
|
||||
if (model.sscp_inp_proj_w) {
|
||||
cur = build_mm(model.sscp_inp_proj_w, cur);
|
||||
if (model.sscp_inp_proj_b) {
|
||||
cur = ggml_add(ctx0, cur, model.sscp_inp_proj_b);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t n_pos = cur->ne[1];
|
||||
|
||||
// Chunked local attention parameters
|
||||
const int64_t C = 12; // chunk_size
|
||||
const int64_t P = 12; // max_past_horizon (context_left - 1)
|
||||
const int64_t S = C + P; // context_size = 24
|
||||
const int64_t R = P + 1; // RPE positions = 13
|
||||
const int64_t B = (n_pos + C - 1) / C; // num_blocks
|
||||
const int64_t Np = B * C; // padded sequence length
|
||||
const int64_t pad_seq = Np - n_pos;
|
||||
|
||||
// Input tensors: blocked RPE and blocked attention mask
|
||||
ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_head * d_head, R);
|
||||
ggml_set_name(pos_emb, "pos_emb");
|
||||
ggml_set_input(pos_emb);
|
||||
|
||||
ggml_tensor * kq_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, S, C, B);
|
||||
ggml_set_name(kq_mask, "kq_mask");
|
||||
ggml_set_input(kq_mask);
|
||||
|
||||
// 3. Conformer Blocks
|
||||
for (int il = 0; il < hparams.n_layer; il++) {
|
||||
const auto & layer = model.layers[il];
|
||||
auto * residual = cur;
|
||||
|
||||
// FFN 1 (half-step)
|
||||
if (layer.ff_norm_w && layer.ff_up_w && layer.ff_down_w) {
|
||||
cur = build_norm(cur, layer.ff_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_w, nullptr, nullptr, nullptr,
|
||||
layer.ff_down_w, nullptr, FFN_SILU, il);
|
||||
if (layer.ff_post_norm_w) {
|
||||
cur = build_norm(cur, layer.ff_post_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
}
|
||||
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, res_weight));
|
||||
}
|
||||
|
||||
// Chunked local self-attention with RPE
|
||||
if (layer.q_w && layer.k_w && layer.v_w && layer.o_w) {
|
||||
const float q_scale = (1.0f / sqrtf((float)d_head)) / logf(2.0f);
|
||||
const float k_scale = logf(1.0f + expf(1.0f)) / logf(2.0f);
|
||||
const float softcap = 50.0f;
|
||||
|
||||
ggml_tensor * attn_norm_w = layer.attn_pre_norm_w ? layer.attn_pre_norm_w : layer.ln_1_w;
|
||||
cur = attn_norm_w
|
||||
? build_norm(residual, attn_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il)
|
||||
: residual;
|
||||
|
||||
ggml_tensor * Qcur = build_mm(layer.q_w, cur);
|
||||
ggml_tensor * Kcur = build_mm(layer.k_w, cur);
|
||||
ggml_tensor * Vcur = build_mm(layer.v_w, cur);
|
||||
|
||||
// [n_embd, n_pos] -> [D, H, N]
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
|
||||
// Q/K scaling
|
||||
Qcur = ggml_scale(ctx0, Qcur, q_scale);
|
||||
if (layer.per_dim_scale_w) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, ggml_reshape_3d(ctx0, layer.per_dim_scale_w, d_head, 1, 1));
|
||||
}
|
||||
Kcur = ggml_scale(ctx0, Kcur, k_scale);
|
||||
if (layer.per_dim_k_scale_w) {
|
||||
Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.per_dim_k_scale_w, d_head, 1, 1));
|
||||
}
|
||||
|
||||
// Q blocking: [D, H, N] -> pad to Np -> reshape [D, H, C, B]
|
||||
// ggml permute: ne[ax_i] = src->ne[i], so (0,3,1,2) sends H->3, C->1, B->2
|
||||
Qcur = ggml_pad(ctx0, Qcur, 0, 0, pad_seq, 0); // [D, H, Np]
|
||||
Qcur = ggml_reshape_4d(ctx0, Qcur, d_head, n_head, C, B); // [D, H, C, B]
|
||||
Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 3, 1, 2)); // [D, C, B, H]
|
||||
|
||||
// K/V block context extraction via overlapping view:
|
||||
// Pad to S*B elements, roll right by P to create left-padding,
|
||||
// then view with stride C in the block dimension (overlapping windows).
|
||||
auto extract_blocks = [&](ggml_tensor * t) -> ggml_tensor * {
|
||||
// [D, H, N] -> pad to S*B -> roll right by P -> cont (materialize)
|
||||
const int64_t pad_kv = S * B - n_pos;
|
||||
t = ggml_pad(ctx0, t, 0, 0, pad_kv, 0); // [D, H, S*B]
|
||||
t = ggml_roll(ctx0, t, 0, 0, P, 0); // left-pad by P
|
||||
t = ggml_cont(ctx0, t); // materialize roll (removes view offset)
|
||||
// Overlapping view: stride for B dim is C positions, not S
|
||||
// ne = [D, H, S, B], data_size = D*H*S*B*sizeof = source_nbytes (exact fit)
|
||||
// nb1=D*sizeof, nb2=D*H*sizeof, nb3=C*D*H*sizeof (overlap: C < S)
|
||||
t = ggml_view_4d(ctx0, t, d_head, n_head, S, B,
|
||||
t->nb[1], t->nb[2], C * t->nb[2], 0);
|
||||
t = ggml_cont(ctx0, t); // materialize overlapping windows
|
||||
return t;
|
||||
};
|
||||
|
||||
ggml_tensor * Kblk = extract_blocks(Kcur);
|
||||
// [D, H, S, B] -> [D, S, B, H] via permute(0,3,1,2)
|
||||
Kblk = ggml_cont(ctx0, ggml_permute(ctx0, Kblk, 0, 3, 1, 2));
|
||||
|
||||
ggml_tensor * Vblk = extract_blocks(Vcur);
|
||||
// [D, H, S, B] -> [S, D, B, H] via permute(1,3,0,2)
|
||||
Vblk = ggml_cont(ctx0, ggml_permute(ctx0, Vblk, 1, 3, 0, 2));
|
||||
|
||||
// Content attention: Q @ K^T
|
||||
// Kblk=[D,S,B,H], Qcur=[D,C,B,H] -> mul_mat contracts on D -> [S,C,B,H]
|
||||
ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Kblk, Qcur);
|
||||
|
||||
// Relative position attention
|
||||
if (layer.attn_k_rel_w) {
|
||||
// RPE: [n_embd, R] -> project -> [D, H, R] -> [D, R, H]
|
||||
auto * p = ggml_mul_mat(ctx0, layer.attn_k_rel_w, pos_emb);
|
||||
p = ggml_reshape_3d(ctx0, p, d_head, n_head, R);
|
||||
p = ggml_cont(ctx0, ggml_permute(ctx0, p, 0, 2, 1, 3)); // [D, R, H]
|
||||
|
||||
// Q_flat @ RPE^T: [D, C*B, H] @ [D, R, H] -> [R, C*B, H]
|
||||
auto * Q_flat = ggml_reshape_3d(ctx0, Qcur, d_head, C * B, n_head);
|
||||
auto * matrix_bd = ggml_mul_mat(ctx0, p, Q_flat); // [R, C*B, H]
|
||||
matrix_bd = ggml_reshape_4d(ctx0, matrix_bd, R, C, B, n_head); // [R, C, B, H]
|
||||
|
||||
// Blocked relative shift (appendix B of Transformer-XL)
|
||||
{
|
||||
matrix_bd = ggml_pad(ctx0, matrix_bd, S + 1 - R, 0, 0, 0); // [S+1, C, B, H]
|
||||
matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, (S + 1) * C, B, n_head);
|
||||
matrix_bd = ggml_view_3d(ctx0, matrix_bd,
|
||||
C * S, B, n_head,
|
||||
matrix_bd->nb[1], matrix_bd->nb[2], 0);
|
||||
matrix_bd = ggml_cont(ctx0, matrix_bd); // [C*S, B, H]
|
||||
matrix_bd = ggml_reshape_4d(ctx0, matrix_bd, S, C, B, n_head); // [S, C, B, H]
|
||||
}
|
||||
|
||||
matrix_ac = ggml_add(ctx0, matrix_ac, matrix_bd);
|
||||
}
|
||||
|
||||
auto * scores = matrix_ac; // [S, C, B, H]
|
||||
|
||||
// Softcap
|
||||
scores = ggml_scale(ctx0, scores, 1.0f / softcap);
|
||||
scores = ggml_tanh(ctx0, scores);
|
||||
scores = ggml_scale(ctx0, scores, softcap);
|
||||
|
||||
// Blocked attention mask: [S, C, B] broadcasts over H
|
||||
scores = ggml_add(ctx0, scores, kq_mask);
|
||||
|
||||
ggml_tensor * attn = ggml_soft_max(ctx0, scores);
|
||||
|
||||
// attn @ V: [S,C,B,H] @ [S,D,B,H] -> [D,C,B,H]
|
||||
ggml_tensor * x = ggml_mul_mat(ctx0, Vblk, attn);
|
||||
|
||||
// [D,C,B,H] -> [D,H,C,B] via permute(0,2,3,1) -> flatten -> trim
|
||||
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 3, 1));
|
||||
x = ggml_cont_2d(ctx0, x, d_head * n_head, C * B);
|
||||
if (pad_seq > 0) {
|
||||
x = ggml_view_2d(ctx0, x, d_head * n_head, n_pos, x->nb[1], 0);
|
||||
x = ggml_cont(ctx0, x);
|
||||
}
|
||||
|
||||
x = build_mm(layer.o_w, x);
|
||||
if (layer.o_b) { x = ggml_add(ctx0, x, layer.o_b); }
|
||||
|
||||
if (layer.attn_post_norm_w) {
|
||||
x = build_norm(x, layer.attn_post_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
}
|
||||
residual = ggml_add(ctx0, residual, x);
|
||||
}
|
||||
|
||||
// Convolution Module
|
||||
if (layer.norm_conv_w && layer.conv_pw1_w && layer.conv_dw_w && layer.conv_pw2_w) {
|
||||
cur = build_norm(residual, layer.norm_conv_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
auto * x = build_mm(layer.conv_pw1_w, cur);
|
||||
|
||||
// GLU
|
||||
{
|
||||
int64_t d = x->ne[0] / 2;
|
||||
ggml_tensor * gate = ggml_sigmoid(ctx0,
|
||||
ggml_cont(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0])));
|
||||
x = ggml_mul(ctx0,
|
||||
ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
|
||||
x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
|
||||
}
|
||||
|
||||
// Causal depthwise Conv1D via ggml_ssm_conv (pad+roll for left-only padding).
|
||||
x = ggml_pad(ctx0, x, 4, 0, 0, 0);
|
||||
x = ggml_roll(ctx0, x, 4, 0, 0, 0);
|
||||
x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
|
||||
if (layer.conv_dw_b) {
|
||||
x = ggml_add(ctx0, x, layer.conv_dw_b);
|
||||
}
|
||||
|
||||
if (layer.conv_norm_w) {
|
||||
x = ggml_rms_norm(ctx0, x, norm_eps);
|
||||
x = ggml_mul(ctx0, x, layer.conv_norm_w);
|
||||
}
|
||||
x = ggml_silu(ctx0, x);
|
||||
x = build_mm(layer.conv_pw2_w, x);
|
||||
residual = ggml_add(ctx0, residual, x);
|
||||
}
|
||||
|
||||
// FFN 2 (half-step)
|
||||
if (layer.ff_norm_1_w && layer.ff_up_1_w && layer.ff_down_1_w) {
|
||||
cur = build_norm(residual, layer.ff_norm_1_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_1_w, nullptr, nullptr, nullptr,
|
||||
layer.ff_down_1_w, nullptr, FFN_SILU, il);
|
||||
if (layer.ff_post_norm_1_w) {
|
||||
cur = build_norm(cur, layer.ff_post_norm_1_w, nullptr, NORM_TYPE_RMS, norm_eps, il);
|
||||
}
|
||||
residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, res_weight));
|
||||
}
|
||||
|
||||
// Layer output norm
|
||||
cur = layer.ln_2_w
|
||||
? build_norm(residual, layer.ln_2_w, nullptr, NORM_TYPE_RMS, norm_eps, il)
|
||||
: residual;
|
||||
|
||||
}
|
||||
|
||||
// 4. Output Projection
|
||||
if (model.audio_out_proj_w) {
|
||||
cur = build_mm(model.audio_out_proj_w, cur);
|
||||
if (model.audio_out_proj_b) {
|
||||
cur = ggml_add(ctx0, cur, model.audio_out_proj_b);
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Audio Multimodal Embedder
|
||||
cur = ggml_rms_norm(ctx0, cur, norm_eps);
|
||||
if (model.mm_soft_emb_norm_w) {
|
||||
cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
|
||||
}
|
||||
if (model.mm_input_proj_w) {
|
||||
cur = build_mm(model.mm_input_proj_w, cur);
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
return gf;
|
||||
}
|
||||
|
||||
// Matrix product `w * x` with optional activation clamping.
// When model.clamp_info_map holds an entry keyed by the weight's name, the input is
// clamped to [inp_min, inp_max] before the product and the product is clamped to
// [out_min, out_max]. Otherwise this is a plain ggml_mul_mat.
ggml_tensor * clip_graph_gemma4a::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    const auto entry = model.clamp_info_map.find(w->name);
    const bool has_limits = entry != model.clamp_info_map.end();

    if (has_limits) {
        const auto & limits = entry->second;
        ggml_tensor * x_clamped = ggml_clamp(ctx0, x, limits.inp_min, limits.inp_max);
        ggml_tensor * product   = ggml_mul_mat(ctx0, w, x_clamped);
        return ggml_clamp(ctx0, product, limits.out_min, limits.out_max);
    }

    // no clamp range registered for this weight
    return ggml_mul_mat(ctx0, w, x);
}
|
||||
151
tools/mtmd/models/gemma4v.cpp
Normal file
151
tools/mtmd/models/gemma4v.cpp
Normal file
@@ -0,0 +1,151 @@
|
||||
#include "models.h"
|
||||
#include <cmath>
|
||||
|
||||
// Builds the Gemma4 vision graph: pixel rescale -> patch conv -> x/y lookup-table position
// embeddings -> ViT with 2D NeoX-ordered RoPE -> average pooling -> optional
// (x - std_bias) * std_scale -> projection -> RMS norm.
ggml_cgraph * clip_graph_gemma4v::build() {
    ggml_tensor * pixels = build_inp_raw();

    // rescale: 2 * (v - 0.5), written as v * 2 - 1
    pixels = ggml_scale_bias(ctx0, pixels, 2.0f, -1.0f);
    ggml_set_name(pixels, "inp_raw_scaled");

    // patch embedding, flattened and transposed to [n_embd, n_patches]; no patch bias is added
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, pixels, patch_size, patch_size, 0, 0, 1, 1);
    inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
    inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
    ggml_set_name(inp, "inp");

    // graph inputs: one x index and one y index per patch
    ggml_tensor * pos_x = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_x, "pos_x");
    ggml_set_input(pos_x);

    ggml_tensor * pos_y = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_y, "pos_y");
    ggml_set_input(pos_y);

    // learned position embeddings: two lookup tables stored back to back in
    // model.position_embeddings (x table first, then y table)
    {
        const int64_t table_rows = model.position_embeddings->ne[1];
        const size_t  row_bytes  = ggml_row_size(model.position_embeddings->type, n_embd);

        ggml_tensor * table_x = ggml_view_2d(ctx0, model.position_embeddings,
                n_embd, table_rows, row_bytes, 0);
        ggml_tensor * table_y = ggml_view_2d(ctx0, model.position_embeddings,
                n_embd, table_rows, row_bytes, table_rows * row_bytes);

        // row gather -> [n_embd, n_patches]
        ggml_tensor * embd_x = ggml_get_rows(ctx0, table_x, pos_x);
        ggml_tensor * embd_y = ggml_get_rows(ctx0, table_y, pos_y);

        inp = ggml_add(ctx0, inp, embd_x);
        inp = ggml_add(ctx0, inp, embd_y);
        cb(inp, "pos_embd", -1);
    }

    // 2D RoPE, similar to build_rope_2d but with NeoX ordering:
    // the first half of each head is rotated by pos_x, the second half by pos_y
    auto add_pos = [&](ggml_tensor * t, const clip_layer &) {
        const int64_t n_dim  = t->ne[0];
        const int64_t heads  = t->ne[1];
        const int64_t tokens = t->ne[2];
        const int64_t n_half = n_dim/2;

        // rotate the half-width slice of `t` that starts `offs` bytes into each row
        auto rope_half = [&](size_t offs, ggml_tensor * pos) {
            ggml_tensor * half = ggml_view_3d(ctx0, t,
                    n_half, heads, tokens,
                    t->nb[1],
                    t->nb[2],
                    offs);
            return ggml_rope_ext(
                    ctx0,
                    half,
                    pos,        // positions
                    nullptr,    // freq factors
                    n_half,     // n_dims
                    GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta,
                    1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
        };

        ggml_tensor * rot_x = rope_half(0, pos_x);
        ggml_tensor * rot_y = rope_half(n_half * ggml_element_size(t), pos_y);

        return ggml_concat(ctx0, rot_x, rot_y, 0);
    };

    kq_scale = 1.0f;
    ggml_tensor * cur = build_vit(
            inp, n_patches,
            NORM_TYPE_RMS,
            hparams.ffn_op,
            nullptr, // position embeddings were already added above
            add_pos);

    // Gemma4VisionPooler
    {
        const int pool = hparams.n_merge;
        GGML_ASSERT(pool > 0);

        // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
        cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
                pool, pool, pool, pool, 0, 0);

        const int pooled_x = n_patches_x / pool;
        const int pooled_y = n_patches_y / pool;

        // [pooled_x, pooled_y, n_embd, 1] -> [n_embd, pooled_x * pooled_y]
        cur = ggml_reshape_3d(ctx0, cur, pooled_x * pooled_y, n_embd, 1);
        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
        cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd));
        cb(cur, "pooled", -1);
    }

    // hidden_states = (hidden_states - std_bias) * std_scale, only when both tensors exist
    if (model.std_bias && model.std_scale) {
        cur = ggml_sub(ctx0, cur, model.std_bias);
        cur = ggml_mul(ctx0, cur, model.std_scale);
        cb(cur, "std_scaled", -1);
    }

    // Gemma4MultimodalEmbedder
    cur = build_mm(model.mm_input_proj_w, cur);
    cb(cur, "projected", -1);

    // embedding_post_projection_norm (RMS norm without a weight)
    cur = ggml_rms_norm(ctx0, cur, hparams.eps);
    cb(cur, "projected_normed", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
|
||||
// Gemma4ClippableLinear: matrix product `w * x` whose input and output are clamped
// when model.clamp_info_map has an entry keyed by the weight's name.
// Weights without an entry use a plain ggml_mul_mat.
ggml_tensor * clip_graph_gemma4v::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    const auto found = model.clamp_info_map.find(w->name);

    // guard clause: no clamp range registered for this weight
    if (found == model.clamp_info_map.end()) {
        return ggml_mul_mat(ctx0, w, x);
    }

    const auto & range = found->second;
    ggml_tensor * x_clamped = ggml_clamp(ctx0, x, range.inp_min, range.inp_max);
    ggml_tensor * y         = ggml_mul_mat(ctx0, w, x_clamped);
    return ggml_clamp(ctx0, y, range.out_min, range.out_max);
}
|
||||
122
tools/mtmd/models/glm4v.cpp
Normal file
122
tools/mtmd/models/glm4v.cpp
Normal file
@@ -0,0 +1,122 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the GLM4V vision graph: dual patch conv -> 2x2 block regrouping -> patch bias ->
// post-conv norm -> ViT with multi-section vision RoPE (plus optional resized learned
// position embeddings) -> conv patch merger -> FC projector -> FFN projector.
ggml_cgraph * clip_graph_glm4v::build() {
    GGML_ASSERT(model.patch_bias != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int       n_batch  = 1;
    const norm_type vit_norm = NORM_TYPE_RMS;

    ggml_tensor * pixels = build_inp_raw();
    ggml_tensor * inp    = ggml_conv_2d(ctx0, model.patch_embeddings_0, pixels, patch_size, patch_size, 0, 0, 1, 1);

    // four equal rope sections, each a quarter of the head dimension
    const int quarter = d_head/4;
    int rope_sections[4] = {quarter, quarter, quarter, quarter};

    // graph input: four position values per patch
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // the regrouping below halves both patch-grid dimensions
    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension, then regroup the patches
    {
        ggml_tensor * inp_second = ggml_conv_2d(ctx0, model.patch_embeddings_1, pixels, patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_second);

        // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);
        inp = ggml_cont_4d(ctx0, inp,
                n_embd * 2, n_patches_x / 2, n_patches_y, n_batch);
        inp = ggml_reshape_4d(ctx0, inp,
                n_embd * 2, n_patches_x / 2, 2, n_batch * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(ctx0, inp,
                n_embd, n_patches_x * n_patches_y, n_batch);
    }

    // patch bias
    inp = ggml_add(ctx0, inp, model.patch_bias);
    cb(inp, "patch_bias", -1);

    // post-conv norm
    inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, vit_norm, eps, -1);

    // learned position embeddings are optional (GLM-OCR does not have them);
    // when present they are resized and regrouped exactly like the patches above
    ggml_tensor * pos_learned = nullptr;
    if (model.position_embeddings != nullptr) {
        pos_learned = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
        pos_learned = ggml_cont_4d(ctx0, pos_learned,
                n_embd * 2, n_patches_x / 2, n_patches_y, n_batch);
        pos_learned = ggml_reshape_4d(ctx0, pos_learned,
                n_embd * 2, n_patches_x / 2, 2, n_batch * (n_patches_y / 2));
        pos_learned = ggml_permute(ctx0, pos_learned, 0, 2, 1, 3);
        pos_learned = ggml_cont_3d(ctx0, pos_learned,
                n_embd, n_patches_x * n_patches_y, n_batch);
        cb(pos_learned, "learned_pos_embd", -1);
    }

    // rotary embedding applied inside the ViT attention
    auto add_pos = [&](ggml_tensor * t, const clip_layer &) {
        return ggml_rope_multi(
                ctx0, t, positions, nullptr,
                d_head/2, rope_sections, GGML_ROPE_TYPE_VISION,
                32768, hparams.rope_theta, 1, 0, 1, 32, 1);
    };

    ggml_tensor * cur = build_vit(
            inp, n_patches,
            vit_norm,
            hparams.ffn_op,
            pos_learned,
            add_pos);

    cb(cur, "vit_out", -1);

    // GLM4V projector
    // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130

    // stage 1: patch merger (downsample) implemented as a conv over n_merge x n_merge groups
    {
        const int merge = hparams.n_merge;
        GGML_ASSERT(merge > 0);

        const int n_out = n_patches / merge / merge;
        cur = ggml_reshape_4d(ctx0, cur, n_embd, merge, merge, n_out);
        // -> [merge, merge, n_embd, n_out]
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
        cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, merge, merge, 0, 0, 1, 1);
        // -> [n_embd_out, n_out]
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_out);

        cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
    }

    // stage 2: FC projector followed by the default LayerNorm (post_projection_norm) and GELU
    {
        cur = build_mm(model.mm_fc_w, cur);
        cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
        cur = ggml_gelu_erf(ctx0, cur);
        cb(cur, "after_fc_proj", -1);
    }

    // stage 3: gated FFN projector
    {
        cur = build_ffn(cur,
                model.mm_ffn_up_w,   model.mm_ffn_up_b,
                model.mm_ffn_gate_w, model.mm_ffn_gate_b,
                model.mm_ffn_down_w, model.mm_ffn_down_b,
                hparams.ffn_op, -1);
        cb(cur, "after_ffn_proj", -1);
    }

    // finalize the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
|
||||
275
tools/mtmd/models/granite-speech.cpp
Normal file
275
tools/mtmd/models/granite-speech.cpp
Normal file
@@ -0,0 +1,275 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the Granite Speech audio graph: input projection -> conformer layers
// (half-step FFN, block-local attention with Shaw relative position embeddings,
// conv module, half-step FFN, output norm, CTC branch on one layer) -> windowed
// QFormer projector.
ggml_cgraph * clip_graph_granite_speech::build() {
    const int n_frames    = img.nx;
    const int chunk       = hparams.audio_chunk_size;
    const int ctc_layer   = n_layer / 2;
    const int conv_kernel = hparams.audio_conv_kernel_size;
    const int conv_pad    = conv_kernel / 2;

    // attention runs on fixed-size chunks; the sequence is padded up to a whole number of them
    const int n_chunks      = (n_frames + chunk - 1) / chunk;
    const int padded_frames = n_chunks * chunk;
    const int tail          = n_frames % chunk;

    // graph input: indices used to gather rows of the relative position table
    ggml_tensor * rel_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, chunk * chunk);
    ggml_set_name(rel_dists, "attn_dists");
    ggml_set_input(rel_dists);

    // graph input: attention mask, created only when the last chunk is partial
    ggml_tensor * mask = nullptr;
    if (tail > 0) {
        mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32,
                chunk, chunk, 1, n_chunks);
        ggml_set_name(mask, "attn_mask");
        ggml_set_input(mask);
    }

    ggml_tensor * raw = build_inp_raw(1);
    ggml_tensor * cur = ggml_cont(ctx0, ggml_transpose(ctx0, raw));
    cb(cur, "inp_transposed", -1);

    cur = build_mm(model.inp_proj_w, cur);
    cur = ggml_add(ctx0, cur, model.inp_proj_b);
    cb(cur, "inp_linear", -1);

    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];
        ggml_tensor * residual = cur;

        // first feed-forward, added with weight 0.5
        {
            ggml_tensor * h = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b,
                    NORM_TYPE_NORMAL, eps, il);
            cb(h, "ffn1_norm", il);

            h = build_ffn(h,
                    layer.ff_up_w, layer.ff_up_b,
                    nullptr, nullptr,
                    layer.ff_down_w, layer.ff_down_b,
                    FFN_SILU, il);
            cb(h, "ffn1_out", il);

            residual = ggml_add(ctx0, residual, ggml_scale(ctx0, h, 0.5f));
            cb(residual, "ffn1_residual", il);
        }

        // Attention is assembled by hand instead of via build_attn: the Shaw RPE term
        // mul_mat(pos_emb, Q) has to be added between the KQ product and the softmax,
        // which build_attn does not support.
        {
            ggml_tensor * h = build_norm(residual, layer.ln_1_w, layer.ln_1_b,
                    NORM_TYPE_NORMAL, eps, il);
            cb(h, "attn_norm", il);

            if (n_frames < padded_frames) {
                h = ggml_pad(ctx0, h, 0, padded_frames - n_frames, 0, 0);
            }

            ggml_tensor * q = build_mm(layer.q_w, h);
            ggml_tensor * k = build_mm(layer.k_w, h);
            ggml_tensor * v = build_mm(layer.v_w, h);

            // split into heads and chunks: [d_head, n_head, chunk, n_chunks]
            q = ggml_reshape_4d(ctx0, q, d_head, n_head, chunk, n_chunks);
            k = ggml_reshape_4d(ctx0, k, d_head, n_head, chunk, n_chunks);
            v = ggml_reshape_4d(ctx0, v, d_head, n_head, chunk, n_chunks);

            // content term: per-chunk K^T Q
            ggml_tensor * q_heads = ggml_permute(ctx0, q, 0, 2, 1, 3);
            ggml_tensor * k_heads = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));

            ggml_tensor * content = ggml_mul_mat(ctx0, k_heads, q_heads);

            // Shaw RPE: rel_pos has ne[2]=1, which broadcasts against the n_chunks
            // dimension of the permuted Q in mul_mat
            ggml_tensor * rel_pos = ggml_get_rows(ctx0, layer.attn_rel_pos_emb, rel_dists);
            rel_pos = ggml_reshape_3d(ctx0, rel_pos, d_head, chunk, chunk);
            rel_pos = ggml_reshape_4d(ctx0, rel_pos, d_head, chunk, 1, chunk);

            ggml_tensor * q_rel    = ggml_permute(ctx0, q, 0, 1, 3, 2);
            ggml_tensor * position = ggml_mul_mat(ctx0, rel_pos, q_rel);
            position = ggml_cont(ctx0, ggml_permute(ctx0, position, 0, 2, 3, 1));

            ggml_tensor * logits = ggml_add(ctx0, content, position);
            ggml_tensor * probs  = ggml_soft_max_ext(ctx0, logits, mask,
                    kq_scale, 0.0f);

            ggml_tensor * v_heads = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));
            ggml_tensor * ctx_out = ggml_mul_mat(ctx0, v_heads, probs);

            // merge heads back: -> [n_embd, padded_frames]
            ctx_out = ggml_permute(ctx0, ctx_out, 0, 2, 1, 3);
            ctx_out = ggml_cont_2d(ctx0, ctx_out, n_embd, padded_frames);

            // drop the frames that were added as padding
            if (n_frames < padded_frames) {
                ctx_out = ggml_view_2d(ctx0, ctx_out,
                        n_embd, n_frames, ctx_out->nb[1], 0);
            }

            cur = build_mm(layer.o_w, ctx_out);
            cur = ggml_add(ctx0, cur, layer.o_b);
            cb(cur, "attn_out", il);
        }

        residual = ggml_add(ctx0, residual, cur);

        // convolution module
        {
            cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b,
                    NORM_TYPE_NORMAL, eps, il);
            cb(cur, "conv_norm", il);

            ggml_tensor * y = build_mm(layer.conv_pw1_w, cur);
            y = ggml_add(ctx0, y, layer.conv_pw1_b);
            cb(y, "conv_pw1", il);

            // GLU done manually (ggml has no fused op): first half * sigmoid(second half)
            {
                const int64_t half = y->ne[0] / 2;
                ggml_tensor * gate = ggml_sigmoid(ctx0,
                        ggml_view_2d(ctx0, y, half, y->ne[1], y->nb[1], half * y->nb[0]));
                ggml_tensor * value = ggml_view_2d(ctx0, y, half, y->ne[1], y->nb[1], 0);
                y = ggml_mul(ctx0, value, gate);
                y = ggml_cont(ctx0, ggml_transpose(ctx0, y));
            }
            cb(y, "conv_glu", il);

            // pad + roll puts conv_pad zeros in front, the second pad adds conv_pad behind
            y = ggml_pad(ctx0, y, conv_pad, 0, 0, 0);
            y = ggml_roll(ctx0, y, conv_pad, 0, 0, 0);
            y = ggml_pad(ctx0, y, conv_pad, 0, 0, 0);
            y = ggml_ssm_conv(ctx0, y, layer.conv_dw_w);
            cb(y, "conv_dw", il);

            // folded batch norm (scale + shift), then SiLU
            y = ggml_add(ctx0, ggml_mul(ctx0, y, layer.conv_norm_w), layer.conv_norm_b);
            y = ggml_silu(ctx0, y);
            cb(y, "conv_bn_silu", il);

            y = build_mm(layer.conv_pw2_w, y);
            y = ggml_add(ctx0, y, layer.conv_pw2_b);
            cb(y, "conv_pw2", il);

            cur = y;
        }

        residual = ggml_add(ctx0, residual, cur);

        // second feed-forward, added with weight 0.5
        {
            ggml_tensor * h = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b,
                    NORM_TYPE_NORMAL, eps, il);
            cb(h, "ffn2_norm", il);

            h = build_ffn(h,
                    layer.ff_up_1_w, layer.ff_up_1_b,
                    nullptr, nullptr,
                    layer.ff_down_1_w, layer.ff_down_1_b,
                    FFN_SILU, il);
            cb(h, "ffn2_out", il);

            residual = ggml_add(ctx0, residual, ggml_scale(ctx0, h, 0.5f));
        }

        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b,
                NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_out", il);

        // CTC branch: applied once, after the layer with il + 1 == n_layer / 2
        if (il + 1 == ctc_layer) {
            ggml_tensor * ctc = build_mm(model.ctc_out_w, cur);
            ctc = ggml_add(ctx0, ctc, model.ctc_out_b);
            ctc = ggml_soft_max(ctx0, ctc);
            ctc = build_mm(model.ctc_out_mid_w, ctc);
            ctc = ggml_add(ctx0, ctc, model.ctc_out_mid_b);
            cur = ggml_add(ctx0, cur, ctc);
            cb(cur, "ctc_branch", il);
        }
    }

    cb(cur, "encoder_out", -1);

    // QFormer projector: learned queries attend to fixed-size windows of encoder output
    {
        const int   window    = hparams.audio_proj_window_size;
        const int   n_query   = window / hparams.audio_proj_downsample_rate;
        const int   qf_n_head = hparams.audio_proj_head_count;
        const int   qf_d_head = n_embd / qf_n_head;
        const float qf_scale  = 1.0f / sqrtf((float)qf_d_head);
        const float qf_eps    = 1e-12f;
        const int   n_windows = (n_frames + window - 1) / window;
        const int   padded    = n_windows * window;

        if (n_frames < padded) {
            cur = ggml_pad(ctx0, cur, 0, padded - n_frames, 0, 0);
        }

        ggml_tensor * windows = ggml_reshape_3d(ctx0, cur, n_embd, window, n_windows);

        // normalize the learned queries, then replicate them once per window
        ggml_tensor * queries = build_norm(model.qf_proj_query,
                model.qf_proj_norm_w, model.qf_proj_norm_b,
                NORM_TYPE_NORMAL, qf_eps, -1);
        {
            ggml_tensor * q_single = ggml_reshape_3d(ctx0, queries, n_embd, n_query, 1);
            // shape-only template for ggml_repeat
            ggml_tensor * q_target = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32,
                    n_embd, n_query, n_windows);
            queries = ggml_repeat(ctx0, q_single, q_target);
        }

        for (int il = 0; il < (int)model.qf_proj_layers.size(); il++) {
            const auto & pl = model.qf_proj_layers[il];

            // self-attention among the queries of each window
            {
                ggml_tensor * q = ggml_add(ctx0, build_mm(pl.q_w, queries), pl.q_b);
                ggml_tensor * k = ggml_add(ctx0, build_mm(pl.k_w, queries), pl.k_b);
                ggml_tensor * v = ggml_add(ctx0, build_mm(pl.v_w, queries), pl.v_b);

                q = ggml_reshape_4d(ctx0, q, qf_d_head, qf_n_head, n_query, n_windows);
                k = ggml_reshape_4d(ctx0, k, qf_d_head, qf_n_head, n_query, n_windows);
                v = ggml_reshape_4d(ctx0, v, qf_d_head, qf_n_head, n_query, n_windows);

                ggml_tensor * self_out = build_attn(pl.o_w, pl.o_b,
                        q, k, v, nullptr, qf_scale, il);
                self_out = ggml_reshape_3d(ctx0, self_out, n_embd, n_query, n_windows);

                queries = build_norm(ggml_add(ctx0, self_out, queries),
                        pl.ln_1_w, pl.ln_1_b,
                        NORM_TYPE_NORMAL, qf_eps, il);
            }

            // cross-attention: queries attend to the encoder frames of their window
            {
                ggml_tensor * q = ggml_add(ctx0, build_mm(pl.cross_attn_q_w, queries), pl.cross_attn_q_b);
                ggml_tensor * k = ggml_add(ctx0, build_mm(pl.cross_attn_k_w, windows), pl.cross_attn_k_b);
                ggml_tensor * v = ggml_add(ctx0, build_mm(pl.cross_attn_v_w, windows), pl.cross_attn_v_b);

                q = ggml_reshape_4d(ctx0, q, qf_d_head, qf_n_head, n_query, n_windows);
                k = ggml_reshape_4d(ctx0, k, qf_d_head, qf_n_head, window, n_windows);
                v = ggml_reshape_4d(ctx0, v, qf_d_head, qf_n_head, window, n_windows);

                ggml_tensor * cross_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b,
                        q, k, v, nullptr, qf_scale, il);
                cross_out = ggml_reshape_3d(ctx0, cross_out, n_embd, n_query, n_windows);

                queries = build_norm(ggml_add(ctx0, cross_out, queries),
                        pl.cross_attn_norm_w, pl.cross_attn_norm_b,
                        NORM_TYPE_NORMAL, qf_eps, il);
            }

            // feed-forward
            {
                ggml_tensor * ff = build_ffn(queries,
                        pl.ff_up_w, pl.ff_up_b,
                        nullptr, nullptr,
                        pl.ff_down_w, pl.ff_down_b,
                        FFN_GELU, il);

                queries = build_norm(ggml_add(ctx0, ff, queries),
                        pl.ln_2_w, pl.ln_2_b,
                        NORM_TYPE_NORMAL, qf_eps, il);
            }
        }

        // flatten windows and apply the final linear projection
        cur = ggml_reshape_2d(ctx0, queries, n_embd, n_query * n_windows);
        cur = ggml_add(ctx0, build_mm(model.qf_proj_linear_w, cur), model.qf_proj_linear_b);
        cb(cur, "projector_out", -1);
    }

    ggml_build_forward_expand(gf, cur);

    return gf;
}
|
||||
73
tools/mtmd/models/hunyuanocr.cpp
Normal file
73
tools/mtmd/models/hunyuanocr.cpp
Normal file
@@ -0,0 +1,73 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the HunyuanOCR / HunyuanVL vision graph: ViT with interpolated position
// embeddings -> RMS pre-norm -> two-conv "perceiver" projector -> per-row newline
// embedding -> linear projection -> image begin/end embeddings -> RMS post-norm.
ggml_cgraph * clip_graph_hunyuanocr::build() {
    const int merge = hparams.n_merge;
    const int pw    = n_patches_x;
    const int ph    = n_patches_y;

    // `merge` is the stride of the first projector conv and the divisor for the
    // output grid size below, so it must be strictly positive
    GGML_ASSERT(merge > 0);

    // Position embedding interpolation.
    // HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard
    // ggml_interpolate cannot express. To avoid adding a new ggml op, the
    // resize is computed on CPU in clip_image_batch_encode and uploaded here
    // as a graph input (named "hunyuanvl_pos_embd").
    // HunyuanOCR uses the same square layout and the standard ratio-based
    // interpolation provided by resize_position_embeddings().
    ggml_tensor * pos_embd = nullptr;
    if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) {
        pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw);
        ggml_set_name(pos_embd, "hunyuanvl_pos_embd");
        ggml_set_input(pos_embd);
    } else {
        pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
    }

    ggml_tensor * inp = build_inp();
    ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);

    // perceiver projector
    cur = build_norm(cur, model.mm_pre_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);

    // [C, W*H] -> [W, H, C] for conv2d
    cur = ggml_reshape_3d(ctx0, cur, n_embd, pw, ph);
    cur = ggml_permute(ctx0, cur, 2, 0, 1, 3);
    cur = ggml_cont(ctx0, cur);

    // Conv2d(1152->2304, k=2, s=2) + GELU + Conv2d(2304->4608, k=1, s=1)
    // each bias is optional and is reshaped to [1, 1, OC] so it broadcasts over W and H
    cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, merge, merge, 0, 0, 1, 1);
    if (model.mm_0_b) {
        cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_0_b, 1, 1, model.mm_0_b->ne[0]));
    }
    cur = ggml_gelu(ctx0, cur);
    cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 1, 1, 0, 0, 1, 1);
    if (model.mm_1_b) {
        cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_1_b, 1, 1, model.mm_1_b->ne[0]));
    }

    const int ow   = pw / merge;
    const int oh   = ph / merge;
    const int idim = (int)cur->ne[2]; // OC = 4608

    // append newline along W (dim 0): one newline embedding per output row
    ggml_tensor * nl = ggml_reshape_4d(ctx0, model.image_newline, 1, 1, idim, 1);
    nl = ggml_repeat_4d(ctx0, nl, 1, oh, idim, 1);
    cur = ggml_concat(ctx0, cur, nl, 0);

    // [OW+1, OH, OC] -> [OC, (OW+1)*OH]
    cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
    cur = ggml_cont_2d(ctx0, cur, idim, (ow + 1) * oh);

    // project to LLM hidden size (bias is optional)
    cur = build_mm(model.mm_model_proj, cur);
    if (model.mm_model_proj_b) {
        cur = ggml_add(ctx0, cur, model.mm_model_proj_b);
    }

    // wrap with begin/end tokens (each reshaped to a single column and concatenated along dim 1)
    cur = ggml_concat(ctx0, ggml_reshape_2d(ctx0, model.mm_img_begin, model.mm_img_begin->ne[0], 1), cur, 1);
    cur = ggml_concat(ctx0, cur, ggml_reshape_2d(ctx0, model.mm_img_end, model.mm_img_end->ne[0], 1), 1);

    cur = build_norm(cur, model.mm_post_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
69
tools/mtmd/models/internvl.cpp
Normal file
69
tools/mtmd/models/internvl.cpp
Normal file
@@ -0,0 +1,69 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the InternVL vision graph: ViT with a CLS token, CLS removal, pixel shuffle,
// then a LayerNorm + GELU MLP projector. Returns the member graph `gf`.
ggml_cgraph * clip_graph_internvl::build() {
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    const int n_pos = n_patches + 1; // patches plus the CLS token

    // patch embeddings followed by the CLS token
    ggml_tensor * inp = build_inp();
    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

    // The larger models use a different ViT, which uses RMS norm instead of layer norm
    // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
    //   n_embd == 3200 && n_layer == 45 -> 6B ViT (used by InternVL 2.5/3 - 26B, 38B, 78B)
    //   anything else                   -> 300M ViT (used by all smaller InternVL models)
    norm_type norm_t = NORM_TYPE_NORMAL;
    if (hparams.n_embd == 3200 && hparams.n_layer == 45) {
        norm_t = NORM_TYPE_RMS;
    }

    ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, model.position_embeddings, nullptr);

    // drop the CLS token: keep only the first n_patches columns
    cur = ggml_view_2d(ctx0, cur, n_embd, n_patches, ggml_row_size(cur->type, n_embd), 0);

    // pixel shuffle
    {
        const int sf  = model.hparams.n_merge;
        const int bsz = 1; // batch size, always 1 for now since batching is not supported
        const int h   = n_patches_y;
        const int w   = n_patches_x;
        GGML_ASSERT(sf > 0);

        const auto dim_merged = n_embd * sf * sf;

        cur = ggml_reshape_4d(ctx0, cur, n_embd * sf, h / sf, w, bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        cur = ggml_cont_4d(ctx0, cur, dim_merged, h / sf, w / sf, bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        // flatten to 2D
        cur = ggml_cont_2d(ctx0, cur, dim_merged, cur->ne[1] * cur->ne[2]);
    }

    // projector (always GELU)
    {
        // the projector LayerNorm uses pytorch's default eps = 1e-5
        // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
        cur = build_ffn(cur,
                        model.mm_1_w, model.mm_1_b,
                        nullptr, nullptr,
                        model.mm_3_w, model.mm_3_b,
                        FFN_GELU,
                        -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
101
tools/mtmd/models/kimik25.cpp
Normal file
101
tools/mtmd/models/kimik25.cpp
Normal file
@@ -0,0 +1,101 @@
|
||||
#include "models.h"
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
|
||||
// note: this is similar to clip_graph::resize_position_embeddings, major difference is having
|
||||
// the w/h in ne[1] and ne[2] instead of assuming with sqrt. Could try storing the tensor in 2D instead
|
||||
// with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3).
|
||||
ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
|
||||
ggml_tensor * pos_embd = model.position_embeddings;
|
||||
const int height = img.ny / patch_size;
|
||||
const int width = img.nx / patch_size;
|
||||
const uint32_t mode = interpolation_mode;
|
||||
|
||||
GGML_ASSERT(pos_embd);
|
||||
|
||||
const int64_t stored_c = pos_embd->ne[0]; // C = 1152
|
||||
const int64_t orig_w = pos_embd->ne[1]; // W = 64
|
||||
const int64_t orig_h = pos_embd->ne[2]; // H = 64
|
||||
|
||||
GGML_ASSERT(stored_c == n_embd);
|
||||
|
||||
if (height == (int)orig_h && width == (int)orig_w) {
|
||||
// No interpolation needed, just flatten to [C, H*W]
|
||||
return ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
|
||||
}
|
||||
|
||||
pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
|
||||
pos_embd = ggml_interpolate(ctx0, pos_embd, height, width, n_embd, 1, mode);
|
||||
pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
|
||||
pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
|
||||
return pos_embd;
|
||||
}
|
||||
|
||||
// Builds the Kimi-K2.5 vision graph: learned (bicubic-resized) position embeddings added to
// the patch embeddings, a ViT with 2D RoPE, patch merging, LayerNorm and a GELU MLP projector.
// Returns the member graph `gf`.
ggml_cgraph * clip_graph_kimik25::build() {
    // creates a named I32 graph input with one entry per patch
    auto new_pos_input = [&](const char * name) {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    ggml_tensor * pos_h = new_pos_input("pos_h");
    ggml_tensor * pos_w = new_pos_input("pos_w");

    ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);

    // Kimi-K2.5 uses an interleaved 2D RoPE pattern natively, but Q / K are permuted during
    // conversion to use the split format.
    auto add_pos = [&](ggml_tensor * x, const clip_layer &) {
        return build_rope_2d(ctx0, x, pos_w, pos_h, hparams.rope_theta, false);
    };

    // The learned position embedding is added here rather than being handed to build_vit:
    // the original author observed that the add did not occur when done inside build_vit
    // (reason unknown), while adding it manually works.
    ggml_tensor * inp = build_inp();
    inp = ggml_add(ctx0, inp, learned_pos_embd);

    ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, nullptr, add_pos);
    cb(cur, "vit_out", -1);

    {
        // patch_merger
        const int sf = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, sf);

        const int dim_merged = static_cast<int>(cur->ne[0]);
        const int n_merged   = static_cast<int>(cur->ne[1]);

        // projection norm: view the data as rows of n_embd, normalize, then view it as merged rows again
        cur = ggml_view_2d(ctx0, cur, n_embd, n_merged * sf * sf, ggml_row_size(cur->type, n_embd), 0);
        cur = ggml_norm(ctx0, cur, hparams.eps);
        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
        cur = ggml_view_2d(ctx0, cur, dim_merged, n_merged, ggml_row_size(cur->type, dim_merged), 0);
        cb(cur, "proj_inp_normed", -1);

        // projection mlp
        cur = build_ffn(cur,
                        model.mm_1_w, model.mm_1_b,
                        nullptr, nullptr,
                        model.mm_2_w, model.mm_2_b,
                        FFN_GELU,
                        -1);
        cb(cur, "proj_out", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
63
tools/mtmd/models/kimivl.cpp
Normal file
63
tools/mtmd/models/kimivl.cpp
Normal file
@@ -0,0 +1,63 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the Kimi-VL vision graph: ViT with resized learned position embeddings plus 2D RoPE,
// followed by patch merging, LayerNorm and a GELU MLP projector. Returns the member graph `gf`.
ggml_cgraph * clip_graph_kimivl::build() {
    // 2D input positions: one named I32 graph input per axis, one entry per patch
    auto new_pos_input = [&](const char * name) {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    ggml_tensor * pos_h = new_pos_input("pos_h");
    ggml_tensor * pos_w = new_pos_input("pos_w");

    ggml_tensor * learned_pos_embd = resize_position_embeddings();

    // 2D RoPE hook for build_vit: first half is the X axis, second half is the Y axis
    auto add_pos = [&](ggml_tensor * x, const clip_layer &) {
        return build_rope_2d(ctx0, x, pos_w, pos_h, hparams.rope_theta, false);
    };

    ggml_tensor * inp = build_inp();
    ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, learned_pos_embd, add_pos);
    cb(cur, "vit_out", -1);

    {
        // patch_merger
        const int sf = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, sf);

        // projection norm: view as rows of n_embd, normalize, then view as merged rows again
        const int dim_merged = static_cast<int>(cur->ne[0]);
        cur = ggml_view_2d(ctx0, cur, n_embd, cur->ne[1] * sf * sf, ggml_row_size(cur->type, n_embd), 0);
        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm eps
        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
        cur = ggml_view_2d(ctx0, cur, dim_merged, cur->ne[1] / sf / sf, ggml_row_size(cur->type, dim_merged), 0);
        cb(cur, "proj_inp_normed", -1);

        // projection mlp
        cur = build_ffn(cur,
                        model.mm_1_w, model.mm_1_b,
                        nullptr, nullptr,
                        model.mm_2_w, model.mm_2_b,
                        FFN_GELU,
                        -1);
        cb(cur, "proj_out", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
96
tools/mtmd/models/llama4.cpp
Normal file
96
tools/mtmd/models/llama4.cpp
Normal file
@@ -0,0 +1,96 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the Llama 4 vision graph: unfold-style patch embedding, CLS token, ViT with 2D RoPE,
// CLS removal, pixel shuffle, a bias-free GELU adapter MLP and the final multimodal projection.
// Returns the member graph `gf`.
ggml_cgraph * clip_graph_llama4::build() {
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    const int n_pos = n_patches + 1; // +1 for [CLS]

    // 2D input positions: one named I32 graph input per axis, one entry per position
    auto new_pos_input = [&](const char * name) {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    ggml_tensor * pos_h = new_pos_input("pos_h");
    ggml_tensor * pos_w = new_pos_input("pos_w");

    ggml_tensor * inp = build_inp_raw();

    // Llama4UnfoldConvolution: im2col on the raw image, then a matmul with the patch weights
    {
        ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
                                               patch_size, patch_size, 3, n_embd);
        inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
        inp = build_mm(model.patch_embeddings_0, inp);
        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
        cb(inp, "patch_conv", -1);
    }

    // add CLS token
    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

    // 2D RoPE hook for build_vit: first half is the X axis, second half is the Y axis
    // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
    // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
    auto add_pos = [&](ggml_tensor * x, const clip_layer &) {
        return build_rope_2d(ctx0, x, pos_w, pos_h, hparams.rope_theta, false);
    };

    ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, hparams.ffn_op, model.position_embeddings, add_pos);

    // remove CLS token: keep only the first n_patches columns
    cur = ggml_view_2d(ctx0, cur, n_embd, n_patches, ggml_row_size(cur->type, n_embd), 0);

    // pixel shuffle, based on Llama4VisionPixelShuffleMLP
    // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
    {
        const int sf  = model.hparams.n_merge;
        const int bsz = 1; // batch size, always 1 for now since batching is not supported
        GGML_ASSERT(sf > 0);
        GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images

        const auto dim_merged = n_embd * sf * sf;

        cur = ggml_reshape_4d(ctx0, cur, n_embd * sf, n_patches_x / sf, n_patches_y, bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        cur = ggml_cont_4d(ctx0, cur, dim_merged, n_patches_x / sf, n_patches_y / sf, bsz);
        // the second permute is left disabled, as in the original:
        //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        // flatten to 2D
        cur = ggml_cont_2d(ctx0, cur, dim_merged, n_patches / sf / sf);
        cb(cur, "pixel_shuffle", -1);
    }

    // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
    {
        cur = build_mm(model.mm_model_mlp_1_w, cur);
        cur = ggml_gelu(ctx0, cur);
        cur = build_mm(model.mm_model_mlp_2_w, cur);
        cur = ggml_gelu(ctx0, cur);
        cb(cur, "adapter_mlp", -1);
    }

    // Llama4MultiModalProjector
    cur = build_mm(model.mm_model_proj, cur);
    cb(cur, "projected", -1);

    // build the graph
    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
374
tools/mtmd/models/llava.cpp
Normal file
374
tools/mtmd/models/llava.cpp
Normal file
@@ -0,0 +1,374 @@
|
||||
#include "models.h"
|
||||
|
||||
// this graph is used by llava, granite and glm
|
||||
// due to having embedding_stack (used by granite), we cannot reuse build_vit
|
||||
ggml_cgraph * clip_graph_llava::build() {
|
||||
const int batch_size = 1;
|
||||
const int n_pos = n_patches + (model.class_embedding ? 1 : 0);
|
||||
|
||||
GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
|
||||
|
||||
// Calculate the deepest feature layer based on hparams and projector type
|
||||
int max_feature_layer = n_layer;
|
||||
{
|
||||
// Get the index of the second to last layer; this is the default for models that have a llava projector
|
||||
int il_last = hparams.n_layer - 1;
|
||||
int deepest_feature_layer = -1;
|
||||
|
||||
if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
||||
il_last += 1;
|
||||
}
|
||||
|
||||
// If we set explicit vision feature layers, only go up to the deepest one
|
||||
// NOTE: only used by granite-vision models for now
|
||||
for (const auto & feature_layer : hparams.vision_feature_layer) {
|
||||
if (feature_layer > deepest_feature_layer) {
|
||||
deepest_feature_layer = feature_layer;
|
||||
}
|
||||
}
|
||||
max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
|
||||
}
|
||||
|
||||
ggml_tensor * inp = build_inp();
|
||||
|
||||
// concat class_embeddings and patch_embeddings
|
||||
if (model.class_embedding) {
|
||||
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
|
||||
}
|
||||
|
||||
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||
|
||||
ggml_tensor * inpL = inp;
|
||||
|
||||
// pre-layernorm
|
||||
if (model.pre_ln_w) {
|
||||
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
|
||||
cb(inpL, "pre_ln", -1);
|
||||
}
|
||||
|
||||
std::vector<ggml_tensor *> embedding_stack;
|
||||
const auto & vision_feature_layer = hparams.vision_feature_layer;
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < max_feature_layer; il++) {
|
||||
auto & layer = model.layers[il];
|
||||
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
|
||||
|
||||
// If this is an embedding feature layer, save the output.
|
||||
// NOTE: 0 index here refers to the input to the encoder.
|
||||
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
|
||||
embedding_stack.push_back(cur);
|
||||
}
|
||||
|
||||
// layernorm1
|
||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
|
||||
cb(cur, "layer_inp_normed", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = build_mm(layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = build_mm(layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Vcur = build_mm(layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(layer.o_w, layer.o_b,
|
||||
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
inpL = cur; // inpL = residual, cur = hidden_states
|
||||
|
||||
cb(cur, "ffn_inp", il);
|
||||
|
||||
// layernorm2
|
||||
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
|
||||
cb(cur, "ffn_inp_normed", il);
|
||||
|
||||
// ffn
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_w, layer.ff_up_b,
|
||||
layer.ff_gate_w, layer.ff_gate_b,
|
||||
layer.ff_down_w, layer.ff_down_b,
|
||||
hparams.ffn_op, il);
|
||||
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, inpL, cur);
|
||||
cb(cur, "layer_out", il);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
// post-layernorm
|
||||
if (model.post_ln_w) {
|
||||
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
|
||||
}
|
||||
|
||||
ggml_tensor * embeddings = inpL;
|
||||
|
||||
// process vision feature layers (used by granite)
|
||||
{
|
||||
// final layer is a vision feature layer
|
||||
if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
|
||||
embedding_stack.push_back(inpL);
|
||||
}
|
||||
|
||||
// If feature layers are explicitly set, stack them (if we have multiple)
|
||||
if (!embedding_stack.empty()) {
|
||||
embeddings = embedding_stack[0];
|
||||
for (size_t i = 1; i < embedding_stack.size(); i++) {
|
||||
embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// llava projector (also used by granite)
|
||||
if (hparams.has_llava_projector) {
|
||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||
|
||||
ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
||||
ggml_set_name(patches, "patches");
|
||||
ggml_set_input(patches);
|
||||
|
||||
// shape [1, 576, 1024]
|
||||
// ne is whcn, ne = [1024, 576, 1, 1]
|
||||
embeddings = ggml_get_rows(ctx0, embeddings, patches);
|
||||
|
||||
// print_tensor_info(embeddings, "embeddings");
|
||||
|
||||
// llava projector
|
||||
if (proj_type == PROJECTOR_TYPE_MLP) {
|
||||
embeddings = build_mm(model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
if (model.mm_2_w) {
|
||||
embeddings = build_mm(model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
}
|
||||
}
|
||||
else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
embeddings = build_mm(model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
||||
// First LayerNorm
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
|
||||
model.mm_1_b);
|
||||
|
||||
// GELU activation
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
// Second linear layer
|
||||
embeddings = build_mm(model.mm_3_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
|
||||
|
||||
// Second LayerNorm
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
|
||||
model.mm_4_b);
|
||||
}
|
||||
else if (proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projector
|
||||
int n_patch = 24;
|
||||
ggml_tensor * mlp_1 = build_mm(model.mm_model_mlp_1_w, embeddings);
|
||||
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
|
||||
mlp_1 = ggml_gelu(ctx0, mlp_1);
|
||||
ggml_tensor * mlp_3 = build_mm(model.mm_model_mlp_3_w, mlp_1);
|
||||
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
|
||||
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
|
||||
|
||||
// block 1
|
||||
ggml_tensor * block_1 = nullptr;
|
||||
{
|
||||
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
|
||||
mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
|
||||
mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
|
||||
// stride = 1, padding = 1, bias is nullptr
|
||||
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
// layer norm
|
||||
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
|
||||
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
// hardswish
|
||||
ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||
|
||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = build_mm(model.mm_model_block_1_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_1_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||
|
||||
int w = block_1->ne[0], h = block_1->ne[1];
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = build_mm(model.mm_model_block_1_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
// residual
|
||||
block_1 = ggml_add(ctx0, mlp_3, block_1);
|
||||
}
|
||||
|
||||
// block_2
|
||||
{
|
||||
// stride = 2
|
||||
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
|
||||
|
||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||
// layer norm
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||
// hardswish
|
||||
ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||
|
||||
// not sure the parameters is right for globalAvgPooling
|
||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = build_mm(model.mm_model_block_2_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_2_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
|
||||
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||
|
||||
int w = block_1->ne[0], h = block_1->ne[1];
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = build_mm(model.mm_model_block_2_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
|
||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
|
||||
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
|
||||
}
|
||||
embeddings = block_1;
|
||||
}
|
||||
else if (proj_type == PROJECTOR_TYPE_LDPV2)
|
||||
{
|
||||
int n_patch = 24;
|
||||
ggml_tensor * mlp_0 = build_mm(model.mm_model_mlp_0_w, embeddings);
|
||||
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
|
||||
mlp_0 = ggml_gelu(ctx0, mlp_0);
|
||||
ggml_tensor * mlp_2 = build_mm(model.mm_model_mlp_2_w, mlp_0);
|
||||
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
|
||||
// mlp_2 ne = [2048, 576, 1, 1]
|
||||
// // AVG Pool Layer 2*2, strides = 2
|
||||
mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
|
||||
// mlp_2 ne = [576, 2048, 1, 1]
|
||||
mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
|
||||
// mlp_2 ne [24, 24, 2048, 1]
|
||||
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
|
||||
// weight ne = [3, 3, 2048, 1]
|
||||
ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
|
||||
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
|
||||
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
|
||||
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
|
||||
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
|
||||
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
|
||||
embeddings = peg_0;
|
||||
}
|
||||
else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
// glm projector
|
||||
else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
|
||||
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
|
||||
embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
|
||||
embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
|
||||
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
|
||||
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
|
||||
// GLU
|
||||
{
|
||||
embeddings = build_mm(model.mm_model_mlp_0_w, embeddings);
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
|
||||
embeddings = ggml_gelu_inplace(ctx0, embeddings);
|
||||
ggml_tensor * x = embeddings;
|
||||
embeddings = build_mm(model.mm_model_mlp_2_w, embeddings);
|
||||
x = build_mm(model.mm_model_mlp_1_w,x);
|
||||
embeddings = ggml_swiglu_split(ctx0, embeddings, x);
|
||||
embeddings = build_mm(model.mm_model_mlp_3_w, embeddings);
|
||||
}
|
||||
// arrangement of BOI/EOI token embeddings
|
||||
// note: these embeddings are not present in text model, hence we cannot process them as text tokens
|
||||
// see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
|
||||
{
|
||||
embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
|
||||
embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
|
||||
}
|
||||
}
|
||||
|
||||
else {
|
||||
GGML_ABORT("llava: unknown projector type");
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
|
||||
return gf;
|
||||
}
|
||||
209
tools/mtmd/models/mimovl.cpp
Normal file
209
tools/mtmd/models/mimovl.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
#include "models.h"
|
||||
|
||||
// Matrix multiply of weight `w` with input `x`, with the result's precision explicitly set
// to GGML_PREC_F32. Returns the product tensor.
ggml_tensor * clip_graph_mimovl::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    ggml_tensor * product = ggml_mul_mat(ctx0, w, x);
    ggml_mul_mat_set_prec(product, GGML_PREC_F32);
    return product;
}
|
||||
|
||||
// MiMoVL vision tower for MiMo-V2.5 (non-Pro). Qwen2.5-VL-shaped ViT, except:
|
||||
// 1. GQA in attention (32 Q / 8 KV heads, head_dim 64).
|
||||
// 2. Per-head attention sinks on every windowed layer. The sinks adjust
|
||||
// the softmax denominator (equivalently, a virtual extra K column with V=0),
|
||||
// so they decay attention weight without contributing to the output.
|
||||
// 3. Per-layer window-attention mode in hparams.wa_pattern_mode:
|
||||
// -1 -> full, 0 -> row-window+sinks, 1 -> col-window+sinks.
|
||||
// Col mode transposes the merge-unit grid on entry and restores
|
||||
// it on exit. Both patch and rotary orderings are pre-computed
|
||||
// host-side.
|
||||
// 4. 1D banded sliding window (|q-k| > window_size -> -inf) as a
|
||||
// single 2D mask broadcast across heads.
|
||||
// 5. Per-block MLP biases.
|
||||
ggml_cgraph * clip_graph_mimovl::build() {
|
||||
GGML_ASSERT(model.patch_embeddings_0 != nullptr);
|
||||
GGML_ASSERT(model.patch_embeddings_1 != nullptr);
|
||||
GGML_ASSERT(model.class_embedding == nullptr);
|
||||
GGML_ASSERT(hparams.n_head_kv > 0);
|
||||
GGML_ASSERT(n_head % hparams.n_head_kv == 0);
|
||||
GGML_ASSERT((int) hparams.wa_pattern_mode.size() == n_layer);
|
||||
|
||||
const int batch_size = 1;
|
||||
const int n_pos = n_patches;
|
||||
const int n_head_kv = hparams.n_head_kv;
|
||||
const int merge = hparams.n_merge > 0 ? hparams.n_merge : 2;
|
||||
const int merge_unit = merge * merge;
|
||||
const int n_units = n_pos / merge_unit;
|
||||
GGML_ASSERT(n_units * merge_unit == n_pos);
|
||||
|
||||
// MiMoVL has head_dim=64 with n_embd=1280, so n_embd is NOT n_head*head_dim
|
||||
// (the base class's d_head = n_embd/n_head = 40 is wrong here). Derive
|
||||
// head_dim from the fused QKV projection: rows = (n_head + 2*n_head_kv)*head_dim.
|
||||
GGML_ASSERT(model.layers[0].qkv_w != nullptr);
|
||||
const int qkv_rows = model.layers[0].qkv_w->ne[1];
|
||||
const int head_dim = qkv_rows / (n_head + 2 * n_head_kv);
|
||||
GGML_ASSERT(head_dim * (n_head + 2 * n_head_kv) == qkv_rows);
|
||||
const float attn_scale = 1.0f / std::sqrt((float) head_dim);
|
||||
const int rope_n_dims = head_dim / 2;
|
||||
int mrope_sections[4] = {rope_n_dims/2, rope_n_dims/2, 0, 0};
|
||||
|
||||
// Patch embed: Conv3D(kt=2) split into two Conv2D, then interleave-merge
|
||||
// along the height axis to match the merge-tile token order.
|
||||
ggml_tensor * inp_raw = build_inp_raw();
|
||||
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw,
|
||||
patch_size, patch_size, 0, 0, 1, 1);
|
||||
{
|
||||
ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw,
|
||||
patch_size, patch_size, 0, 0, 1, 1);
|
||||
inp = ggml_add(ctx0, inp, inp_1);
|
||||
|
||||
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
|
||||
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
|
||||
|
||||
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w,h,c,b] -> [c,w,h,b]
|
||||
inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
|
||||
inp = ggml_reshape_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
|
||||
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
|
||||
inp = ggml_cont_3d(ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size);
|
||||
}
|
||||
cb(inp, "patch_embed", -1);
|
||||
|
||||
ggml_tensor * positions_row = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos * 4);
|
||||
ggml_set_name(positions_row, "mimovl_positions_row");
|
||||
ggml_set_input(positions_row);
|
||||
|
||||
ggml_tensor * positions_col = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos * 4);
|
||||
ggml_set_name(positions_col, "mimovl_positions_col");
|
||||
ggml_set_input(positions_col);
|
||||
|
||||
// idx_col is the col-major merge-unit permutation. Take it as F32 so we can
|
||||
// derive the inverse permutation in-graph via ggml_argsort;
|
||||
// ggml_get_rows requires its index tensor to be I32, so cast back as well.
|
||||
ggml_tensor * idx_col_f = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_units);
|
||||
ggml_set_name(idx_col_f, "mimovl_idx_col");
|
||||
ggml_set_input(idx_col_f);
|
||||
ggml_tensor * idx_col = ggml_cast(ctx0, idx_col_f, GGML_TYPE_I32);
|
||||
ggml_tensor * idx_col_inv = ggml_argsort(ctx0, idx_col_f, GGML_SORT_ORDER_ASC);
|
||||
|
||||
ggml_tensor * window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
|
||||
ggml_set_name(window_mask, "mimovl_window_mask");
|
||||
ggml_set_input(window_mask);
|
||||
|
||||
ggml_tensor * window_mask_attn = (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED)
|
||||
? ggml_cast(ctx0, window_mask, GGML_TYPE_F16)
|
||||
: window_mask;
|
||||
|
||||
// Reorder helper: permute patches at merge-unit granularity. The patch
|
||||
// sequence is laid out as n_units groups of merge_unit (=4) consecutive
|
||||
// patches; the row<->col transpose only permutes whole groups. We keep
|
||||
// the per-group (h,w) ordering intact by reshaping to
|
||||
// [n_embd*merge_unit, n_units] before ggml_get_rows.
|
||||
auto reorder = [&](ggml_tensor * x, ggml_tensor * idx) {
|
||||
ggml_tensor * y = ggml_reshape_2d(ctx0, x, n_embd * merge_unit, n_units);
|
||||
y = ggml_get_rows(ctx0, y, idx);
|
||||
return ggml_reshape_3d(ctx0, y, n_embd, n_pos, batch_size);
|
||||
};
|
||||
|
||||
ggml_tensor * inpL = inp;
|
||||
int prev_mode = -1;
|
||||
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
const auto & layer = model.layers[il];
|
||||
const int mode = hparams.wa_pattern_mode[il];
|
||||
const bool is_full = (mode == -1);
|
||||
const bool is_col = (mode == 1);
|
||||
|
||||
// Reorder transitions on entry/exit of a col-mode run.
|
||||
if (is_col && prev_mode != 1) {
|
||||
inpL = reorder(inpL, idx_col);
|
||||
cb(inpL, "reorder_to_col", il);
|
||||
} else if (!is_col && prev_mode == 1) {
|
||||
inpL = reorder(inpL, idx_col_inv);
|
||||
cb(inpL, "reorder_to_row", il);
|
||||
}
|
||||
|
||||
ggml_tensor * cur = inpL;
|
||||
|
||||
// Pre-attention RMSNorm.
|
||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_RMS, eps, il);
|
||||
cb(cur, "ln1", il);
|
||||
|
||||
// Fused QKV with GQA.
|
||||
ggml_tensor * qkv = build_mm(layer.qkv_w, cur);
|
||||
qkv = ggml_add(ctx0, qkv, layer.qkv_b);
|
||||
|
||||
const size_t row = ggml_row_size(qkv->type, head_dim);
|
||||
const size_t off_k = ggml_row_size(qkv->type, n_head * head_dim);
|
||||
const size_t off_v = ggml_row_size(qkv->type, (n_head + n_head_kv) * head_dim);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim, n_head, n_pos, row, qkv->nb[1], 0);
|
||||
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_pos, row, qkv->nb[1], off_k);
|
||||
ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_pos, row, qkv->nb[1], off_v);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
// 2D RoPE
|
||||
ggml_tensor * pos = is_col ? positions_col : positions_row;
|
||||
Qcur = ggml_rope_multi(ctx0, Qcur, pos, nullptr, rope_n_dims, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);
|
||||
Kcur = ggml_rope_multi(ctx0, Kcur, pos, nullptr, rope_n_dims, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);
|
||||
cb(Qcur, "Qcur_rope", il);
|
||||
cb(Kcur, "Kcur_rope", il);
|
||||
|
||||
// Full layers: plain attention. Windowed layers: banded mask and per-head sinks.
|
||||
ggml_tensor * mask = is_full ? nullptr : window_mask_attn;
|
||||
ggml_tensor * sinks = is_full ? nullptr : layer.attn_sinks;
|
||||
if (!is_full) {
|
||||
GGML_ASSERT(layer.attn_sinks != nullptr);
|
||||
}
|
||||
ggml_tensor * attn_out = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, mask, attn_scale, il, sinks);
|
||||
cb(attn_out, "attn_out", il);
|
||||
|
||||
// Residual 1.
|
||||
cur = ggml_add(ctx0, attn_out, inpL);
|
||||
inpL = cur;
|
||||
cb(cur, "ffn_inp", il);
|
||||
|
||||
// Pre-FFN RMSNorm.
|
||||
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_RMS, eps, il);
|
||||
cb(cur, "ffn_inp_normed", il);
|
||||
|
||||
// SwiGLU MLP with biases
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_w, layer.ff_up_b,
|
||||
layer.ff_gate_w, layer.ff_gate_b,
|
||||
layer.ff_down_w, layer.ff_down_b,
|
||||
hparams.ffn_op, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
// Residual 2.
|
||||
cur = ggml_add(ctx0, inpL, cur);
|
||||
cb(cur, "layer_out", il);
|
||||
|
||||
inpL = cur;
|
||||
prev_mode = mode;
|
||||
}
|
||||
|
||||
// If the last block was col-mode, undo the transpose so the merger sees patches in row order.
|
||||
if (prev_mode == 1) {
|
||||
inpL = reorder(inpL, idx_col_inv);
|
||||
cb(inpL, "reorder_to_row_final", -1);
|
||||
}
|
||||
|
||||
// Merger: post-LayerNorm
|
||||
inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, 1e-6f, n_layer);
|
||||
cb(inpL, "post_ln", -1);
|
||||
|
||||
// Spatial merge: pack each merge_unit (=4) of patches into a single
|
||||
// (n_embd*merge_unit)-wide row, then run the 2-layer MLP.
|
||||
ggml_tensor * embeddings = ggml_reshape_3d(ctx0, inpL, n_embd * merge_unit, n_units, batch_size);
|
||||
embeddings = build_ffn(embeddings,
|
||||
model.mm_0_w, nullptr,
|
||||
nullptr, nullptr,
|
||||
model.mm_1_w, nullptr,
|
||||
FFN_GELU, -1);
|
||||
cb(embeddings, "vit_out", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
return gf;
|
||||
}
|
||||
405
tools/mtmd/models/minicpmv.cpp
Normal file
405
tools/mtmd/models/minicpmv.cpp
Normal file
@@ -0,0 +1,405 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the MiniCPM-V graph: ViT encoder (via build_vit, with learned position
// embeddings selected by the "positions" input) followed by the resampler
// projector, which is a single cross-attention block whose queries come from
// model.mm_model_query and whose keys carry a 2D sinusoidal position embedding.
//
// Graph inputs created here (filled in by the caller, looked up by name):
//   "omega"      F32, n_mmproj_embd / 4 elements  (base frequencies)
//   "pos_h"      F32, 1 x n_pos
//   "pos_w"      F32, 1 x n_pos
//   "positions"  I32, n_pos elements
//
// Returns the graph `gf` expanded up to the projected embeddings.
ggml_cgraph * clip_graph_minicpmv::build() {
    GGML_ASSERT(model.class_embedding == nullptr);

    const int n_pos       = n_patches;
    const int n_embd_proj = n_mmproj_embd;

    // Names a freshly created tensor and flags it as a graph input.
    auto as_input = [](ggml_tensor * t, const char * name) {
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    // position embeddings for the projector (not for ViT)
    // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
    // base frequency omega
    ggml_tensor * omega = as_input(ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4), "omega");

    // 2D input positions (using float for sinusoidal embeddings)
    ggml_tensor * pos_h = as_input(ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos), "pos_h");
    ggml_tensor * pos_w = as_input(ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos), "pos_w");

    // for selecting learned pos embd, used by ViT
    ggml_tensor * positions = as_input(ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos), "positions");
    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

    // ViT encoder.
    ggml_tensor * vit_inp    = build_inp();
    ggml_tensor * embeddings = build_vit(vit_inp, n_pos,
                                         NORM_TYPE_NORMAL,
                                         hparams.ffn_op,
                                         learned_pos_embd,
                                         nullptr);

    // resampler projector (it is just another transformer)
    ggml_tensor * q = model.mm_model_query;
    ggml_tensor * v = build_mm(model.mm_model_kv_proj, embeddings);

    // norm
    q = build_norm(q, model.mm_model_ln_q_w,  model.mm_model_ln_q_b,  NORM_TYPE_NORMAL, eps, -1);
    v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);

    // calculate sinusoidal pos embd: [sin(x*omega), cos(x*omega), sin(y*omega), cos(y*omega)]
    ggml_tensor * pos_embed = nullptr;
    {
        // outer product of the frequencies with the per-token coordinates
        ggml_tensor * omega_rows = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
        ggml_tensor * angle_x    = ggml_mul(ctx0, omega_rows, pos_w);
        ggml_tensor * angle_y    = ggml_mul(ctx0, omega_rows, pos_h);

        // sin and cos halves, concatenated on the first dim
        auto sin_cos = [&](ggml_tensor * angle) {
            ggml_tensor * s = ggml_sin(ctx0, angle);
            ggml_tensor * c = ggml_cos(ctx0, angle);
            return ggml_concat(ctx0, s, c, 0);
        };
        ggml_tensor * embd_x = sin_cos(angle_x);
        ggml_tensor * embd_y = sin_cos(angle_y);

        pos_embed = ggml_concat(ctx0, embd_x, embd_y, 0);
    }

    // k = v + pos_embed
    ggml_tensor * k = ggml_add(ctx0, v, pos_embed);

    // attention
    {
        // The resampler uses a fixed head size; the head count follows from the
        // projector width and the query count is read from the hparams.
        const int head_sz   = 128;
        const int head_cnt  = n_embd_proj / head_sz;
        const int query_cnt = hparams.minicpmv_query_num;

        ggml_tensor * Q = ggml_add(ctx0, build_mm(model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
        ggml_tensor * K = ggml_add(ctx0, build_mm(model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
        ggml_tensor * V = ggml_add(ctx0, build_mm(model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);

        Q = ggml_reshape_3d(ctx0, Q, head_sz, head_cnt, query_cnt);
        K = ggml_reshape_3d(ctx0, K, head_sz, head_cnt, n_pos);
        V = ggml_reshape_3d(ctx0, V, head_sz, head_cnt, n_pos);

        cb(Q, "resampler_Q", -1);
        cb(K, "resampler_K", -1);
        cb(V, "resampler_V", -1);

        const float resampler_scale = 1.0f / sqrtf(float(head_sz));
        embeddings = build_attn(model.mm_model_attn_o_w,
                                model.mm_model_attn_o_b,
                                Q, K, V, nullptr, resampler_scale, -1);
        cb(embeddings, "resampler_attn_out", -1);
    }

    // layernorm
    embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b,
                            NORM_TYPE_NORMAL, eps, -1);

    // projection
    embeddings = build_mm(model.mm_model_proj, embeddings);

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
|
||||
|
||||
// Builds the MiniCPM-V 4.6 vision graph:
//   1. patch embedding + learned positional embedding (+ optional pre-LN)
//   2. ViT layers 0..insert_layer_id (inclusive) on all n_patches tokens
//   3. "ViT merger": windowed self-attention followed by a 2x2 spatial
//      downsample + MLP (4 tokens -> 1) with a mean-pooled residual
//   4. remaining ViT layers on the downsampled tokens (+ optional post-LN)
//   5. final merger: another 2x2 spatial merge + MLP -> projector embedding
//
// Graph inputs created here (filled in by the caller, looked up by name):
//   "positions"                        I32, n_pos
//   "vit_merger_window_idx"            I32, n_pos
//   "vit_merger_inv_window_idx"        I32, n_pos
//   "vit_merger_window_mask"           F32, n_pos x n_pos
//   "vit_merger_ds_idx_{0..3}"         I32, (n_patches_y/2) * (n_patches_x/2)
//   "merger_ds_idx_{0..3}"             I32, (n_patches_y/4) * (n_patches_x/4)
//
// Returns the graph `gf` expanded up to the final merger output ("merger_out").
ggml_cgraph * clip_graph_minicpmv4_6::build() {
    const int insert_lid = hparams.insert_layer_id;
    // Layers [0, insert_lid] run before the merger and (insert_lid, n_layer) after it;
    // anything outside [-1, n_layer) would index model.layers out of bounds.
    GGML_ASSERT(insert_lid >= -1 && insert_lid < n_layer);

    const int n_pos  = n_patches;
    const int half_h = n_patches_y / 2;
    const int half_w = n_patches_x / 2;
    const int n_ds   = half_h * half_w;   // after ViT merger 2x2 downsample
    const int qh     = half_h / 2;
    const int qw     = half_w / 2;
    const int n_ds2  = qh * qw;           // after final merger 2x2 downsample

    // Creates a named 1D I32 graph input of length n.
    auto add_i32_input = [&](const char * name, int n) {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    // position indices for ViT learned positional embeddings
    ggml_tensor * positions        = add_i32_input("positions", n_pos);
    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

    // ViT merger window reorder indices + block-diagonal mask
    // (mask layout follows qwen2vl: -inf except for 4x4 blocks on the diagonal,
    //  so each window-major group of 4 tokens only attends to itself)
    ggml_tensor * vit_merger_window_idx     = add_i32_input("vit_merger_window_idx", n_pos);
    ggml_tensor * vit_merger_inv_window_idx = add_i32_input("vit_merger_inv_window_idx", n_pos);
    ggml_tensor * vit_merger_window_mask    = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
    ggml_set_name(vit_merger_window_mask, "vit_merger_window_mask");
    ggml_set_input(vit_merger_window_mask);
    if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
        vit_merger_window_mask = ggml_cast(ctx0, vit_merger_window_mask, GGML_TYPE_F16);
    }

    // ViT merger 2x2 downsample gather indices
    ggml_tensor * vit_merger_ds_idx_0 = add_i32_input("vit_merger_ds_idx_0", n_ds);
    ggml_tensor * vit_merger_ds_idx_1 = add_i32_input("vit_merger_ds_idx_1", n_ds);
    ggml_tensor * vit_merger_ds_idx_2 = add_i32_input("vit_merger_ds_idx_2", n_ds);
    ggml_tensor * vit_merger_ds_idx_3 = add_i32_input("vit_merger_ds_idx_3", n_ds);

    // final merger 2x2 downsample gather indices
    ggml_tensor * merger_ds_idx_0 = add_i32_input("merger_ds_idx_0", n_ds2);
    ggml_tensor * merger_ds_idx_1 = add_i32_input("merger_ds_idx_1", n_ds2);
    ggml_tensor * merger_ds_idx_2 = add_i32_input("merger_ds_idx_2", n_ds2);
    ggml_tensor * merger_ds_idx_3 = add_i32_input("merger_ds_idx_3", n_ds2);

    // Linear projection with an optional bias: build_mm(w, x) [+ b].
    auto project = [&](ggml_tensor * w, ggml_tensor * b, ggml_tensor * x) {
        ggml_tensor * y = build_mm(w, x);
        if (b) {
            y = ggml_add(ctx0, y, b);
        }
        return y;
    };

    // Concatenates four tensors along dim 0, in argument order.
    auto concat4 = [&](ggml_tensor * a, ggml_tensor * b, ggml_tensor * c, ggml_tensor * d) {
        ggml_tensor * cat = ggml_concat(ctx0, a, b, 0);
        cat = ggml_concat(ctx0, cat, c, 0);
        cat = ggml_concat(ctx0, cat, d, 0);
        return cat;
    };

    // One pre-norm ViT block (separate q/k/v weights, optional layer-scale),
    // applied to n_tok tokens. Mirrors the separate-qkv path of
    // clip_graph::build_vit; shared by the layers before and after the ViT merger.
    auto vit_layer = [&](ggml_tensor * x, int il, int64_t n_tok) -> ggml_tensor * {
        const auto & layer = model.layers[il];

        ggml_tensor * cur = build_norm(x, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_inp_normed", il);

        {
            ggml_tensor * Qcur = project(layer.q_w, layer.q_b, cur);
            ggml_tensor * Kcur = project(layer.k_w, layer.k_b, cur);
            ggml_tensor * Vcur = project(layer.v_w, layer.v_b, cur);

            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_tok);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_tok);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_tok);
            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, nullptr, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        if (layer.ls_1_w) {
            cur = ggml_mul(ctx0, cur, layer.ls_1_w);
            cb(cur, "attn_out_scaled", il);
        }
        // first residual; the sum is also the skip input of the FFN branch
        cur = ggml_add(ctx0, cur, x);
        ggml_tensor * ffn_skip = cur;
        cb(cur, "ffn_inp", il);

        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "ffn_inp_normed", il);

        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b,
                        layer.ff_down_w, layer.ff_down_b, hparams.ffn_op, il);
        cb(cur, "ffn_out", il);

        if (layer.ls_2_w) {
            cur = ggml_mul(ctx0, cur, layer.ls_2_w);
            cb(cur, "ffn_out_scaled", il);
        }
        // second residual
        cur = ggml_add(ctx0, ffn_skip, cur);
        cb(cur, "layer_out", il);

        return cur;
    };

    // patch embedding + positional embedding
    ggml_tensor * inp = build_inp();
    inp = ggml_add(ctx0, inp, learned_pos_embd);
    cb(inp, "pos_embed", -1);

    ggml_tensor * inpL = inp;
    if (model.pre_ln_w) {
        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
        cb(inpL, "pre_ln", -1);
    }

    // ViT layers 0..insert_layer_id (inclusive), on the full token sequence
    for (int il = 0; il <= insert_lid; il++) {
        inpL = vit_layer(inpL, il, n_pos);
    }

    // ViT merger: window self-attention
    // Tokens are reordered to window-major (4 tokens per window are contiguous),
    // and a block-diagonal mask restricts attention to within each window. This
    // mirrors the qwen2vl windowed-attention pattern so build_attn() can pick the
    // flash-attention path when available.
    {
        ggml_tensor * residual = inpL;
        ggml_tensor * cur = build_norm(inpL,
                                       model.vit_merger_ln1_w, model.vit_merger_ln1_b,
                                       NORM_TYPE_NORMAL, eps, -1);
        cb(cur, "vit_merger_attn_inp_normed", -1);

        cur = ggml_get_rows(ctx0, cur, vit_merger_window_idx);
        cb(cur, "vit_merger_window_reorder", -1);

        ggml_tensor * Qcur = project(model.vit_merger_attn_q_w, model.vit_merger_attn_q_b, cur);
        ggml_tensor * Kcur = project(model.vit_merger_attn_k_w, model.vit_merger_attn_k_b, cur);
        ggml_tensor * Vcur = project(model.vit_merger_attn_v_w, model.vit_merger_attn_v_b, cur);

        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
        cb(Qcur, "vit_merger_Qcur", -1);
        cb(Kcur, "vit_merger_Kcur", -1);
        cb(Vcur, "vit_merger_Vcur", -1);

        cur = build_attn(model.vit_merger_attn_o_w, model.vit_merger_attn_o_b,
                         Qcur, Kcur, Vcur, vit_merger_window_mask, kq_scale, -1);
        cb(cur, "vit_merger_attn_out", -1);

        // undo the window-major reordering before adding the residual
        cur  = ggml_get_rows(ctx0, cur, vit_merger_inv_window_idx);
        inpL = ggml_add(ctx0, cur, residual);
        cb(inpL, "vit_merger_attn_residual", -1);
    }

    // ViT merger: 2x2 spatial downsample + MLP (4 tokens -> 1)
    {
        ggml_tensor * p0 = ggml_get_rows(ctx0, inpL, vit_merger_ds_idx_0);
        ggml_tensor * p1 = ggml_get_rows(ctx0, inpL, vit_merger_ds_idx_1);
        ggml_tensor * p2 = ggml_get_rows(ctx0, inpL, vit_merger_ds_idx_2);
        ggml_tensor * p3 = ggml_get_rows(ctx0, inpL, vit_merger_ds_idx_3);

        // residual branch: mean of the four gathered tokens
        ggml_tensor * mean_res = ggml_add(ctx0, p0, p1);
        mean_res = ggml_add(ctx0, mean_res, p2);
        mean_res = ggml_add(ctx0, mean_res, p3);
        mean_res = ggml_scale(ctx0, mean_res, 0.25f);
        cb(mean_res, "vit_merger_ds_mean_res", -1);

        ggml_tensor * cat = concat4(p0, p1, p2, p3);

        ggml_tensor * cur = build_norm(cat,
                                       model.vit_merger_ds_ln_w, model.vit_merger_ds_ln_b,
                                       NORM_TYPE_NORMAL, eps, -1);
        cb(cur, "vit_merger_ds_normed", -1);

        // ViTWindowAttentionMerger downsample MLP uses gelu_pytorch_tanh (FFN_GELU)
        cur = build_ffn(cur,
                        model.vit_merger_ds_up_w,   model.vit_merger_ds_up_b,
                        nullptr,                    nullptr,
                        model.vit_merger_ds_down_w, model.vit_merger_ds_down_b,
                        FFN_GELU, -1);
        cb(cur, "vit_merger_ds_mlp_out", -1);

        inpL = ggml_add(ctx0, cur, mean_res);
        cb(inpL, "vit_merger_ds_out", -1);
    }

    // ViT layers (insert_layer_id+1)..n_layer-1, operating on the downsampled tokens
    {
        const int64_t n_pos_ds = n_ds;
        for (int il = insert_lid + 1; il < n_layer; il++) {
            inpL = vit_layer(inpL, il, n_pos_ds);
        }
    }

    if (model.post_ln_w) {
        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
        cb(inpL, "post_ln", -1);
    }

    // Final Merger (DownsampleMLP): another 2x2 spatial merge -> projector embedding
    {
        ggml_tensor * p0 = ggml_get_rows(ctx0, inpL, merger_ds_idx_0);
        ggml_tensor * p1 = ggml_get_rows(ctx0, inpL, merger_ds_idx_1);
        ggml_tensor * p2 = ggml_get_rows(ctx0, inpL, merger_ds_idx_2);
        ggml_tensor * p3 = ggml_get_rows(ctx0, inpL, merger_ds_idx_3);

        ggml_tensor * cat = concat4(p0, p1, p2, p3);

        ggml_tensor * cur = build_norm(cat,
                                       model.mm_input_norm_w, model.mm_input_norm_b,
                                       NORM_TYPE_NORMAL, eps, -1);
        cb(cur, "merger_normed", -1);

        // MiniCPMV4_6DownsampleMLP uses nn.GELU() (erf-based, FFN_GELU_ERF)
        cur = build_ffn(cur,
                        model.mm_ffn_up_w,   model.mm_ffn_up_b,
                        nullptr,             nullptr,
                        model.mm_ffn_down_w, model.mm_ffn_down_b,
                        FFN_GELU_ERF, -1);
        cb(cur, "merger_out", -1);

        inpL = cur;
    }

    ggml_build_forward_expand(gf, inpL);
    return gf;
}
|
||||
451
tools/mtmd/models/mobilenetv5.cpp
Normal file
451
tools/mtmd/models/mobilenetv5.cpp
Normal file
@@ -0,0 +1,451 @@
|
||||
#include "models.h"
|
||||
|
||||
// Helpers for MobileNetV5 Blocks
|
||||
// RMS Norm 2D - normalizes over channels for each spatial position
|
||||
ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
|
||||
// inp: [W, H, C, B]
|
||||
|
||||
ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_rms_norm(ctx0, cur, eps);
|
||||
|
||||
if (weight) {
|
||||
cur = ggml_mul(ctx0, cur, weight);
|
||||
}
|
||||
|
||||
cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
|
||||
ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
|
||||
const int64_t ih = inp->ne[1]; // height
|
||||
const int64_t iw = inp->ne[0]; // width
|
||||
|
||||
// Calculate output size (ceil division)
|
||||
const int64_t oh = (ih + stride_h - 1) / stride_h;
|
||||
const int64_t ow = (iw + stride_w - 1) / stride_w;
|
||||
|
||||
// Calculate padding needed
|
||||
const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
|
||||
const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
|
||||
|
||||
// Split padding asymmetrically
|
||||
const int pad_h_top = pad_h / 2;
|
||||
const int pad_h_bottom = pad_h - pad_h_top;
|
||||
const int pad_w_left = pad_w / 2;
|
||||
const int pad_w_right = pad_w - pad_w_left;
|
||||
|
||||
// Apply padding if needed
|
||||
// ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
|
||||
// For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
|
||||
if (pad_h > 0 || pad_w > 0) {
|
||||
inp = ggml_pad_ext(ctx0, inp,
|
||||
pad_w_left, pad_w_right, // width padding (dim 0)
|
||||
pad_h_top, pad_h_bottom, // height padding (dim 1)
|
||||
0, 0, // no channel padding (dim 2)
|
||||
0, 0); // no batch padding (dim 3)
|
||||
}
|
||||
|
||||
return inp;
|
||||
}
|
||||
|
||||
|
||||
// Edge Residual Block (Stage 0)
|
||||
ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
|
||||
ggml_tensor * cur = inp;
|
||||
|
||||
// 1. Expansion Conv (3x3)
|
||||
if (stride == 2) {
|
||||
// Case: Downsampling (Block 0)
|
||||
// Replicates Conv2dSame(kernel=3, stride=2)
|
||||
cur = pad_same_2d(cur, 3, 3, stride, stride);
|
||||
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
|
||||
} else {
|
||||
// Case: Normal 3x3 Block (Block 1, 2)
|
||||
// Replicates Conv2d(kernel=3, stride=1, padding=1)
|
||||
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
|
||||
}
|
||||
|
||||
// BN + Activation
|
||||
if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// 2. Pointwise Linear Conv (1x1)
|
||||
// 1x1 Convs usually have padding=0 and stride=1
|
||||
cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
|
||||
|
||||
// 3. Residual Connection
|
||||
// Only apply residual if spatial dimensions and channels match (stride 1)
|
||||
if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
|
||||
cur = ggml_add(ctx0, cur, inp);
|
||||
}
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// Universal Inverted Residual Block (Stage 1+)
|
||||
ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
|
||||
ggml_tensor * cur = inp;
|
||||
|
||||
// 1. Depthwise Start (Optional)
|
||||
// NOTE: dw_start always has stride=1 (no downsampling here)
|
||||
if (block.dw_start_w) {
|
||||
int k = block.dw_start_w->ne[0]; // 3 or 5
|
||||
int p = k / 2;
|
||||
cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
|
||||
if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
|
||||
}
|
||||
|
||||
// 2. Pointwise Expansion (1x1)
|
||||
if (block.pw_exp_w) {
|
||||
// Standard 1x1 conv, pad=0, stride=1
|
||||
cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
}
|
||||
|
||||
// 3. Depthwise Mid (Optional)
|
||||
// NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
|
||||
if (block.dw_mid_w) {
|
||||
int k = block.dw_mid_w->ne[0]; // 3 or 5
|
||||
|
||||
if (stride > 1) {
|
||||
// Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
|
||||
cur = pad_same_2d(cur, k, k, stride, stride);
|
||||
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
|
||||
} else {
|
||||
// Case: Stride 1 -> Use Standard Symmetric Padding
|
||||
int p = k / 2;
|
||||
cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
|
||||
}
|
||||
|
||||
if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
}
|
||||
|
||||
// 4. Pointwise Projection (1x1)
|
||||
if (block.pw_proj_w) {
|
||||
cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
|
||||
}
|
||||
|
||||
// Apply Layer Scaling if present
|
||||
if (block.layer_scale_w) {
|
||||
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
|
||||
}
|
||||
|
||||
// 5. Residual Connection
|
||||
bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
|
||||
bool same_channel = (inp->ne[2] == cur->ne[2]);
|
||||
if (same_spatial && same_channel) {
|
||||
cur = ggml_add(ctx0, cur, inp);
|
||||
}
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// Attention Block (MQA)
|
||||
ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
|
||||
ggml_tensor * cur = inp;
|
||||
|
||||
// Norm
|
||||
if (block.attn_norm_w) {
|
||||
cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
|
||||
}
|
||||
|
||||
// 1. Q Calculation
|
||||
ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
// 2. K Calculation (Downsampled)
|
||||
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
|
||||
ggml_tensor * k_inp = cur;
|
||||
if (block.attn_k_dw_w) {
|
||||
int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
|
||||
k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
|
||||
k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
|
||||
if (block.attn_k_norm_w) {
|
||||
k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
|
||||
}
|
||||
}
|
||||
ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
// 3. V Calculation (Downsampled)
|
||||
// Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
|
||||
ggml_tensor * v_inp = cur;
|
||||
if (block.attn_v_dw_w) {
|
||||
int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
|
||||
v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
|
||||
v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
|
||||
if (block.attn_v_norm_w) {
|
||||
v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
|
||||
}
|
||||
}
|
||||
ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
|
||||
const int D = k->ne[2]; // Head dimension
|
||||
const int n_head = q->ne[2] / D;
|
||||
const int N = W * H;
|
||||
|
||||
// Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
|
||||
q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
|
||||
q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
|
||||
q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
|
||||
q = ggml_cont(ctx0, q);
|
||||
|
||||
const int Wk = k->ne[0]; const int Hk = k->ne[1];
|
||||
const int M = Wk * Hk;
|
||||
|
||||
// Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
|
||||
k = ggml_reshape_3d(ctx0, k, M, D, B);
|
||||
k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
|
||||
k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
|
||||
k = ggml_cont(ctx0, k);
|
||||
|
||||
// Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
|
||||
v = ggml_reshape_3d(ctx0, v, M, D, B);
|
||||
v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
|
||||
v = ggml_cont(ctx0, v); // [M, D, 1, B]
|
||||
|
||||
// Multi-Query Attention
|
||||
float scale = 1.0f / sqrtf((float)D);
|
||||
|
||||
// Step 1: Compute Q @ K.T
|
||||
ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
|
||||
|
||||
scores = ggml_scale(ctx0, scores, scale);
|
||||
|
||||
scores = ggml_soft_max(ctx0, scores);
|
||||
|
||||
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
|
||||
|
||||
kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
|
||||
kqv = ggml_cont(ctx0, kqv);
|
||||
|
||||
|
||||
kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
|
||||
kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
|
||||
kqv = ggml_cont(ctx0, kqv);
|
||||
|
||||
// Output projection
|
||||
cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
|
||||
|
||||
// Residual & Layer Scale
|
||||
if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
|
||||
if (block.layer_scale_w) {
|
||||
cur = ggml_mul(ctx0, cur, block.layer_scale_w);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, inp);
|
||||
}
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// Builds the full MobileNetV5 vision graph:
//   1. stem convolution (SAME padding, stride 2) + optional bias/norm + GELU
//   2. the sequence of edge-residual / attention / inverted-residual blocks
//   3. Multi-Scale Fusion Adapter (MSFA) over the collected intermediate feature maps
//   4. Gemma 3n multimodal projection (scale, soft-embedding norm, linear projection, post norm)
// Returns the graph `gf` expanded with the final embedding tensor.
ggml_cgraph * clip_graph_mobilenetv5::build() {
    // the stem convolution is applied unconditionally, so its weight must be loaded
    GGML_ASSERT(model.mobilenet_stem_conv_w != nullptr);

    ggml_tensor * inp = build_inp_raw();

    // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
    ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding

    cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
    if (model.mobilenet_stem_conv_b) {
        cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
    }
    if (model.mobilenet_stem_norm_w) {
        cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
    }
    cur = ggml_gelu(ctx0, cur);

    // 2. Blocks
    std::vector<ggml_tensor *> intermediate_features;
    const int total_blocks = model.mobilenet_blocks.size();

    // a block starts a stage if it is the very first block or directly follows a stage end
    auto is_stage_start = [&](int i) {
        if (i == 0) return true;
        for (int end_idx : model.mobilenet_stage_ends) {
            if (i == end_idx + 1) return true;
        }
        return false;
    };

    // feature maps that are fed to the MSFA
    auto is_fusion_point = [&](int i) {
        if (model.mobilenet_stage_ends.size() >= 4) {
            if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
            if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
        } else {
            // fewer than 4 recorded stage ends: only the last block is used
            if (i == total_blocks - 1) return true;
        }
        return false;
    };

    for (int i = 0; i < total_blocks; i++) {
        const auto & block = model.mobilenet_blocks[i];
        const int stride = is_stage_start(i) ? 2 : 1; // the first block of each stage downsamples

        // the block type is selected by which weights are present
        if (block.s0_conv_exp_w) {
            cur = build_edge_residual(cur, block, stride);
        } else if (block.attn_q_w) {
            cur = build_mobilenet_attn(cur, block);
        } else {
            cur = build_inverted_residual(cur, block, stride);
        }

        if (is_fusion_point(i)) {
            intermediate_features.push_back(cur);
        }
    }

    // 3. Multi-Scale Fusion Adapter (MSFA)
    if (!intermediate_features.empty()) {
        // A. Reference Resolution: PyTorch implementation uses inputs[0]
        // We assume intermediate_features[0] is the "High Resolution" target.
        // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32).
        ggml_tensor * target_feat = intermediate_features[0];
        const int high_res_w = target_feat->ne[0];
        const int high_res_h = target_feat->ne[1];

        std::vector<ggml_tensor *> resized_feats;
        resized_feats.reserve(intermediate_features.size());

        // B. Resize inputs to match inputs[0] (High Resolution)
        for (ggml_tensor * feat : intermediate_features) {
            const int feat_w = feat->ne[0];
            const int feat_h = feat->ne[1];

            // PyTorch: if feat_size < high_resolution: interpolate
            if (feat_w < high_res_w || feat_h < high_res_h) {
                // Note: PyTorch 'nearest' works on arbitrary float scales, whereas ggml_upscale
                // takes a single integer factor that is applied to both width and height.
                // Make sure such a factor exists instead of producing a tensor that only
                // fails later when it is concatenated.
                GGML_ASSERT(high_res_w % feat_w == 0);
                GGML_ASSERT(high_res_h % feat_h == 0);
                const int scale_w = high_res_w / feat_w;
                const int scale_h = high_res_h / feat_h;
                GGML_ASSERT(scale_w == scale_h);

                // Upsample (Nearest Neighbor)
                feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
            }
            resized_feats.push_back(feat);
        }

        // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
        cur = resized_feats[0];
        for (size_t k = 1; k < resized_feats.size(); ++k) {
            cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
        }

        // D. FFN (UniversalInvertedResidual)
        // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm

        // 1. Expansion
        if (model.msfa_ffn_expand_w) {
            // 1x1 Conv
            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);

            if (model.msfa_ffn_expand_bn) {
                cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
            }

            cur = ggml_gelu(ctx0, cur);
        }

        // 2. Projection (No DW because kernel_size=0)
        if (model.msfa_ffn_project_w) {
            // 1x1 Conv
            cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);

            // UniversalInvertedResidual typically has a norm after projection
            if (model.msfa_ffn_project_bn) {
                cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
            }
        }

        // E. Final Downsample to Target Resolution (Output Resolution)
        // PyTorch: matches self.output_resolution (e.g. 16x16)
        // NOTE(review): only the width is inspected here; the same factor is used for the height.
        const int target_out_res = 16;
        const int current_w = cur->ne[0];

        if (current_w > target_out_res) {
            GGML_ASSERT(current_w % target_out_res == 0);
            const int s = current_w / target_out_res;

            // Avg Pool: Kernel=s, Stride=s
            cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
        }

        // F. Final Norm
        if (model.msfa_concat_norm_w) {
            cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
        }
    }

    // 4. Gemma 3n Multimodal Projection (Embedder)
    // Input: 'cur' is [Width, Height, Channels, Batch]
    const int W = cur->ne[0];
    const int H = cur->ne[1];
    const int C = cur->ne[2];
    const int B = cur->ne[3];

    GGML_ASSERT(C == hparams.n_embd);

    // 1. Permute and Flatten to [Channels, Tokens, Batch]
    // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
    // A single permute moves the channels to the front: [W, H, C, B] -> [C, W, H, B]
    cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
    cur = ggml_cont(ctx0, cur);
    cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);

    // 2. FEATURE SCALING
    // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
    const float scale_factor = sqrtf((float) C);
    cur = ggml_scale(ctx0, cur, scale_factor);

    // 3. SOFT EMBEDDING NORM
    // PyTorch: self._norm(x) * self.weight
    // We must normalize regardless, then multiply if weight exists.
    {
        const float eps = 1e-6f; // Gemma3n uses 1e-6
        cur = ggml_rms_norm(ctx0, cur, eps);

        if (model.mm_soft_emb_norm_w) {
            // Weight shape is (2048,) -> Element-wise broadcast multiply
            cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
        }
    }

    // 4. PROJECTION
    // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
    // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
    if (model.mm_input_proj_w) {
        cur = build_mm(model.mm_input_proj_w, cur);
    }

    // 5. POST PROJECTION NORM
    // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
    // with_scale=False means weight is registered as buffer with value 1.0
    // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
    {
        const float eps = 1e-6f;
        cur = ggml_rms_norm(ctx0, cur, eps);

        if (model.mm_post_proj_norm_w) {
            // If weight is loaded, multiply (should be ~1.0 anyway)
            cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
        }
    }

    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
192
tools/mtmd/models/models.h
Normal file
192
tools/mtmd/models/models.h
Normal file
@@ -0,0 +1,192 @@
|
||||
#pragma once
|
||||
|
||||
#include "../clip-graph.h"
|
||||
|
||||
/*
|
||||
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
|
||||
* We encourage human contributors to ensure the quality and reliability of the codebase.
|
||||
*/
|
||||
|
||||
// clip_graph subclass `clip_graph_siglip`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_siglip : clip_graph {
|
||||
clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_gemma4v`.
// The constructor forwards (ctx, img) to the base. In addition to build(), this variant
// overrides build_mm(w, x); both definitions live outside this header.
struct clip_graph_gemma4v : clip_graph {
|
||||
clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_pixtral`.
// The constructor forwards (ctx, img) to the base; build() is overridden and defined in a
// separate translation unit (pixtral.cpp in this commit).
struct clip_graph_pixtral : clip_graph {
|
||||
clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_qwen2vl`.
// The constructor forwards (ctx, img) to the base; build() is overridden and defined in a
// separate translation unit (qwen2vl.cpp in this commit).
struct clip_graph_qwen2vl : clip_graph {
|
||||
clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_qwen3vl`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_qwen3vl : clip_graph {
|
||||
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_mimovl`.
// The constructor forwards (ctx, img) to the base. Besides build(), the mat-mul helper
// build_mm() is overridden for the reason given in the comment on its declaration.
struct clip_graph_mimovl : clip_graph {
|
||||
clip_graph_mimovl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
// Force F32 mat-mul accumulation to avoid F16 overflow in the FFN down-proj
|
||||
// when the mmproj is stored in F16 (the source weights are BF16; downcasting
|
||||
// to F16 reduces dynamic range below the SwiGLU output magnitude on the last few layers).
|
||||
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_step3vl`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_step3vl : clip_graph {
|
||||
clip_graph_step3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_youtuvl`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_youtuvl : clip_graph {
|
||||
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_yasa2`.
// The constructor forwards (ctx, img) to the base and build() is overridden. Two extra
// helpers are declared; their definitions are outside this header.
struct clip_graph_yasa2 : clip_graph {
|
||||
clip_graph_yasa2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
// helper taking an input tensor, weight w, bias b and an epsilon (default 1e-6)
// NOTE(review): presumably a layer norm over the channel dimension — confirm in the definition
ggml_tensor * layer_norm_channels(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b, float eps = 1e-6f);
|
||||
// helper taking an input tensor, weight w and bias b
// NOTE(review): name suggests ConvNeXt global response normalization — confirm in the definition
ggml_tensor * convnext_grn(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b);
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_minicpmv`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_minicpmv : clip_graph {
|
||||
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_minicpmv4_6`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_minicpmv4_6 : clip_graph {
|
||||
clip_graph_minicpmv4_6(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_internvl`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_internvl : clip_graph {
|
||||
clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_nemotron_v2_vl`.
// The constructor forwards (ctx, img) to the base; build() is overridden and defined in a
// separate translation unit (nemotron-v2-vl.cpp in this commit).
struct clip_graph_nemotron_v2_vl : clip_graph {
|
||||
clip_graph_nemotron_v2_vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_llama4`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_llama4 : clip_graph {
|
||||
clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_kimivl`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_kimivl : clip_graph {
|
||||
clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_paddleocr`.
// The constructor forwards (ctx, img) to the base; build() is overridden and defined in a
// separate translation unit (paddleocr.cpp in this commit).
struct clip_graph_paddleocr : clip_graph {
|
||||
clip_graph_paddleocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_dotsocr`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_dotsocr : clip_graph {
|
||||
clip_graph_dotsocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_cogvlm`.
// The constructor forwards (ctx, img) to the base; build() is overridden and defined in a
// separate translation unit (cogvlm.cpp in this commit).
struct clip_graph_cogvlm : clip_graph {
|
||||
clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_llava`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_llava : clip_graph {
|
||||
clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_whisper_enc`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
// NOTE(review): the name suggests an audio encoder even though the input type is
// clip_image_f32 — confirm how audio input is carried.
struct clip_graph_whisper_enc : clip_graph {
|
||||
clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_deepseekocr`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_deepseekocr : clip_graph {
|
||||
clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_conformer`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_conformer : clip_graph {
|
||||
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_granite_speech`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_granite_speech : clip_graph {
|
||||
clip_graph_granite_speech(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_gemma4a`.
// The constructor forwards (ctx, img) to the base. In addition to build(), this variant
// overrides build_mm(w, x); both definitions live outside this header.
struct clip_graph_gemma4a : clip_graph {
|
||||
clip_graph_gemma4a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_glm4v`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_glm4v : clip_graph {
|
||||
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_hunyuanocr`.
// The constructor only forwards (ctx, img) to the clip_graph base; build() is overridden
// and defined outside this header.
struct clip_graph_hunyuanocr : clip_graph {
|
||||
clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_mobilenetv5`.
// The constructor forwards (ctx, img) to the base. build() assembles the whole graph and
// uses the helper members declared below (stem, per-block builders, norm and padding).
struct clip_graph_mobilenetv5 : clip_graph {
|
||||
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
// norm helper applied to 2D feature maps with a weight tensor; eps defaults to 1e-6
// NOTE(review): presumably an RMS norm over the channel dimension — confirm in the definition
ggml_tensor * rms_norm_2d(
|
||||
ggml_tensor * inp,
|
||||
ggml_tensor * weight,
|
||||
float eps = 1e-6f);
|
||||
|
||||
// padding helper; callers invoke it before a convolution that is then run with padding 0
// ("Apply SAME padding" at the call sites). Dilation defaults to 1 in both directions.
ggml_tensor* pad_same_2d(
|
||||
ggml_tensor* inp,
|
||||
int kernel_h,
|
||||
int kernel_w,
|
||||
int stride_h,
|
||||
int stride_w,
|
||||
int dilation_h = 1,
|
||||
int dilation_w = 1);
|
||||
|
||||
// block builder used by build() when block.s0_conv_exp_w is set; stride is 2 at a stage start, else 1
ggml_tensor * build_edge_residual(
|
||||
ggml_tensor * inp,
|
||||
const mobilenetv5_block & block,
|
||||
int stride);
|
||||
|
||||
// block builder used by build() when neither s0_conv_exp_w nor attn_q_w is set
ggml_tensor * build_inverted_residual(
|
||||
ggml_tensor * inp,
|
||||
const mobilenetv5_block & block,
|
||||
int stride);
|
||||
|
||||
// attention (MQA) block builder used by build() when block.attn_q_w is set; takes no stride
ggml_tensor * build_mobilenet_attn(
|
||||
ggml_tensor * inp,
|
||||
const mobilenetv5_block & block);
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_qwen3a`.
// The constructor forwards (ctx, img) to the base; build() is overridden and defined in a
// separate translation unit (qwen3a.cpp in this commit).
struct clip_graph_qwen3a : clip_graph {
|
||||
clip_graph_qwen3a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
// clip_graph subclass `clip_graph_kimik25`.
// The constructor forwards (ctx, img) to the base and build() is overridden. One extra
// helper is declared; its definition is outside this header.
struct clip_graph_kimik25 : clip_graph {
|
||||
clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
// helper returning a tensor; takes the interpolation mode as an unsigned 32-bit value
// NOTE(review): presumably resizes the learned position embeddings — confirm in the definition
ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode);
|
||||
};
|
||||
35
tools/mtmd/models/nemotron-v2-vl.cpp
Normal file
35
tools/mtmd/models/nemotron-v2-vl.cpp
Normal file
@@ -0,0 +1,35 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the Nemotron v2 VL vision graph: patch embeddings plus position embeddings, with
// the class-embedding rows prepended, go through the ViT; the prepended rows are then
// dropped and the remaining patches are merged and projected.
ggml_cgraph * clip_graph_nemotron_v2_vl::build() {
    // both embeddings are used unconditionally below
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    // every row of the class embedding becomes one extra leading token
    const int n_registers = model.class_embedding->ne[1];
    const int n_pos       = n_patches + n_registers;

    // add position embeddings (pre-downsampled during GGUF conversion for fixed 512x512 input)
    ggml_tensor * tokens = ggml_add(ctx0, build_inp(), model.position_embeddings);
    cb(tokens, "inp_pos", -1);

    // the extra tokens go in front of the patch tokens
    tokens = ggml_concat(ctx0, model.class_embedding, tokens, 1);

    ggml_tensor * out = build_vit(tokens, n_pos, NORM_TYPE_NORMAL, hparams.ffn_op, nullptr, nullptr);

    // keep only the patch tokens: the view starts right after the n_registers leading rows
    out = ggml_view_2d(
        ctx0, out,
        n_embd, n_patches,
        ggml_row_size(out->type, n_embd),
        n_registers * ggml_row_size(out->type, n_embd));

    out = build_patch_merge_permute(out, model.hparams.n_merge);

    // projector: RMS norm followed by a two-layer FFN without biases
    out = build_norm(out, model.mm_0_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
    out = build_ffn(out, model.mm_1_w, nullptr, nullptr, nullptr, model.mm_3_w, nullptr, FFN_RELU_SQR, -1);

    ggml_build_forward_expand(gf, out);
    return gf;
}
|
||||
52
tools/mtmd/models/paddleocr.cpp
Normal file
52
tools/mtmd/models/paddleocr.cpp
Normal file
@@ -0,0 +1,52 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the PaddleOCR vision graph: a ViT with learned (resized) position embeddings and
// M-RoPE applied through the add_pos hook, followed by the norm + patch-merge + FFN projector.
ggml_cgraph * clip_graph_paddleocr::build() {
    const int n_pos          = n_patches;
    const int n_position_ids = n_pos * 4; // m-rope requires 4 dim per position

    // the head dimension is split into four equal rope sections
    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

    // graph input holding the rope positions
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_position_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // rotary position hook handed to build_vit (the layer argument is not needed)
    auto add_pos = [&](ggml_tensor * x, const clip_layer &) {
        return ggml_rope_multi(
            ctx0, x, positions, nullptr,
            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
            32768, 10000, 1, 0, 1, 32, 1);
    };

    ggml_tensor * learned_pos_embd = resize_position_embeddings();
    ggml_tensor * inp              = build_inp();

    ggml_tensor * out = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, learned_pos_embd, add_pos);
    cb(out, "vit_out", -1);

    // mlp_AR paddleocr projector
    {
        const float proj_norm_eps = 1e-5;
        const int   scale_factor  = model.hparams.n_merge;

        out = build_norm(out, model.mm_input_norm_w, model.mm_input_norm_b, NORM_TYPE_NORMAL, proj_norm_eps, -1);
        out = build_patch_merge_permute(out, scale_factor);
        out = build_ffn(
            out,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            hparams.ffn_op, -1);
        cb(out, "mlp_out", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, out);
    return gf;
}
|
||||
86
tools/mtmd/models/pixtral.cpp
Normal file
86
tools/mtmd/models/pixtral.cpp
Normal file
@@ -0,0 +1,86 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the Pixtral vision graph: ViT with 2D RoPE, optional Mistral Small 3.1 patch
// merger, the GELU projector, and optional insertion of [IMG_BREAK] tokens between rows.
ggml_cgraph * clip_graph_pixtral::build() {
    const int n_merge = hparams.n_merge;

    // 2D input positions: one I32 graph input per axis, one entry per patch
    auto new_pos_input = [&](const char * name) {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };
    ggml_tensor * pos_h = new_pos_input("pos_h");
    ggml_tensor * pos_w = new_pos_input("pos_w");

    // rotary position hook handed to build_vit (the layer argument is not needed)
    auto add_pos = [&](ggml_tensor * x, const clip_layer &) {
        return build_rope_2d(ctx0, x, pos_h, pos_w, hparams.rope_theta, true);
    };

    ggml_tensor * inp = build_inp();
    ggml_tensor * out = build_vit(
        inp, n_patches,
        NORM_TYPE_RMS,
        hparams.ffn_op,
        nullptr, // no learned pos embd
        add_pos);

    // mistral small 3.1 patch merger
    // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
    if (model.mm_patch_merger_w) {
        GGML_ASSERT(hparams.n_merge > 0);

        ggml_tensor * normed = ggml_rms_norm(ctx0, out, eps);
        out = ggml_mul(ctx0, normed, model.mm_input_norm_w);

        // reshape image tokens to 2D grid
        out = ggml_reshape_3d(ctx0, out, n_embd, n_patches_x, n_patches_y);
        out = ggml_cont(ctx0, ggml_permute(ctx0, out, 2, 0, 1, 3)); // [x, y, n_embd]

        // torch.nn.functional.unfold is just an im2col under the hood
        // we just need a dummy kernel to make it work
        ggml_tensor * dummy_kernel = ggml_view_3d(ctx0, out, n_merge, n_merge, out->ne[2], 0, 0, 0);
        out = ggml_im2col(ctx0, dummy_kernel, out, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);

        // project to n_embd
        out = ggml_reshape_2d(ctx0, out, out->ne[0], out->ne[1] * out->ne[2]);
        out = build_mm(model.mm_patch_merger_w, out);
    }

    // LlavaMultiModalProjector (always using GELU activation)
    out = build_ffn(
        out,
        model.mm_1_w, model.mm_1_b,
        nullptr, nullptr,
        model.mm_2_w, model.mm_2_b,
        FFN_GELU,
        -1);

    // arrangement of the [IMG_BREAK] token
    if (model.token_embd_img_break) {
        // not efficient, but works
        // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
        // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
        // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]

        // without a merger (n_merge <= 0) the patch grid is used as is
        const int divisor         = n_merge > 0 ? n_merge : 1;
        const int p_y             = n_patches_y / divisor;
        const int p_x             = n_patches_x / divisor;
        const int p_total         = p_x * p_y;
        const int n_embd_text     = out->ne[0];
        const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row

        ggml_tensor * rows = ggml_reshape_3d(ctx0, out, n_embd_text, p_x, p_y);

        // NOTE(review): the fresh tensor is "cleared" by scaling with zero; this relies on its
        // initial contents being finite — confirm that this holds for the allocator in use
        ggml_tensor * brk = ggml_new_tensor_3d(ctx0, rows->type, n_embd_text, 1, p_y);
        brk = ggml_scale(ctx0, brk, 0.0); // clear the tensor
        brk = ggml_add(ctx0, brk, model.token_embd_img_break);

        rows = ggml_concat(ctx0, rows, brk, 1);

        // flatten again and cut off the [IMG_BREAK] that follows the last row
        out = ggml_view_2d(
            ctx0, rows,
            n_embd_text, n_tokens_output,
            ggml_row_size(rows->type, n_embd_text), 0);
    }

    // build the graph
    ggml_build_forward_expand(gf, out);
    return gf;
}
|
||||
183
tools/mtmd/models/qwen2vl.cpp
Normal file
183
tools/mtmd/models/qwen2vl.cpp
Normal file
@@ -0,0 +1,183 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the Qwen2-VL / Qwen2.5-VL vision graph: dual-conv patch embedding with 2x2 patch
// regrouping, optional window attention (token reordering plus attention mask), transformer
// layers with M-RoPE, and the merger projector that consumes groups of four tokens.
ggml_cgraph * clip_graph_qwen2vl::build() {
    // this graph has no handling for a patch bias or a class embedding
    GGML_ASSERT(model.patch_bias == nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int  batch_size       = 1;
    const int  n_wa_pattern     = hparams.n_wa_pattern;
    const bool use_window_attn  = hparams.n_wa_pattern > 0;
    const int  n_pos            = n_patches;
    const int  num_position_ids = n_pos * 4; // m-rope requires 4 dim per position

    // qwen 2.5 vl uses RMS norm, qwen 2 vl the regular norm
    const norm_type norm_t = (proj_type == PROJECTOR_TYPE_QWEN25VL) ? NORM_TYPE_RMS : NORM_TYPE_NORMAL;

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

    ggml_tensor * raw     = build_inp_raw();
    ggml_tensor * patches = ggml_conv_2d(ctx0, model.patch_embeddings_0, raw, patch_size, patch_size, 0, 0, 1, 1);

    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension
    {
        ggml_tensor * patches_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, raw, patch_size, patch_size, 0, 0, 1, 1);
        patches = ggml_add(ctx0, patches, patches_1);

        patches = ggml_permute(ctx0, patches, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
        patches = ggml_cont_4d(ctx0, patches, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        patches = ggml_reshape_4d(ctx0, patches, n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        patches = ggml_permute(ctx0, patches, 0, 2, 1, 3);
        patches = ggml_cont_3d(ctx0, patches, n_embd, n_patches_x * n_patches_y, batch_size);
    }

    ggml_tensor * hidden         = patches;
    ggml_tensor * window_mask    = nullptr;
    ggml_tensor * window_idx     = nullptr;
    ggml_tensor * inv_window_idx = nullptr;

    // graph input holding the rope positions
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // M-RoPE, applied identically to queries and keys
    auto apply_mrope = [&](ggml_tensor * t) {
        return ggml_rope_multi(
            ctx0, t, positions, nullptr,
            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
    };

    // pre-layernorm
    if (model.pre_ln_w) {
        hidden = build_norm(hidden, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
    }

    if (use_window_attn) {
        // handle window attention inputs
        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
        ggml_set_name(inv_window_idx, "inv_window_idx");
        ggml_set_input(inv_window_idx);

        // mask for window attention
        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
        ggml_set_name(window_mask, "window_mask");
        ggml_set_input(window_mask);

        // with flash attention enabled the mask is cast to f16
        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
        }

        // hidden shape: [n_embd, n_patches_x * n_patches_y, batch_size]
        // reorder the tokens in groups of four using inv_window_idx
        GGML_ASSERT(batch_size == 1);
        hidden = ggml_reshape_2d(ctx0, hidden, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
        hidden = ggml_get_rows(ctx0, hidden, inv_window_idx);
        hidden = ggml_reshape_3d(ctx0, hidden, n_embd, n_patches_x * n_patches_y, batch_size);
    }

    // loop over layers
    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];

        // with window attention only every n_wa_pattern-th layer attends globally
        const bool full_attn = !use_window_attn || (il + 1) % n_wa_pattern == 0;

        // `hidden` is the residual stream, `x` the tensor being transformed
        ggml_tensor * x = hidden;

        // layernorm1
        x = build_norm(x, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
        cb(x, "ln1", il);

        // self-attention
        {
            ggml_tensor * q = ggml_add(ctx0, build_mm(layer.q_w, x), layer.q_b);
            ggml_tensor * k = ggml_add(ctx0, build_mm(layer.k_w, x), layer.k_b);
            ggml_tensor * v = ggml_add(ctx0, build_mm(layer.v_w, x), layer.v_b);

            q = ggml_reshape_3d(ctx0, q, d_head, n_head, n_patches);
            k = ggml_reshape_3d(ctx0, k, d_head, n_head, n_patches);
            v = ggml_reshape_3d(ctx0, v, d_head, n_head, n_patches);

            cb(q, "Qcur", il);
            cb(k, "Kcur", il);
            cb(v, "Vcur", il);

            // apply M-RoPE
            q = apply_mrope(q);
            k = apply_mrope(k);

            cb(q, "Qcur_rope", il);
            cb(k, "Kcur_rope", il);

            // layers with full attention run without a mask
            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;

            x = build_attn(layer.o_w, layer.o_b, q, k, v, attn_mask, kq_scale, il);
            cb(x, "attn_out", il);
        }

        // re-add the layer input, e.g., residual
        x = ggml_add(ctx0, x, hidden);
        hidden = x;
        cb(x, "ffn_inp", il);

        // layernorm2
        x = build_norm(x, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
        cb(x, "ffn_inp_normed", il);

        // ffn
        x = build_ffn(
            x,
            layer.ff_up_w,   layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);
        cb(x, "ffn_out", il);

        // residual 2
        x = ggml_add(ctx0, hidden, x);
        cb(x, "layer_out", il);

        hidden = x;
    }

    // post-layernorm
    if (model.post_ln_w) {
        hidden = build_norm(hidden, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
    }

    // multimodal projection: every four consecutive tokens are merged into one row
    ggml_tensor * embeddings = ggml_reshape_3d(ctx0, hidden, n_embd * 4, n_pos / 4, batch_size);
    embeddings = build_ffn(
        embeddings,
        model.mm_0_w, model.mm_0_b,
        nullptr, nullptr,
        model.mm_1_w, model.mm_1_b,
        FFN_GELU,
        -1);

    if (use_window_attn) {
        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
        ggml_set_name(window_idx, "window_idx");
        ggml_set_input(window_idx);

        // reorder the merged tokens using window_idx
        GGML_ASSERT(batch_size == 1);
        embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
        embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);
    return gf;
}
|
||||
68
tools/mtmd/models/qwen3a.cpp
Normal file
68
tools/mtmd/models/qwen3a.cpp
Normal file
@@ -0,0 +1,68 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the compute graph for clip_graph_qwen3a:
//   raw input -> 3x (conv2d + bias + erf-GELU) -> flatten -> linear projection
//   -> build_vit() transformer -> 2-layer FFN projector.
// Returns the graph `gf` with the projected tensor registered as its output.
ggml_cgraph * clip_graph_qwen3a::build() {
    // One conv stage: conv2d with stride 2, padding 1, dilation 1, followed by bias and erf-GELU.
    auto conv_stage = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
        x = ggml_conv_2d(ctx0, w, x, 2, 2, 1, 1, 1, 1);
        x = ggml_add(ctx0, x, b);
        return ggml_gelu_erf(ctx0, x);
    };

    ggml_tensor * feat = build_inp_raw(1);

    // conv2d block
    // TODO: do we need to split by chunks of n_window each like on transformers impl?
    feat = conv_stage(feat, model.conv2d_1_w, model.conv2d_1_b);
    feat = conv_stage(feat, model.conv2d_2_w, model.conv2d_2_b);
    feat = conv_stage(feat, model.conv2d_3_w, model.conv2d_3_b);

    // feat is [n_pos, n_mels/8, channels, 1] (W, H, C, N) per the original author's note
    cb(feat, "after_conv_blocks", -1);

    {
        // capture the spatial extents before the layout is shuffled
        const int64_t conv_w = feat->ne[0];
        const int64_t conv_h = feat->ne[1]; // e.g. 128/8 = 16

        // move the batch axis next to W so that (H, C) can be merged into one feature axis;
        // after the permute, ne[3] holds the channel count
        ggml_tensor * shuffled = ggml_permute(ctx0, feat, 0, 2, 3, 1);
        feat = ggml_cont(ctx0, shuffled);
        feat = ggml_reshape_2d(ctx0, feat, conv_w, conv_h * feat->ne[3]); // [n_pos, H*C]
        feat = ggml_cont(ctx0, ggml_transpose(ctx0, feat));               // [H*C, n_pos]
    }

    // project to n_embd (the bias is optional)
    feat = ggml_mul_mat(ctx0, model.conv_out_w, feat);
    if (model.conv_out_b != nullptr) {
        feat = ggml_add(ctx0, feat, model.conv_out_b);
    }
    cb(feat, "after_conv_out", -1);

    const int64_t n_tokens = feat->ne[1];

    // use only the first n_tokens rows of the position embedding table
    ggml_tensor * pos_slice = ggml_view_2d(
        ctx0, model.position_embeddings,
        model.position_embeddings->ne[0], n_tokens,
        model.position_embeddings->nb[1], 0);

    ggml_tensor * out = build_vit(
        feat, n_tokens,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        pos_slice,
        nullptr);
    cb(out, "after_transformer", -1);

    // projector: up (mm_1) -> erf-GELU -> down (mm_2), no gate
    out = build_ffn(out,
        model.mm_1_w, model.mm_1_b,
        nullptr, nullptr,
        model.mm_2_w, model.mm_2_b,
        FFN_GELU_ERF,
        -1);
    cb(out, "projected", -1);

    ggml_build_forward_expand(gf, out);
    return gf;
}
|
||||
193
tools/mtmd/models/qwen3vl.cpp
Normal file
193
tools/mtmd/models/qwen3vl.cpp
Normal file
@@ -0,0 +1,193 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the compute graph for clip_graph_qwen3vl:
//   two patch convolutions summed -> 2x2 patch regrouping -> patch bias
//   -> resized learned position embeddings -> transformer layers with M-RoPE
//   (optionally emitting "deepstack" features) -> post-norm -> merge + FFN projector.
// Deepstack features, when present, are concatenated onto the projector output
// along the feature dimension. Returns the graph `gf`.
ggml_cgraph * clip_graph_qwen3vl::build() {
    GGML_ASSERT(model.patch_bias != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int n_batch   = 1;
    const int n_tokens  = n_patches;
    const int n_pos_ids = n_tokens * 4; // m-rope requires 4 dim per position

    norm_type ln_kind = NORM_TYPE_NORMAL;

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

    // Reorders a [n_embd, n_patches_x, n_patches_y] tensor so that patches are grouped
    // in pairs along both axes; the result is [n_embd, n_patches_x * n_patches_y, n_batch].
    auto regroup_2x2 = [&](ggml_tensor * x) {
        x = ggml_cont_4d(ctx0, x, n_embd * 2, n_patches_x / 2, n_patches_y, n_batch);
        x = ggml_reshape_4d(ctx0, x, n_embd * 2, n_patches_x / 2, 2, n_batch * (n_patches_y / 2));
        x = ggml_permute(ctx0, x, 0, 2, 1, 3);
        return ggml_cont_3d(ctx0, x, n_embd, n_patches_x * n_patches_y, n_batch);
    };

    // Strided view into the fused QKV activation, starting `col` elements into each row.
    auto qkv_slice = [&](ggml_tensor * qkv, int64_t col) {
        return ggml_view_3d(ctx0, qkv, d_head, n_head, n_tokens,
            /* nb1    */ ggml_row_size(qkv->type, d_head),
            /* nb2    */ qkv->nb[1],
            /* offset */ ggml_row_size(qkv->type, col));
    };

    ggml_tensor * pixels = build_inp_raw();
    ggml_tensor * x = ggml_conv_2d(ctx0, model.patch_embeddings_0, pixels, patch_size, patch_size, 0, 0, 1, 1);

    // the 2x2 regrouping below needs an even number of patches on both axes
    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension: the same input is convolved with a second kernel and summed
    {
        ggml_tensor * x_second = ggml_conv_2d(ctx0, model.patch_embeddings_1, pixels, patch_size, patch_size, 0, 0, 1, 1);
        x = ggml_add(ctx0, x, x_second);

        x = ggml_permute(ctx0, x, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
        x = regroup_2x2(x);
    }

    // add patch bias (already asserted non-null above; the check is kept as in the original)
    if (model.patch_bias != nullptr) {
        x = ggml_add(ctx0, x, model.patch_bias);
        cb(x, "patch_bias", -1);
    }

    // calculate absolute position embedding and apply it in the same regrouped order
    ggml_tensor * pos_embd = regroup_2x2(resize_position_embeddings());
    x = ggml_add(ctx0, x, pos_embd);
    cb(x, "inp_pos_emb", -1);

    ggml_tensor * residual = x;

    // graph input holding the M-RoPE position ids
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    auto apply_mrope = [&](ggml_tensor * t) {
        return ggml_rope_multi(
            ctx0, t, positions, nullptr,
            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
    };

    // pre-layernorm (optional)
    if (model.pre_ln_w) {
        residual = build_norm(residual, model.pre_ln_w, model.pre_ln_b, ln_kind, eps, -1);
    }

    // deepstack features (stacked along the feature dimension):
    // [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
    ggml_tensor * deepstack = nullptr;
    const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl

    for (int il = 0; il < n_layer; il++) {
        auto & layer = model.layers[il];

        // layernorm1
        ggml_tensor * h = build_norm(residual, layer.ln_1_w, layer.ln_1_b, ln_kind, eps, il);
        cb(h, "ln1", il);

        // self-attention on a fused QKV projection
        {
            h = build_mm(layer.qkv_w, h);
            h = ggml_add(ctx0, h, layer.qkv_b);

            ggml_tensor * q = qkv_slice(h, 0);
            ggml_tensor * k = qkv_slice(h, n_embd);
            ggml_tensor * v = qkv_slice(h, 2 * n_embd);

            cb(q, "Qcur", il);
            cb(k, "Kcur", il);
            cb(v, "Vcur", il);

            // apply M-RoPE to queries and keys
            q = apply_mrope(q);
            k = apply_mrope(k);

            cb(q, "Qcur_rope", il);
            cb(k, "Kcur_rope", il);

            h = build_attn(layer.o_w, layer.o_b,
                q, k, v, nullptr, kq_scale, il);
            cb(h, "attn_out", il);
        }

        // residual 1: re-add the layer input
        h = ggml_add(ctx0, h, residual);
        residual = h;
        cb(h, "ffn_inp", il);

        // layernorm2
        h = build_norm(h, layer.ln_2_w, layer.ln_2_b, ln_kind, eps, il);
        cb(h, "ffn_inp_normed", il);

        // ffn
        h = build_ffn(h,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);
        cb(h, "ffn_out", il);

        // residual 2
        h = ggml_add(ctx0, residual, h);
        cb(h, "layer_out", il);

        if (layer.has_deepstack()) {
            // merge `merge_factor` neighbouring tokens, then norm + 2-layer GELU FFN
            ggml_tensor * feat = ggml_reshape_3d(ctx0, h, n_embd * merge_factor, n_tokens / merge_factor, n_batch);
            feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, ln_kind, eps, il);
            feat = build_ffn(feat,
                layer.deepstack_fc1_w, layer.deepstack_fc1_b,
                nullptr, nullptr,
                layer.deepstack_fc2_w, layer.deepstack_fc2_b,
                ffn_op_type::FFN_GELU, il);

            // first deepstack layer seeds the tensor; later ones are concatenated
            // along the feature dimension
            deepstack = deepstack ? ggml_concat(ctx0, deepstack, feat, 0) : feat;
        }

        residual = h;
    }

    // post-layernorm (optional)
    if (model.post_ln_w) {
        residual = build_norm(residual, model.post_ln_w, model.post_ln_b, ln_kind, eps, n_layer);
    }

    // multimodal projection
    // NOTE(review): the merge here is hard-coded to 4 while the deepstack branch uses
    // merge_factor; the two only agree when merge_factor == 4 - confirm this is intended.
    ggml_tensor * embeddings = ggml_reshape_3d(ctx0, residual, n_embd * 4, n_tokens / 4, n_batch);

    embeddings = build_ffn(embeddings,
        model.mm_0_w, model.mm_0_b,
        nullptr, nullptr,
        model.mm_1_w, model.mm_1_b,
        ffn_op_type::FFN_GELU, -1);

    // concat along the feature dimension
    if (deepstack) {
        embeddings = ggml_concat(ctx0, embeddings, deepstack, 0);
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
|
||||
94
tools/mtmd/models/siglip.cpp
Normal file
94
tools/mtmd/models/siglip.cpp
Normal file
@@ -0,0 +1,94 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the compute graph for clip_graph_siglip: the shared build_vit() encoder
// followed by a projector selected by `proj_type`. Aborts on projector types this
// builder does not handle. Returns the graph `gf`.
ggml_cgraph * clip_graph_siglip::build() {
    ggml_tensor * patches = build_inp();

    // LFM2 and PHI4 use position embeddings resized for the current input;
    // every other projector uses the stored table as-is
    const bool resize_pos = proj_type == PROJECTOR_TYPE_LFM2 || proj_type == PROJECTOR_TYPE_PHI4;
    ggml_tensor * pos_embd = resize_pos ? resize_position_embeddings() : model.position_embeddings;

    ggml_tensor * out = build_vit(
        patches, n_patches,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        pos_embd,
        nullptr);

    switch (proj_type) {
        case PROJECTOR_TYPE_GEMMA3:
            {
                const int n_batch = 1;
                GGML_ASSERT(n_patches_x == n_patches_y);
                const int side   = n_patches_x;
                const int kernel = hparams.n_merge;

                // lay the tokens out as a [side, side, n_embd, batch] image
                out = ggml_transpose(ctx0, out);
                out = ggml_cont_4d(ctx0, out, side, side, n_embd, n_batch);

                // doing a pool2d to reduce the number of output tokens
                out = ggml_pool_2d(ctx0, out, GGML_OP_POOL_AVG, kernel, kernel, kernel, kernel, 0, 0);

                // back to a token sequence; the pooled grid is square, so ne[0]^2 tokens
                const int64_t pooled_side = out->ne[0];
                out = ggml_reshape_3d(ctx0, out, pooled_side * pooled_side, n_embd, n_batch);
                out = ggml_cont(ctx0, ggml_transpose(ctx0, out));

                // apply norm before projection
                out = ggml_rms_norm(ctx0, out, eps);
                out = ggml_mul(ctx0, out, model.mm_soft_emb_norm_w);

                // apply projection (the stored weight is transposed before the matmul)
                ggml_tensor * proj_w = ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w));
                out = ggml_mul_mat(ctx0, proj_w, out);
            } break;

        case PROJECTOR_TYPE_IDEFICS3:
            {
                // pixel_shuffle
                // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
                const int scale_factor = model.hparams.n_merge;
                out = build_patch_merge_permute(out, scale_factor);
                out = build_mm(model.mm_fc_w, out);
            } break;

        case PROJECTOR_TYPE_LFM2:
            {
                // pixel unshuffle block
                const int scale_factor = model.hparams.n_merge;
                out = build_patch_merge_permute(out, scale_factor);

                // projection, in LFM2-VL input norm is optional
                if (model.mm_input_norm_w) {
                    out = ggml_norm(ctx0, out, 1e-5); // default nn.LayerNorm
                    out = ggml_mul(ctx0, out, model.mm_input_norm_w);
                }

                if (model.mm_input_norm_b) {
                    out = ggml_add(ctx0, out, model.mm_input_norm_b);
                }

                out = build_ffn(out,
                    model.mm_1_w, model.mm_1_b,
                    nullptr, nullptr,
                    model.mm_2_w, model.mm_2_b,
                    FFN_GELU,
                    -1);
            } break;

        case PROJECTOR_TYPE_JANUS_PRO:
            {
                out = build_ffn(out,
                    model.mm_0_w, model.mm_0_b,
                    nullptr, nullptr,
                    model.mm_1_w, model.mm_1_b,
                    hparams.ffn_op,
                    -1);
            } break;

        case PROJECTOR_TYPE_PHI4:
            {
                out = build_ffn(out,
                    model.mm_0_w, model.mm_0_b,
                    nullptr, nullptr,
                    model.mm_2_w, model.mm_2_b,
                    FFN_GELU,
                    -1);
            } break;

        default:
            GGML_ABORT("SigLIP: Unsupported projector type");
    }

    // build the graph
    ggml_build_forward_expand(gf, out);

    return gf;
}
|
||||
81
tools/mtmd/models/step3vl.cpp
Normal file
81
tools/mtmd/models/step3vl.cpp
Normal file
@@ -0,0 +1,81 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the compute graph for clip_graph_step3vl:
//   build_vit() encoder with 2D RoPE -> two stride-2 conv downsamplers (with bias)
//   -> flatten to tokens -> linear projection with mm_model_proj.
// Returns the graph `gf`.
ggml_cgraph * clip_graph_step3vl::build() {
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    norm_type ln_kind = NORM_TYPE_NORMAL;

    // graph inputs: one I32 position per patch for each of the two RoPE axes
    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    ggml_tensor * patches  = build_inp();
    ggml_tensor * pos_embd = resize_position_embeddings();

    // positional hook handed to build_vit(): applies 2D RoPE, ignoring the layer argument
    auto rope_2d = [&](ggml_tensor * t, const clip_layer &) {
        return build_rope_2d(ctx0, t, pos_w, pos_h, hparams.rope_theta, false);
    };

    // Adds `bias` to a [w, h, c] tensor by flattening it to [c, w*h], adding, and
    // restoring the original layout. A null bias is a no-op.
    auto add_spatial_bias = [&](ggml_tensor * t, ggml_tensor * bias) -> ggml_tensor * {
        if (bias != nullptr) {
            const int64_t w = t->ne[0];
            const int64_t h = t->ne[1];
            const int64_t c = t->ne[2];

            t = ggml_reshape_2d(ctx0, t, w * h, c);
            t = ggml_cont(ctx0, ggml_transpose(ctx0, t));
            t = ggml_add(ctx0, t, bias);
            t = ggml_cont(ctx0, ggml_transpose(ctx0, t));
            t = ggml_reshape_3d(ctx0, t, w, h, c);
        }
        return t;
    };

    ggml_tensor * out = build_vit(
        patches,
        n_patches,
        ln_kind,
        hparams.ffn_op,
        pos_embd,
        rope_2d);
    cb(out, "vit_out", -1);

    // [n_embd, n_patches] -> [w, h, n_embd] for spatial downsampling convolutions.
    out = ggml_permute(ctx0, out, 1, 0, 2, 3);
    out = ggml_cont_3d(ctx0, out, n_patches_x, n_patches_y, n_embd);

    // First downsampler: Conv2d(1536 -> 3072, k=3, s=2, p=1)
    out = ggml_conv_2d(ctx0, model.mm_0_w, out, 2, 2, 1, 1, 1, 1);
    out = add_spatial_bias(out, model.mm_0_b);
    cb(out, "downsample_0", -1);

    // Second downsampler: Conv2d(3072 -> 6144, k=3, s=2, p=1)
    out = ggml_conv_2d(ctx0, model.mm_1_w, out, 2, 2, 1, 1, 1, 1);
    out = add_spatial_bias(out, model.mm_1_b);
    cb(out, "downsample_1", -1);

    // [w, h, c] -> [c, w*h]
    {
        const int64_t n_spatial = out->ne[0] * out->ne[1];
        out = ggml_reshape_3d(ctx0, out, n_spatial, out->ne[2], out->ne[3]);
        out = ggml_cont(ctx0, ggml_permute(ctx0, out, 1, 0, 2, 3));
    }
    cb(out, "downsample_flatten", -1);

    // Final projector: Linear(6144 -> projection_dim)
    out = ggml_mul_mat(ctx0, model.mm_model_proj, out);
    cb(out, "projector_out", -1);

    ggml_build_forward_expand(gf, out);
    return gf;
}
|
||||
137
tools/mtmd/models/whisper-enc.cpp
Normal file
137
tools/mtmd/models/whisper-enc.cpp
Normal file
@@ -0,0 +1,137 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the compute graph for clip_graph_whisper_enc:
//   raw input -> two conv1d + erf-GELU stages -> build_vit() transformer
//   -> optional frame stacking -> projector selected by `proj_type`.
// Aborts on projector types this builder does not handle. Returns the graph `gf`.
ggml_cgraph * clip_graph_whisper_enc::build() {
    const int n_frames = img.nx;
    const int n_tokens = n_frames / 2; // the second conv below uses stride 2
    GGML_ASSERT(model.position_embeddings->ne[1] >= n_tokens);

    ggml_tensor * feat = build_inp_raw(1);

    // conv1d block: (conv + bias + gelu) twice, then transpose for the transformer
    {
        ggml_tensor * conv = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, feat, 1, 1);
        conv = ggml_add(ctx0, conv, model.conv1d_1_b);
        conv = ggml_gelu_erf(ctx0, conv);

        conv = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, conv, 2, 1);
        conv = ggml_add(ctx0, conv, model.conv1d_2_b);
        conv = ggml_gelu_erf(ctx0, conv);

        feat = ggml_cont(ctx0, ggml_transpose(ctx0, conv));
        cb(feat, "after_conv1d", -1);
    }

    // sanity check (only check one layer, but it should be the same for all)
    {
        const auto & first = model.layers[0];
        GGML_ASSERT(first.ln_1_w && first.ln_1_b);
        GGML_ASSERT(first.ln_2_w && first.ln_2_b);
        GGML_ASSERT(first.q_b);
        GGML_ASSERT(first.v_b);
        GGML_ASSERT(!first.k_b); // no bias for k
    }

    // use only the first n_tokens rows of the position embedding table
    ggml_tensor * pos_slice = ggml_view_2d(
        ctx0, model.position_embeddings,
        model.position_embeddings->ne[0], n_tokens,
        model.position_embeddings->nb[1], 0);

    ggml_tensor * out = build_vit(
        feat, n_tokens,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        pos_slice,
        nullptr);
    cb(out, "after_transformer", -1);

    if (model.audio_has_stack_frames()) {
        // StackAudioFrames
        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
        out = build_stack(out, hparams.proj_stack_factor, n_embd);
        cb(out, "after_stacked", -1);
    }

    // RMS norm (eps 1e-6) followed by an element-wise scale
    auto rms_scaled = [&](ggml_tensor * x, ggml_tensor * w) {
        x = ggml_rms_norm(ctx0, x, 1e-6);
        return ggml_mul(ctx0, x, w);
    };

    // layer norm with the pre-projector weight and bias
    auto pre_layer_norm = [&](ggml_tensor * x) {
        x = ggml_norm(ctx0, x, hparams.eps);
        x = ggml_mul(ctx0, x, model.mm_norm_pre_w);
        return ggml_add(ctx0, x, model.mm_norm_pre_b);
    };

    // plain matmul + bias
    auto linear = [&](ggml_tensor * w, ggml_tensor * b, ggml_tensor * x) {
        return ggml_add(ctx0, ggml_mul_mat(ctx0, w, x), b);
    };

    switch (proj_type) {
        case PROJECTOR_TYPE_ULTRAVOX:
            {
                // UltravoxProjector: pre-norm -> ffn in -> swiglu -> mid-norm -> ffn out
                out = rms_scaled(out, model.mm_norm_pre_w);
                out = build_mm(model.mm_1_w, out);

                // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
                out = ggml_swiglu_swapped(ctx0, out);

                out = rms_scaled(out, model.mm_norm_mid_w);
                out = build_mm(model.mm_2_w, out);
            } break;

        case PROJECTOR_TYPE_QWEN2A:
            {
                // projector: single linear layer with bias
                out = build_mm(model.mm_fc_w, out);
                out = ggml_add(ctx0, out, model.mm_fc_b);
            } break;

        case PROJECTOR_TYPE_VOXTRAL:
        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
            {
                // projector: both types use the same 2-layer erf-GELU FFN
                out = build_ffn(out,
                    model.mm_1_w, model.mm_1_b,
                    nullptr, nullptr,
                    model.mm_2_w, model.mm_2_b,
                    FFN_GELU_ERF,
                    -1);
            } break;

        case PROJECTOR_TYPE_MERALION:
            {
                // stack (above) -> ln -> linear0+silu -> GLU -> out
                out = pre_layer_norm(out);
                out = ggml_silu(ctx0, linear(model.mm_0_w, model.mm_0_b, out));

                ggml_tensor * gate = ggml_silu(ctx0, linear(model.mm_1_w, model.mm_1_b, out));
                ggml_tensor * pool = linear(model.mm_2_w, model.mm_2_b, out);

                out = ggml_mul(ctx0, gate, pool);
                out = linear(model.mm_3_w, model.mm_3_b, out);
            } break;

        case PROJECTOR_TYPE_GLMA:
            {
                // ln -> stack -> ffn, then wrap the sequence with the boi / eoi tensors
                out = pre_layer_norm(out);
                out = build_stack(out, hparams.proj_stack_factor, n_embd);
                out = build_ffn(out, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
                out = ggml_concat(ctx0, model.mm_boi, out, 1);
                out = ggml_concat(ctx0, out, model.mm_eoi, 1);
            } break;

        default:
            GGML_ABORT("%s: unknown projector type", __func__);
    }

    cb(out, "projected", -1);

    ggml_build_forward_expand(gf, out);

    return gf;
}
|
||||
191
tools/mtmd/models/yasa2.cpp
Normal file
191
tools/mtmd/models/yasa2.cpp
Normal file
@@ -0,0 +1,191 @@
|
||||
// ABOUTME: Yasa2 vision encoder graph builder for ConvNeXt-based architecture.
|
||||
// ABOUTME: Implements patch embedding, ConvNeXt stages with GRN, and adaptive pooling.
|
||||
|
||||
#include "models.h"
|
||||
|
||||
// Adds a per-channel bias to a [W,H,C,B] tensor.
// `b_c` is viewed as [1,1,C,1] so the add broadcasts over W, H and B.
// A null `b_c` returns `x_whcb` unchanged.
static ggml_tensor * add_channel_bias(
        ggml_context * ctx0,
        ggml_tensor  * x_whcb,
        ggml_tensor  * b_c) {
    if (b_c == nullptr) {
        return x_whcb;
    }
    return ggml_add(ctx0, x_whcb, ggml_reshape_4d(ctx0, b_c, 1, 1, b_c->ne[0], 1));
}
|
||||
|
||||
// Scales a [W,H,C,B] tensor by a per-channel weight.
// `w_c` is viewed as [1,1,C,1] so the multiply broadcasts over W, H and B.
// A null `w_c` returns `x_whcb` unchanged.
static ggml_tensor * mul_channel_weight(
        ggml_context * ctx0,
        ggml_tensor  * x_whcb,
        ggml_tensor  * w_c) {
    if (w_c == nullptr) {
        return x_whcb;
    }
    return ggml_mul(ctx0, x_whcb, ggml_reshape_4d(ctx0, w_c, 1, 1, w_c->ne[0], 1));
}
|
||||
|
||||
// Layer norm over the channel axis of a [W,H,C,B] tensor, modelled on
// HF ConvNextLayerNorm(channels_first):
//   u = mean_c(x), s = mean_c((x-u)^2), x = (x-u)/sqrt(s)
// Unlike the reference formula, eps is not added to the variance; the variance is
// clamped to [eps, 1e30] instead. The optional affine `w` / `b` is applied per
// channel after normalisation.
ggml_tensor * clip_graph_yasa2::layer_norm_channels(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b, float eps) {
    // bring channels to the innermost axis so ggml_mean reduces over them
    ggml_tensor * x_chw = ggml_cont(ctx0, ggml_permute(ctx0, inp, 2, 1, 0, 3)); // [W,H,C,B] -> [C,H,W,B]

    ggml_tensor * mean     = ggml_mean(ctx0, x_chw);      // [1,H,W,B]
    ggml_tensor * centered = ggml_sub(ctx0, x_chw, mean); // [C,H,W,B]

    ggml_tensor * var = ggml_mean(ctx0, ggml_mul(ctx0, centered, centered)); // [1,H,W,B]
    var = ggml_clamp(ctx0, var, eps, 1e30f); // avoid div-by-zero in no-alloc warmup
    ggml_tensor * stddev = ggml_sqrt(ctx0, var); // [1,H,W,B]

    ggml_tensor * normed = ggml_div(ctx0, centered, stddev);                // [C,H,W,B]
    normed = ggml_cont(ctx0, ggml_permute(ctx0, normed, 2, 1, 0, 3));        // [W,H,C,B]

    // per-channel affine; either helper is a no-op when its tensor is null
    normed = mul_channel_weight(ctx0, normed, w);
    return add_channel_bias(ctx0, normed, b);
}
|
||||
|
||||
// ConvNeXtV2-style global response normalisation on a [W,H,C,B] tensor:
//   Gx = ||x||_2 over spatial dims (W,H), Nx = Gx / mean_c(Gx)
//   y  = w * (x * Nx) + b + x
// The channel mean of Gx is clamped to [1e-6, 1e30] as a stand-in for "+ eps".
// NOTE(review): the original comment asks for fp32 math, but no cast is done here;
// presumably `inp` is already F32 - confirm.
ggml_tensor * clip_graph_yasa2::convnext_grn(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b) {
    const int64_t n_w = inp->ne[0];
    const int64_t n_h = inp->ne[1];
    const int64_t n_c = inp->ne[2];
    const int64_t n_b = inp->ne[3];

    // per-channel L2 norm over all spatial positions
    ggml_tensor * squared = ggml_mul(ctx0, inp, inp);
    ggml_tensor * flat    = ggml_reshape_4d(ctx0, squared, n_w * n_h, n_c, 1, n_b); // [WH,C,1,B]
    ggml_tensor * norm    = ggml_sqrt(ctx0, ggml_sum_rows(ctx0, flat));             // [1,C,1,B]

    // mean of the norms across channels (channels moved to the innermost axis first)
    ggml_tensor * norm_c_first = ggml_cont(ctx0, ggml_permute(ctx0, norm, 1, 0, 2, 3)); // [C,1,1,B]
    ggml_tensor * norm_mean    = ggml_mean(ctx0, norm_c_first);                         // [1,1,1,B]
    norm_mean = ggml_clamp(ctx0, norm_mean, 1e-6f, 1e30f); // approx +eps, warmup-safe

    // normalised response, reshaped so it broadcasts against [W,H,C,B]
    ggml_tensor * response = ggml_div(ctx0, norm, norm_mean);                // [1,C,1,B]
    response = ggml_cont(ctx0, ggml_permute(ctx0, response, 0, 2, 1, 3));    // [1,1,C,B]

    ggml_tensor * scaled = ggml_mul(ctx0, inp, response);
    scaled = mul_channel_weight(ctx0, scaled, w);
    scaled = add_channel_bias(ctx0, scaled, b);

    // residual connection
    return ggml_add(ctx0, inp, scaled);
}
|
||||
|
||||
// Builds the compute graph for clip_graph_yasa2:
//   patch conv + channel layer norm -> ConvNeXt stages (optional downsample, then
//   blocks of depthwise conv / norm / pointwise MLP with GRN / residual)
//   -> optional position embedding -> average pooling -> 2-layer FFN projector.
// Returns the graph `gf`.
ggml_cgraph * clip_graph_yasa2::build() {
    // [W,H,C,B] -> [C,T,B] with T = W*H
    auto to_tokens = [&](ggml_tensor * x) {
        ggml_tensor * t = ggml_reshape_3d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]); // [T,C,B]
        t = ggml_permute(ctx0, t, 1, 0, 2, 3);                                               // [C,T,B]
        return ggml_cont(ctx0, t);
    };

    // [C,T,B] -> [W,H,C,B] for the given spatial extents
    auto to_spatial = [&](ggml_tensor * t, int64_t w, int64_t h) {
        ggml_tensor * x = ggml_permute(ctx0, t, 1, 0, 2, 3); // [T,C,B]
        x = ggml_cont(ctx0, x);
        return ggml_reshape_4d(ctx0, x, w, h, t->ne[0], t->ne[2]);
    };

    // matmul over the channel axis of a [C,T,B] tensor, plus an optional bias viewed as [C_out,1,1]
    auto pointwise = [&](ggml_tensor * weight, ggml_tensor * bias, ggml_tensor * t) {
        t = ggml_mul_mat(ctx0, weight, t);
        if (bias) {
            t = ggml_add(ctx0, t, ggml_reshape_3d(ctx0, bias, bias->ne[0], 1, 1));
        }
        return t;
    };

    ggml_tensor * feat = build_inp_raw();

    // Patch embedding Conv2d(kernel=4, stride=4)
    feat = ggml_conv_2d(ctx0, model.yasa_patch_w, feat, patch_size, patch_size, 0, 0, 1, 1);
    feat = add_channel_bias(ctx0, feat, model.yasa_patch_b);
    ggml_set_name(feat, "yasa2_patch_conv_out");
    cb(feat, "yasa2_patch_conv_out", -1);

    feat = layer_norm_channels(feat, model.yasa_patch_ln_w, model.yasa_patch_ln_b, eps);
    ggml_set_name(feat, "yasa2_patch_ln_out");
    cb(feat, "yasa2_patch_ln_out", -1);

    // ConvNeXt stages
    for (size_t si = 0; si < model.yasa_stages.size(); ++si) {
        const auto & stage = model.yasa_stages[si];

        // optional downsampling: channel layer norm followed by a stride-2 conv
        if (stage.down_conv_w) {
            feat = layer_norm_channels(feat, stage.down_ln_w, stage.down_ln_b, eps);
            feat = ggml_conv_2d(ctx0, stage.down_conv_w, feat, 2, 2, 0, 0, 1, 1);
            feat = add_channel_bias(ctx0, feat, stage.down_conv_b);
            ggml_format_name(feat, "yasa2_stage%zu_down_out", si);
        }

        for (size_t bi = 0; bi < stage.blocks.size(); ++bi) {
            const auto & blk = stage.blocks[bi];
            ggml_tensor * skip = feat;

            // depthwise conv (stride 1, padding 3) + bias + channel layer norm
            ggml_tensor * x = ggml_conv_2d_dw(ctx0, blk.dw_w, feat, 1, 1, 3, 3, 1, 1);
            x = add_channel_bias(ctx0, x, blk.dw_b);
            x = layer_norm_channels(x, blk.ln_w, blk.ln_b, eps);

            // pwconv1/pwconv2 are HF Linear layers over channels; implement via matmul on tokens.
            const int64_t grid_w = x->ne[0];
            const int64_t grid_h = x->ne[1];

            ggml_tensor * tok = pointwise(blk.pw1_w, blk.pw1_b, to_tokens(x)); // [4C,T,B]
            x = to_spatial(tok, grid_w, grid_h);                               // [W,H,4C,B]
            x = ggml_gelu_erf(ctx0, x);
            x = convnext_grn(x, blk.grn_w, blk.grn_b);

            tok = pointwise(blk.pw2_w, blk.pw2_b, to_tokens(x));               // [C,T,B]
            x = to_spatial(tok, grid_w, grid_h);                               // [W,H,C,B]

            feat = ggml_add(ctx0, skip, x);
            ggml_format_name(feat, "yasa2_stage%zu_blk%zu_out", si, bi);
        }
    }

    // HF path adds vision position embeddings BEFORE adaptive pooling.
    const int64_t grid_w = feat->ne[0];
    const int64_t grid_h = feat->ne[1];
    ggml_tensor * tokens_pre = to_tokens(feat); // [C,T,B]

    // the embedding is applied only when its token count matches; otherwise it is skipped
    if (model.yasa_vision_pos_embed && tokens_pre->ne[1] == model.yasa_vision_pos_embed->ne[1]) {
        const int64_t n_ch     = model.yasa_vision_pos_embed->ne[0];
        const int64_t n_tokens = model.yasa_vision_pos_embed->ne[1];
        ggml_tensor * pos = ggml_reshape_3d(ctx0, model.yasa_vision_pos_embed,
            static_cast<int>(n_ch), static_cast<int>(n_tokens), 1);
        tokens_pre = ggml_add(ctx0, tokens_pre, pos);
    }
    feat = to_spatial(tokens_pre, grid_w, grid_h); // [W,H,C,B]

    // AdaptiveAvgPool2d target is 8x8 for real inputs, but warmup can use tiny images.
    // The pool uses kernel == stride == floor(size / target), at least 1.
    const int cur_w    = static_cast<int>(feat->ne[0]);
    const int cur_h    = static_cast<int>(feat->ne[1]);
    const int pooled_w = std::min(8, cur_w);
    const int pooled_h = std::min(8, cur_h);
    const int kw       = std::max(1, cur_w / pooled_w);
    const int kh       = std::max(1, cur_h / pooled_h);
    feat = ggml_pool_2d(ctx0, feat, GGML_OP_POOL_AVG, kw, kh, kw, kh, 0, 0);

    // [W,H,C,B] -> [C,T,B]
    ggml_tensor * tokens = to_tokens(feat);
    cb(tokens, "yasa2_tokens", -1);

    // projector: up (mm_0) -> erf-GELU -> down (mm_2), no gate
    GGML_ASSERT(model.mm_0_w && model.mm_2_w);
    ggml_tensor * embeddings = build_ffn(
        tokens,
        model.mm_0_w, model.mm_0_b,
        nullptr, nullptr,
        model.mm_2_w, model.mm_2_b,
        FFN_GELU_ERF,
        -1);
    cb(embeddings, "yasa2_emb", -1);

    ggml_build_forward_expand(gf, embeddings);
    return gf;
}
|
||||
179
tools/mtmd/models/youtuvl.cpp
Normal file
179
tools/mtmd/models/youtuvl.cpp
Normal file
@@ -0,0 +1,179 @@
|
||||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_youtuvl::build() {
|
||||
GGML_ASSERT(model.class_embedding == nullptr);
|
||||
const int batch_size = 1;
|
||||
const bool use_window_attn = !hparams.wa_layer_indexes.empty();
|
||||
const int n_pos = n_patches;
|
||||
const int num_position_ids = n_pos * 4;
|
||||
const int m = 2;
|
||||
const int Wp = n_patches_x;
|
||||
const int Hp = n_patches_y;
|
||||
const int Hm = Hp / m;
|
||||
const int Wm = Wp / m;
|
||||
norm_type norm_t = NORM_TYPE_NORMAL;
|
||||
|
||||
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
|
||||
|
||||
ggml_tensor * inp = build_inp_raw();
|
||||
|
||||
// change conv3d to linear
|
||||
// reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
|
||||
{
|
||||
inp = ggml_reshape_4d(
|
||||
ctx0, inp,
|
||||
Wm * m * patch_size, m * patch_size, Hm, 3);
|
||||
inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
|
||||
inp = ggml_cont_4d(
|
||||
ctx0, inp,
|
||||
m * patch_size * 3, Wm, m * patch_size, Hm);
|
||||
|
||||
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
|
||||
inp = ggml_cont_4d(
|
||||
ctx0, inp,
|
||||
m * patch_size * 3, patch_size, m, Hm * Wm);
|
||||
|
||||
inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
|
||||
inp = ggml_cont_4d(
|
||||
ctx0, inp,
|
||||
patch_size, 3, patch_size, Hm * Wm * m * m);
|
||||
|
||||
inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
|
||||
inp = ggml_cont_3d(
|
||||
ctx0, inp,
|
||||
3*patch_size* patch_size, Hm * Wm * m * m, 1);
|
||||
}
|
||||
inp = build_mm(model.patch_embeddings_0, inp);
|
||||
|
||||
if (model.patch_bias) {
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
}
|
||||
|
||||
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
|
||||
|
||||
ggml_tensor * inpL = inp;
|
||||
ggml_tensor * window_mask = nullptr;
|
||||
ggml_tensor * window_idx = nullptr;
|
||||
ggml_tensor * inv_window_idx = nullptr;
|
||||
|
||||
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
|
||||
ggml_set_name(positions, "positions");
|
||||
ggml_set_input(positions);
|
||||
|
||||
// pre-layernorm
|
||||
if (model.pre_ln_w) {
|
||||
inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
|
||||
}
|
||||
if (use_window_attn) {
|
||||
inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
|
||||
ggml_set_name(inv_window_idx, "inv_window_idx");
|
||||
ggml_set_input(inv_window_idx);
|
||||
// mask for window attention
|
||||
window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
|
||||
ggml_set_name(window_mask, "window_mask");
|
||||
ggml_set_input(window_mask);
|
||||
|
||||
// if flash attn is used, we need to pad the mask and cast to f16
|
||||
if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
|
||||
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
|
||||
}
|
||||
|
||||
// inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
|
||||
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
|
||||
}
|
||||
|
||||
// loop over layers
|
||||
for (int il = 0; il < n_layer; il++) {
|
||||
const auto & layer = model.layers[il];
|
||||
const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
|
||||
|
||||
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
|
||||
|
||||
// layernorm1
|
||||
cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_add(ctx0,
|
||||
build_mm(layer.q_w, cur), layer.q_b);
|
||||
ggml_tensor * Kcur = ggml_add(ctx0,
|
||||
build_mm(layer.k_w, cur), layer.k_b);
|
||||
ggml_tensor * Vcur = ggml_add(ctx0,
|
||||
build_mm(layer.v_w, cur), layer.v_b);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
|
||||
|
||||
Qcur = ggml_rope_multi(
|
||||
ctx0, Qcur, positions, nullptr,
|
||||
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
|
||||
Kcur = ggml_rope_multi(
|
||||
ctx0, Kcur, positions, nullptr,
|
||||
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
|
||||
|
||||
ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
|
||||
|
||||
cur = build_attn(layer.o_w, layer.o_b,
|
||||
Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
|
||||
}
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
inpL = cur; // inpL = residual, cur = hidden_states
|
||||
|
||||
// layernorm2
|
||||
cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
|
||||
|
||||
// ffn
|
||||
cur = build_ffn(cur,
|
||||
layer.ff_up_w, layer.ff_up_b,
|
||||
nullptr, nullptr,
|
||||
layer.ff_down_w, layer.ff_down_b,
|
||||
hparams.ffn_op, il);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, inpL, cur);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
ggml_tensor * embeddings = inpL;
|
||||
if (use_window_attn) {
|
||||
const int spatial_merge_unit = 4;
|
||||
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
|
||||
ggml_set_name(window_idx, "window_idx");
|
||||
ggml_set_input(window_idx);
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
|
||||
embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
|
||||
cb(embeddings, "window_order_restored", -1);
|
||||
}
|
||||
|
||||
// post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
|
||||
if (model.post_ln_w) {
|
||||
embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
|
||||
}
|
||||
|
||||
// Now apply merger (VLPatchMerger):
|
||||
// 1. Apply RMS norm (ln_q in VLPatchMerger)
|
||||
embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
|
||||
cb(embeddings, "merger_normed", -1);
|
||||
|
||||
// 2. First reshape for spatial merge (merge 2x2 patches)
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
|
||||
cb(embeddings, "merger_reshaped", -1);
|
||||
|
||||
embeddings = build_ffn(embeddings,
|
||||
model.mm_0_w, model.mm_0_b,
|
||||
nullptr, nullptr,
|
||||
model.mm_1_w, model.mm_1_b,
|
||||
FFN_GELU,
|
||||
-1);
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
|
||||
return gf;
|
||||
}
|
||||
Reference in New Issue
Block a user