llama.cpp verification source 2026-05-22

2026-05-22 16:44:08 +08:00
commit 8e5a449007
2740 changed files with 1155720 additions and 0 deletions
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -0,0 +1,7 @@
+*
+!*.*
+!snapshots/
+*.o
+ggml-common.h
+**/*.swp
+!peg-parser
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,302 @@
+llama_add_compile_flags()
+
+function(llama_build source)
+    set(TEST_SOURCES ${source} ${ARGN})
+
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_TARGET ${LLAMA_TEST_NAME})
+    else()
+        get_filename_component(TEST_TARGET ${source} NAME_WE)
+    endif()
+
+    add_executable(${TEST_TARGET} ${TEST_SOURCES})
+    target_link_libraries(${TEST_TARGET} PRIVATE llama llama-common)
+    if (LLAMA_TESTS_INSTALL)
+        install(TARGETS ${TEST_TARGET} RUNTIME)
+    endif()
+endfunction()
+
+function(llama_test target)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_NAME ${LLAMA_TEST_NAME})
+    else()
+        set(TEST_NAME ${target})
+    endif()
+
+    set(TEST_TARGET ${target})
+
+    add_test(
+        NAME ${TEST_NAME}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND $<TARGET_FILE:${TEST_TARGET}>
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
+function(llama_test_cmd target)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_NAME ${LLAMA_TEST_NAME})
+    else()
+        set(TEST_NAME ${target})
+    endif()
+
+    add_test(
+        NAME ${TEST_NAME}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND ${target}
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
+# Builds and runs a test source file.
+# Optional args:
+# - NAME: name of the executable & test target (defaults to the source file name without extension)
+# - LABEL: label for the test (defaults to main)
+# - ARGS: arguments to pass to the test executable
+# - WORKING_DIRECTORY
+function(llama_build_and_test source)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    set(TEST_SOURCES ${source} ${LLAMA_TEST_UNPARSED_ARGUMENTS} get-model.cpp)
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_TARGET ${LLAMA_TEST_NAME})
+    else()
+        get_filename_component(TEST_TARGET ${source} NAME_WE)
+    endif()
+
+    add_executable(${TEST_TARGET} ${TEST_SOURCES})
+    if (LLAMA_TESTS_INSTALL)
+        install(TARGETS ${TEST_TARGET} RUNTIME)
+    endif()
+    target_link_libraries(${TEST_TARGET} PRIVATE llama-common)
+
+    add_test(
+        NAME ${TEST_TARGET}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND $<TARGET_FILE:${TEST_TARGET}>
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
+# build test-tokenizer-0 target once and add many tests
+llama_build(test-tokenizer-0.cpp)
+
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge          ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-bert-bge.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r         ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder    ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm      ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon            ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gemma-4           ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gemma-4.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2             ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe         ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm         ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt               ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3             ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-phi-3.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2             ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-qwen2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact            ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder         ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
+
+if (NOT WIN32)
+    llama_test_cmd(
+        ${CMAKE_CURRENT_SOURCE_DIR}/test-tokenizers-repo.sh
+        NAME test-tokenizers-ggml-vocabs
+        WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+        ARGS https://huggingface.co/ggml-org/vocabs ${PROJECT_SOURCE_DIR}/models/ggml-vocabs
+    )
+endif()
+
+if (LLAMA_LLGUIDANCE)
+    llama_build_and_test(test-grammar-llguidance.cpp ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
+endif ()
+
+if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
+    # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API (when building with shared libraries)
+    llama_build_and_test(test-sampling.cpp)
+    llama_build_and_test(test-reasoning-budget.cpp)
+    llama_build_and_test(test-grammar-parser.cpp)
+    llama_build_and_test(test-grammar-integration.cpp)
+    llama_build_and_test(test-llama-grammar.cpp)
+    llama_build_and_test(test-chat.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
+    target_include_directories(test-chat PRIVATE ${PROJECT_SOURCE_DIR}/tools/server)
+    target_link_libraries(test-chat PRIVATE server-context)
+    # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
+    if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+        llama_build_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
+        target_include_directories(test-json-schema-to-grammar PRIVATE ${PROJECT_SOURCE_DIR}/tools/server)
+    endif()
+
+    if (NOT GGML_BACKEND_DL)
+        llama_build(test-quantize-stats.cpp)
+    endif()
+
+    llama_build(test-gbnf-validator.cpp)
+
+    # build test-tokenizer-1-bpe target once and add many tests
+    llama_build(test-tokenizer-1-bpe.cpp)
+
+    # TODO: disabled due to slowness
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-aquila.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2     ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox  ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-neox.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf --ignore-merges)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact    ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
+
+    # build test-tokenizer-1-spm target once and add many tests
+    llama_build(test-tokenizer-1-spm.cpp)
+
+    llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
+    #llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-baichuan  ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-baichuan.gguf)
+
+    # llama_build_and_test(test-double-float.cpp) # SLOW
+
+    llama_build_and_test(test-llama-archs.cpp)
+endif()
+
+llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp)
+llama_build_and_test(test-jinja.cpp)
+llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python)
+llama_build_and_test(test-chat-auto-parser.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
+llama_build_and_test(test-chat-template.cpp)
+llama_build_and_test(test-json-partial.cpp)
+llama_build_and_test(test-log.cpp)
+llama_build_and_test(
+    test-peg-parser.cpp
+    peg-parser/simple-tokenize.cpp
+    peg-parser/test-basic.cpp
+    peg-parser/test-gbnf-generation.cpp
+    peg-parser/test-json-parser.cpp
+    peg-parser/test-json-serialization.cpp
+    peg-parser/test-python-dict-parser.cpp
+    peg-parser/test-unicode.cpp
+    peg-parser/tests.h
+)
+llama_build_and_test(test-regex-partial.cpp)
+
+if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+    set(MODEL_NAME "tinyllamas/stories15M-q4_0.gguf")
+    set(MODEL_HASH "SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739")
+else()
+    set(MODEL_NAME "tinyllamas/stories15M-be.Q4_0.gguf")
+    set(MODEL_HASH "SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d")
+endif()
+set(MODEL_DEST "${CMAKE_BINARY_DIR}/${MODEL_NAME}")
+
+add_test(NAME test-download-model COMMAND ${CMAKE_COMMAND}
+    -DDEST=${MODEL_DEST}
+    -DNAME=${MODEL_NAME}
+    -DHASH=${MODEL_HASH}
+    -P ${CMAKE_SOURCE_DIR}/cmake/download-models.cmake
+)
+set_tests_properties(test-download-model PROPERTIES FIXTURES_SETUP test-download-model)
+
+llama_build_and_test(test-thread-safety.cpp ARGS -m "${MODEL_DEST}" -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)
+set_tests_properties(test-thread-safety PROPERTIES FIXTURES_REQUIRED test-download-model)
+
+llama_build_and_test(test-arg-parser.cpp)
+
+if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
+  # TODO: repair known memory leaks
+  llama_build_and_test(test-opt.cpp)
+endif()
+llama_build_and_test(test-gguf.cpp)
+llama_build_and_test(test-backend-ops.cpp)
+
+llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
+llama_build_and_test(test-autorelease.cpp       LABEL "model")
+llama_build_and_test(test-backend-sampler.cpp   LABEL "model")
+
+# Test for state restore with fragmented KV cache
+# Requires a model, uses same args pattern as test-thread-safety
+llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -m "${MODEL_DEST}")
+set_tests_properties(test-state-restore-fragmented PROPERTIES FIXTURES_REQUIRED test-download-model)
+
+if (NOT GGML_BACKEND_DL)
+    # these tests use the backends directly and cannot be built with dynamic loading
+    llama_build_and_test(test-barrier.cpp)
+    llama_build_and_test(test-quantize-fns.cpp)
+    llama_build_and_test(test-quantize-perf.cpp)
+    llama_build_and_test(test-rope.cpp)
+endif()
+
+# libmtmd
+set(LLAMA_TEST_NAME test-mtmd-c-api)
+llama_build_and_test(test-mtmd-c-api.c)
+target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
+unset(LLAMA_TEST_NAME)
+
+# GGUF model data fetcher library for tests that need real model metadata
+# Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT)
+if (TARGET cpp-httplib)
+    get_target_property(_cpp_httplib_defs cpp-httplib INTERFACE_COMPILE_DEFINITIONS)
+    if (_cpp_httplib_defs MATCHES "CPPHTTPLIB_OPENSSL_SUPPORT")
+        add_library(gguf-model-data STATIC gguf-model-data.cpp)
+        target_link_libraries(gguf-model-data PRIVATE llama-common cpp-httplib)
+        target_include_directories(gguf-model-data PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+        add_executable(test-gguf-model-data test-gguf-model-data.cpp)
+        target_link_libraries(test-gguf-model-data PRIVATE gguf-model-data llama-common)
+        llama_test(test-gguf-model-data LABEL "model")
+
+        # test-quant-type-selection requires gguf-model-data for remote model metadata
+        llama_build_and_test(test-quant-type-selection.cpp LABEL "model")
+        target_link_libraries(test-quant-type-selection PRIVATE gguf-model-data)
+        target_compile_definitions(test-quant-type-selection PRIVATE
+            SNAPSHOT_DIR="${CMAKE_CURRENT_SOURCE_DIR}/snapshots")
+    endif()
+endif()
+
+# dummy executable - not installed
+get_filename_component(TEST_TARGET test-c.c NAME_WE)
+add_executable(${TEST_TARGET} test-c.c)
+target_link_libraries(${TEST_TARGET} PRIVATE llama)
+
+llama_build_and_test(test-alloc.cpp)
+target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
+
+llama_build(export-graph-ops.cpp)
+target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
+if (TARGET gguf-model-data)
+    target_link_libraries(export-graph-ops PRIVATE gguf-model-data)
+    target_compile_definitions(export-graph-ops PRIVATE LLAMA_HF_FETCH)
+endif()
--- a/tests/export-graph-ops.cpp
+++ b/tests/export-graph-ops.cpp
@@ -0,0 +1,226 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama-cpp.h"
+#include "../src/llama-ext.h"
+#include "ggml.h"
+#include "gguf-model-data.h"
+#include "gguf.h"
+#include "ggml-backend.h"
+#include "download.h"
+
+#include <array>
+#include <vector>
+#include <set>
+#include <fstream>
+#include <iostream>
+#include <random>
+
+// Noop because weights are not needed
+static void set_tensor_data(struct ggml_tensor * tensor, void * userdata) {
+    GGML_UNUSED(tensor);
+    GGML_UNUSED(userdata);
+}
+
+struct input_tensor {
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+    std::array<size_t, 4> nb;
+
+    input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
+        memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
+        memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
+    }
+
+    bool operator<(const input_tensor &b) const {
+        return std::tie(type, ne, nb) <
+               std::tie(b.type, b.ne, b.nb);
+    }
+
+    void serialize(std::ostream& out) const {
+        out << type << ' ';
+        for (size_t i = 0; i < 4; i++) {
+            out << ne[i] << ' ';
+        }
+        for (size_t i = 0; i < 4; i++) {
+            out << nb[i] << ' ';
+        }
+    }
+};
+
+struct test_object {
+    ggml_op op;
+    ggml_type type;
+    std::array<int64_t, 4> ne;
+    std::vector<int32_t> op_params;
+    std::vector<input_tensor> sources;
+    std::string name;
+
+    void serialize(std::ostream& out) const {
+        out << op << ' ' << type << ' ';
+        for (size_t i = 0; i < 4; i++) {
+            out << ne[i] << ' ';
+        }
+
+        out << op_params.size() << ' ';
+        for (size_t i = 0; i < op_params.size(); i++) {
+            out << op_params[i] << ' ';
+        }
+
+        out << sources.size() << ' ';
+        for (size_t s = 0; s < sources.size(); s++) {
+            sources[s].serialize(out);
+        }
+
+        if (!name.empty()) {
+            out << name;
+        } else {
+            out << '-';
+        }
+
+        out << '\n';
+    }
+
+    bool operator<(const test_object &b) const {
+        return std::tie(op, type, ne, op_params, sources) <
+               std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
+    }
+};
+
+static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
+    int n_nodes = ggml_graph_n_nodes(cgraph);
+    int n_skipped = 0;
+    int n_before = (int) tests.size();
+    for (int i = 0; i < n_nodes; i++) {
+        ggml_tensor * node = ggml_graph_node(cgraph, i);
+
+        if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
+            n_skipped++;
+            continue;
+        }
+
+        test_object test;
+
+        test.op = node->op;
+        test.type = node->type;
+        memcpy(&test.ne, node->ne, 4 * sizeof(int64_t));
+
+        test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
+        memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);
+
+        for (size_t s = 0; s < GGML_MAX_SRC; s++) {
+            if (node->src[s] == nullptr) {
+                break;
+            }
+
+            test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
+        }
+
+        test.name = node->name;
+        tests.insert(test);
+    }
+
+    int n_new = (int) tests.size() - n_before;
+    LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
+            label, n_new, n_nodes, n_skipped);
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+    params.out_file = "tests.txt";
+
+    common_init();
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
+        return 1;
+    }
+
+    // Load CPU-only
+    ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    params.devices = { cpu_device, nullptr };
+    params.fit_params = false;
+    params.n_gpu_layers = 0;
+
+    params.warmup = false;
+
+    llama_context * ctx;
+    common_init_result_ptr init_result;
+    llama_context_ptr ctx2;
+    llama_model_ptr model;
+
+    if (params.model.hf_repo.empty()) {
+        init_result = common_init_from_params(params);
+
+        ctx = init_result->context();
+    } else {
+#ifdef LLAMA_HF_FETCH
+        auto [hf_repo, hf_quant] = common_download_split_repo_tag(params.model.hf_repo);
+        if (hf_quant.empty() || hf_quant == "latest") {
+            hf_quant = "Q4_K_M";
+        }
+
+        gguf_context_ptr gguf_ctx = gguf_fetch_gguf_ctx(hf_repo, hf_quant);
+        if (!gguf_ctx) {
+            LOG_ERR("failed to fetch GGUF metadata from %s\n", hf_repo.c_str());
+            return 1;
+        }
+
+        llama_model_params model_params = llama_model_default_params();
+        model_params.devices = params.devices.data();
+        model_params.no_alloc = true;
+
+        model.reset(llama_model_init_from_user(gguf_ctx.get(), set_tensor_data, nullptr, model_params));
+
+        if (!model) {
+            LOG_ERR("failed to create llama_model from %s\n", hf_repo.c_str());
+            return 1;
+        }
+
+        llama_context_params ctx_params = llama_context_default_params();
+        ctx2.reset(llama_init_from_model(model.get(), ctx_params));
+        ctx = ctx2.get();
+
+        if (!ctx) {
+            LOG_ERR("failed to create llama_context\n");
+            return 1;
+        }
+#else
+        LOG_ERR("export-graph-ops compiled without HF fetch support\n");
+        return 1;
+#endif
+    }
+
+    const uint32_t n_seqs  = llama_n_seq_max(ctx);
+    const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
+
+    std::set<test_object> tests;
+
+    auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
+    if (!gf_pp) {
+        LOG_ERR("failed to reserve prompt processing graph\n");
+        return 1;
+    }
+    extract_graph_ops(gf_pp, "pp", tests);
+
+    auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
+    if (!gf_tg) {
+        LOG_ERR("failed to reserve token generation graph\n");
+        return 1;
+    }
+    extract_graph_ops(gf_tg, "tg", tests);
+
+    LOG_INF("%d unique ops total\n", (int) tests.size());
+
+    std::ofstream f(params.out_file);
+
+    if (!f.is_open()) {
+        LOG_ERR("unable to open output file: %s\n", params.out_file.c_str());
+        return 1;
+    }
+
+    for (const auto& test : tests) {
+        test.serialize(f);
+    }
+
+    return 0;
+}
--- a/tests/get-model.cpp
+++ b/tests/get-model.cpp
@@ -0,0 +1,21 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "get-model.h"
+
+char * get_model_or_exit(int argc, char *argv[]) {
+    char * model_path;
+    if (argc > 1) {
+        model_path = argv[1];
+
+    } else {
+        model_path = getenv("LLAMACPP_TEST_MODELFILE");
+        if (!model_path || strlen(model_path) == 0) {
+            fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE=<gguf_model_path> to silence this warning and run this test.\n\033[0m");
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    return model_path;
+}
--- a/tests/get-model.h
+++ b/tests/get-model.h
@@ -0,0 +1,2 @@
+#pragma once
+char * get_model_or_exit(int, char*[]);
--- a/tests/gguf-model-data.cpp
+++ b/tests/gguf-model-data.cpp
@@ -0,0 +1,740 @@
+// GGUF binary parser adapted from the huggingface/gguf package.
+// Reference: https://github.com/huggingface/huggingface.js
+
+#include "gguf-model-data.h"
+
+#include "common.h"
+#include "ggml-cpp.h"
+#include "gguf.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+
+#include "http.h"
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
+// Equivalent of RangeView
+struct gguf_buf_reader {
+    const char * data;
+    size_t       size;
+    size_t       pos;
+
+    gguf_buf_reader(const std::vector<char> & buf) : data(buf.data()), size(buf.size()), pos(0) {}
+
+    bool has_n_bytes(size_t n) const {
+        return pos + n <= size;
+    }
+
+    template <typename T>
+    bool read_val(T & out) {
+        if (!has_n_bytes(sizeof(T))) {
+            return false;
+        }
+        memcpy(&out, data + pos, sizeof(T));
+        pos += sizeof(T);
+        return true;
+    }
+
+    bool read_str(std::string & out) {
+        uint64_t len;
+        if (!read_val(len)) {
+            return false;
+        }
+        if (!has_n_bytes((size_t)len)) {
+            return false;
+        }
+        out.assign(data + pos, (size_t)len);
+        pos += (size_t)len;
+        return true;
+    }
+
+    bool skip(size_t n) {
+        if (!has_n_bytes(n)) {
+            return false;
+        }
+        pos += n;
+        return true;
+    }
+};
+
+static size_t gguf_val_type_size(int32_t vtype) {
+    switch (vtype) {
+        case GGUF_TYPE_UINT8:   return 1;
+        case GGUF_TYPE_INT8:    return 1;
+        case GGUF_TYPE_UINT16:  return 2;
+        case GGUF_TYPE_INT16:   return 2;
+        case GGUF_TYPE_UINT32:  return 4;
+        case GGUF_TYPE_INT32:   return 4;
+        case GGUF_TYPE_FLOAT32: return 4;
+        case GGUF_TYPE_BOOL:    return 1;
+        case GGUF_TYPE_UINT64:  return 8;
+        case GGUF_TYPE_INT64:   return 8;
+        case GGUF_TYPE_FLOAT64: return 8;
+        default:                return 0; // string/array handled separately
+    }
+}
+
+// Equivalent of readMetadataValue(), skips unused values rather than storing
+static bool gguf_skip_value(gguf_buf_reader & r, int32_t vtype) {
+    if (vtype == GGUF_TYPE_STRING) {
+        std::string tmp;
+        return r.read_str(tmp);
+    }
+    if (vtype == GGUF_TYPE_ARRAY) {
+        int32_t elem_type;
+        uint64_t count;
+        if (!r.read_val(elem_type)) {
+            return false;
+        }
+        if (!r.read_val(count)) {
+            return false;
+        }
+        if (elem_type == GGUF_TYPE_STRING) {
+            for (uint64_t i = 0; i < count; i++) {
+                std::string tmp;
+                if (!r.read_str(tmp)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+        if (elem_type == GGUF_TYPE_ARRAY) {
+            // nested arrays - recurse
+            for (uint64_t i = 0; i < count; i++) {
+                if (!gguf_skip_value(r, GGUF_TYPE_ARRAY)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+        size_t elem_sz = gguf_val_type_size(elem_type);
+        if (elem_sz == 0) {
+            return false;
+        }
+        return r.skip((size_t)count * elem_sz);
+    }
+    size_t sz = gguf_val_type_size(vtype);
+    if (sz == 0) {
+        return false;
+    }
+    return r.skip(sz);
+}
+
+static bool gguf_read_uint32_val(gguf_buf_reader & r, int32_t vtype, uint32_t & out) {
+    // Handle array-valued fields (e.g. per-layer head counts in hybrid models)
+    // by reading the first element as a representative value.
+    if (vtype == GGUF_TYPE_ARRAY) {
+        int32_t elem_type;
+        uint64_t count;
+        if (!r.read_val(elem_type)) {
+            return false;
+        }
+        if (!r.read_val(count)) {
+            return false;
+        }
+        if (count == 0) {
+            return false;
+        }
+        // Read first element, skip the rest
+        if (!gguf_read_uint32_val(r, elem_type, out)) {
+            return false;
+        }
+        for (uint64_t i = 1; i < count; i++) {
+            size_t sz = gguf_val_type_size(elem_type);
+            if (sz == 0) {
+                return false;
+            }
+            if (!r.skip(sz)) {
+                return false;
+            }
+        }
+        return true;
+    }
+    if (vtype == GGUF_TYPE_UINT8) {
+        uint8_t v;
+        if (!r.read_val(v)) {
+            return false;
+        }
+        out = v;
+        return true;
+    }
+    if (vtype == GGUF_TYPE_INT8) {
+        int8_t v;
+        if (!r.read_val(v)) {
+            return false;
+        }
+        out = (uint32_t)v;
+        return true;
+    }
+    if (vtype == GGUF_TYPE_UINT16) {
+        uint16_t v;
+        if (!r.read_val(v)) {
+            return false;
+        }
+        out = v;
+        return true;
+    }
+    if (vtype == GGUF_TYPE_INT16) {
+        int16_t v;
+        if (!r.read_val(v)) {
+            return false;
+        }
+        out = (uint32_t)v;
+        return true;
+    }
+    if (vtype == GGUF_TYPE_UINT32) {
+        uint32_t v;
+        if (!r.read_val(v)) {
+            return false;
+        }
+        out = v;
+        return true;
+    }
+    if (vtype == GGUF_TYPE_INT32) {
+        int32_t v;
+        if (!r.read_val(v)) {
+            return false;
+        }
+        out = (uint32_t)v;
+        return true;
+    }
+    if (vtype == GGUF_TYPE_UINT64) {
+        uint64_t v;
+        if (!r.read_val(v)) {
+            return false;
+        }
+        out = (uint32_t)v;
+        return true;
+    }
+    if (vtype == GGUF_TYPE_INT64) {
+        int64_t v;
+        if (!r.read_val(v)) {
+            return false;
+        }
+        out = (uint32_t)v;
+        return true;
+    }
+    return false;
+}
+
+// Follows the same header -> KV -> tensor parsing sequence as gguf() huggingface/gguf
+static std::optional<gguf_remote_model> gguf_parse_meta(const std::vector<char> & buf) {
+    gguf_buf_reader r(buf);
+
+    // Header: magic(4) + version(4) + tensor_count(8) + kv_count(8) = 24 bytes minimum
+    uint32_t magic_raw;
+    if (!r.read_val(magic_raw)) {
+        return std::nullopt;
+    }
+    if (memcmp(&magic_raw, "GGUF", 4) != 0) {
+        fprintf(stderr, "gguf_parse_meta: invalid magic\n");
+        return std::nullopt;
+    }
+
+    uint32_t version;
+    if (!r.read_val(version)) {
+        return std::nullopt;
+    }
+    if (version < 2 || version > 3) {
+        fprintf(stderr, "gguf_parse_meta: unsupported version %u\n", version);
+        return std::nullopt;
+    }
+
+    int64_t tensor_count_raw;
+    int64_t kv_count_raw;
+    if (!r.read_val(tensor_count_raw)) {
+        return std::nullopt;
+    }
+    if (!r.read_val(kv_count_raw)) {
+        return std::nullopt;
+    }
+
+    uint64_t tensor_count = (uint64_t)tensor_count_raw;
+    uint64_t kv_count     = (uint64_t)kv_count_raw;
+
+    gguf_remote_model model;
+
+    std::string arch_prefix;
+
+    // Parse KV pairs
+    for (uint64_t i = 0; i < kv_count; i++) {
+        std::string key;
+        if (!r.read_str(key)) {
+            return std::nullopt;
+        }
+
+        int32_t vtype;
+        if (!r.read_val(vtype)) {
+            return std::nullopt;
+        }
+
+        if (key == "general.architecture" && vtype == GGUF_TYPE_STRING) {
+            if (!r.read_str(model.architecture)) {
+                return std::nullopt;
+            }
+            arch_prefix = model.architecture + ".";
+            continue;
+        }
+
+        // Extract split.count for proper handling of split files
+        if (key == "split.count") {
+            uint32_t v;
+            if (!gguf_read_uint32_val(r, vtype, v)) {
+                return std::nullopt;
+            }
+            model.n_split = (uint16_t)v;
+            continue;
+        }
+
+        // Extract split.tensors.count so we can verify we have all tensors
+        if (key == "split.tensors.count") {
+            uint32_t v;
+            if (!gguf_read_uint32_val(r, vtype, v)) {
+                return std::nullopt;
+            }
+            model.n_split_tensors = v;
+            continue;
+        }
+
+        if (!arch_prefix.empty()) {
+            uint32_t * target = nullptr;
+
+            if      (key == arch_prefix + "embedding_length")         { target = &model.n_embd; }
+            else if (key == arch_prefix + "feed_forward_length")      { target = &model.n_ff; }
+            else if (key == arch_prefix + "block_count")              { target = &model.n_layer; }
+            else if (key == arch_prefix + "attention.head_count")     { target = &model.n_head; }
+            else if (key == arch_prefix + "attention.head_count_kv")  { target = &model.n_head_kv; }
+            else if (key == arch_prefix + "expert_count")             { target = &model.n_expert; }
+            else if (key == arch_prefix + "attention.key_length")     { target = &model.n_embd_head_k; }
+            else if (key == arch_prefix + "attention.value_length")   { target = &model.n_embd_head_v; }
+
+            if (target) {
+                if (!gguf_read_uint32_val(r, vtype, *target)) {
+                    return std::nullopt;
+                }
+                continue;
+            }
+        }
+
+        if (!gguf_skip_value(r, vtype)) {
+            return std::nullopt;
+        }
+    }
+
+    // Parse tensor info entries
+    model.tensors.reserve((size_t)tensor_count);
+    for (uint64_t i = 0; i < tensor_count; i++) {
+        gguf_remote_tensor t;
+
+        if (!r.read_str(t.name)) {
+            return std::nullopt;
+        }
+        if (!r.read_val(t.n_dims)) {
+            return std::nullopt;
+        }
+
+        if (t.n_dims > 4) {
+            fprintf(stderr, "gguf_parse_meta: tensor '%s' has %u dims (max 4)\n", t.name.c_str(), t.n_dims);
+            return std::nullopt;
+        }
+
+        for (uint32_t d = 0; d < t.n_dims; d++) {
+            if (!r.read_val(t.ne[d])) {
+                return std::nullopt;
+            }
+        }
+
+        int32_t type_raw;
+        if (!r.read_val(type_raw)) {
+            return std::nullopt;
+        }
+        t.type = (ggml_type)type_raw;
+
+        uint64_t offset;
+        if (!r.read_val(offset)) {
+            return std::nullopt;
+        }
+
+        // Infer n_vocab from token_embd.weight
+        if (t.name == "token_embd.weight") {
+            model.n_vocab = (uint32_t)t.ne[1];
+        }
+
+        model.tensors.push_back(std::move(t));
+    }
+
+    return model;
+}
+
+// cache handling for local download
+static std::string get_default_cache_dir() {
+    return fs_get_cache_directory() + "gguf-headers/";
+}
+
+static std::string sanitize_for_path(const std::string & s) {
+    std::string out = s;
+    for (char & c : out) {
+        if (c == '/' || c == '\\' || c == ':') {
+            c = '_';
+        }
+    }
+    return out;
+}
+
+static bool read_file(const std::string & path, std::vector<char> & out) {
+    std::ifstream f(path, std::ios::binary | std::ios::ate);
+    if (!f.good()) {
+        return false;
+    }
+    auto sz = f.tellg();
+    if (sz <= 0) {
+        return false;
+    }
+    out.resize((size_t)sz);
+    f.seekg(0);
+    f.read(out.data(), sz);
+    return f.good();
+}
+
+static bool write_file(const std::string & path, const std::vector<char> & data) {
+    std::ofstream f(path, std::ios::binary | std::ios::trunc);
+    if (!f.good()) {
+        return false;
+    }
+    f.write(data.data(), (std::streamsize)data.size());
+    return f.good();
+}
+
+// HuggingFace file auto-detection and HTTP download
+static std::pair<long, std::vector<char>> gguf_http_get(
+        const std::string & url,
+        const httplib::Headers & headers = {},
+        int timeout_sec = 60) {
+    try {
+        auto [cli, parts] = common_http_client(url);
+
+        if (timeout_sec > 0) {
+            cli.set_read_timeout(timeout_sec, 0);
+            cli.set_write_timeout(timeout_sec, 0);
+        }
+        cli.set_connection_timeout(30, 0);
+
+        std::vector<char> body;
+        auto res = cli.Get(parts.path, headers,
+            [&](const char * data, size_t len) {
+                body.insert(body.end(), data, data + len);
+                return true;
+            }, nullptr);
+
+        if (!res) {
+            fprintf(stderr, "gguf_fetch: HTTP request failed for %s (error %d)\n",
+                    url.c_str(), (int)res.error());
+            return {-1, {}};
+        }
+        return {res->status, std::move(body)};
+    } catch (const std::exception & e) {
+        fprintf(stderr, "gguf_fetch: HTTP error: %s\n", e.what());
+        return {-1, {}};
+    }
+}
+
+// Find the filename for given repo/quant.
+// For split models, returns the first shard (the one containing "00001-of-")
+// split_prefix is set to the portion before "-00001-of-XXXXX.gguf" when a split file is found
+static std::string detect_gguf_filename(const std::string & repo, const std::string & quant,
+                                        std::string & split_prefix) {
+    split_prefix.clear();
+    std::string api_url = "https://huggingface.co/api/models/" + repo;
+
+    auto [code, body] = gguf_http_get(api_url, {}, 30);
+    if (code != 200 || body.empty()) {
+        fprintf(stderr, "gguf_fetch: failed to query HF API for %s (HTTP %ld)\n", repo.c_str(), code);
+        return "";
+    }
+
+    nlohmann::json j;
+    try {
+        j = nlohmann::json::parse(body.begin(), body.end());
+    } catch (...) {
+        fprintf(stderr, "gguf_fetch: failed to parse HF API response\n");
+        return "";
+    }
+
+    if (!j.contains("siblings") || !j["siblings"].is_array()) {
+        fprintf(stderr, "gguf_fetch: unexpected HF API response format\n");
+        return "";
+    }
+
+    std::vector<std::string> matches;
+    std::string quant_upper = quant;
+    for (char & c : quant_upper) { c = (char)toupper(c); }
+
+    for (const auto & sibling : j["siblings"]) {
+        if (!sibling.contains("rfilename")) { continue; }
+        std::string fname = sibling["rfilename"].get<std::string>();
+        if (fname.size() < 5 || fname.substr(fname.size() - 5) != ".gguf") {
+            continue;
+        }
+
+        std::string fname_upper = fname;
+        for (char & c : fname_upper) { c = (char)toupper(c); }
+        if (fname_upper.find(quant_upper) != std::string::npos) {
+            matches.push_back(fname);
+        }
+    }
+
+    if (matches.empty()) {
+        fprintf(stderr, "gguf_fetch: no .gguf files matching '%s' in %s\n", quant.c_str(), repo.c_str());
+        return "";
+    }
+
+    std::sort(matches.begin(), matches.end());
+
+    // Prefer non-split, non-supplementary file
+    for (const auto & m : matches) {
+        if (m.find("-of-") == std::string::npos && m.find("mmproj") == std::string::npos) {
+            return m;
+        }
+    }
+
+    // Return the first shard (00001-of-) and extract the prefix
+    for (const auto & m : matches) {
+        auto pos = m.find("-00001-of-");
+        if (pos != std::string::npos) {
+            split_prefix = m.substr(0, pos);
+            return m;
+        }
+    }
+
+    return matches[0];
+}
+
+static std::optional<gguf_remote_model> fetch_and_parse(
+        const std::string & repo,
+        const std::string & filename,
+        const std::string & cache_path,
+        bool verbose) {
+    std::string url = "https://huggingface.co/" + repo + "/resolve/main/" + filename;
+
+    // Progressive download inspired by RangeView.fetchChunk()
+    // Start at 2MB, double each time, cap at 64MB
+    size_t chunk_size = 2 * 1024 * 1024;
+    const size_t max_chunk = 64 * 1024 * 1024;
+
+    while (chunk_size <= max_chunk) {
+        if (verbose) {
+            fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
+        }
+
+        char range_buf[64];
+        snprintf(range_buf, sizeof(range_buf), "bytes=0-%zu", chunk_size - 1);
+        httplib::Headers headers = {{"Range", range_buf}};
+
+        auto [code, body] = gguf_http_get(url, headers, 120);
+        if (code != 200 && code != 206) {
+            fprintf(stderr, "gguf_fetch: HTTP %ld fetching %s\n", code, url.c_str());
+            return std::nullopt;
+        }
+
+        if (body.empty()) {
+            fprintf(stderr, "gguf_fetch: empty response\n");
+            return std::nullopt;
+        }
+
+        auto result = gguf_parse_meta(body);
+        if (result.has_value()) {
+            write_file(cache_path, body);
+            return result;
+        }
+
+        if (code == 200) {
+            fprintf(stderr, "gguf_fetch: server returned full response but metadata parse failed\n");
+            return std::nullopt;
+        }
+
+        // Parse failed, try larger chunk
+        chunk_size *= 2;
+    }
+
+    fprintf(stderr, "gguf_fetch: metadata exceeds 64MB, giving up\n");
+    return std::nullopt;
+}
+
+static std::string get_cache_file_path(const std::string& cdir, const std::string& repo_part, const std::string& filename) {
+    std::string fname_part = sanitize_for_path(filename);
+    return cdir + "/" + repo_part + "--" + fname_part + ".partial";
+}
+
+// Try cache first, then fetch and parse a single GGUF shard.
+static std::optional<gguf_remote_model> fetch_or_cached(
+        const std::string & repo,
+        const std::string & filename,
+        const std::string & cdir,
+        const std::string & repo_part,
+        bool verbose) {
+    std::string cache_path = get_cache_file_path(cdir, repo_part, filename);
+
+    {
+        std::vector<char> cached;
+        if (std::filesystem::exists(cache_path) && read_file(cache_path, cached)) {
+            auto result = gguf_parse_meta(cached);
+            if (result.has_value()) {
+                if (verbose) {
+                    fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
+                }
+                return result;
+            }
+        }
+    }
+
+    fs_create_directory_with_parents(cdir);
+    return fetch_and_parse(repo, filename, cache_path, verbose);
+}
+
+std::optional<gguf_remote_model> gguf_fetch_model_meta(
+        const std::string & repo,
+        const std::string & quant,
+        const std::string & cache_dir,
+        bool verbose) {
+    std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
+    std::string repo_part = sanitize_for_path(repo);
+
+    std::string split_prefix;
+    std::string filename = detect_gguf_filename(repo, quant, split_prefix);
+    if (filename.empty()) {
+        return std::nullopt;
+    }
+
+    auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part, verbose);
+    if (!model_opt.has_value()) {
+        fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
+        return std::nullopt;
+    }
+
+    auto & model = model_opt.value();
+
+    // If the model is split across multiple files we need to fetch the remaining shards metadata
+    if (model.n_split > 1) {
+        if (split_prefix.empty()) {
+            fprintf(stderr, "gguf_fetch: model reports %u splits but filename has no split pattern\n", model.n_split);
+            return std::nullopt;
+        }
+
+        if (verbose) {
+            fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
+                    model.n_split, model.n_split - 1);
+        }
+
+        for (int i = 2; i <= model.n_split; i++) {
+            char num_buf[6], total_buf[6];
+            snprintf(num_buf,   sizeof(num_buf),   "%05d", i);
+            snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
+            std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
+
+            auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
+            if (!shard.has_value()) {
+                fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
+                return std::nullopt;
+            }
+
+            model.tensors.insert(model.tensors.end(),
+                std::make_move_iterator(shard->tensors.begin()),
+                std::make_move_iterator(shard->tensors.end()));
+        }
+
+        if (model.n_split_tensors > 0 && model.tensors.size() != model.n_split_tensors) {
+            fprintf(stderr, "gguf_fetch: WARNING: expected %u tensors from split.tensors.count, got %zu\n",
+                    model.n_split_tensors, model.tensors.size());
+        }
+    }
+
+    return model_opt;
+}
+
+gguf_context_ptr gguf_fetch_gguf_ctx(
+        const std::string & repo,
+        const std::string & quant,
+        const std::string & cache_dir,
+        bool verbose) {
+    std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
+    std::string repo_part = sanitize_for_path(repo);
+
+    std::string split_prefix;
+    std::string filename = detect_gguf_filename(repo, quant, split_prefix);
+
+    if (filename.empty()) {
+        return nullptr;
+    }
+
+    auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part, verbose);
+    if (!model_opt.has_value()) {
+        fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
+        return nullptr;
+    }
+
+    auto & model = model_opt.value();
+
+    const std::string cache_path = get_cache_file_path(cdir, repo_part, filename);
+
+    ggml_context_ptr ggml_ctx_ptr;
+    ggml_context * ggml_ctx{};
+    gguf_init_params params{true, &ggml_ctx};
+    gguf_context_ptr ctx{gguf_init_from_file(cache_path.c_str(), params)};
+    ggml_ctx_ptr.reset(ggml_ctx);
+
+    if (ctx == nullptr) {
+        fprintf(stderr, "gguf_fetch: gguf_init_from_file failed\n");
+        return nullptr;
+    }
+
+    // If the model is split across multiple files we need to fetch the remaining shards metadata
+    if (model.n_split > 1) {
+        if (split_prefix.empty()) {
+            fprintf(stderr, "gguf_fetch: model reports %u splits but filename has no split pattern\n", model.n_split);
+            return nullptr;
+        }
+
+        if (verbose) {
+            fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
+                    model.n_split, model.n_split - 1);
+        }
+
+        for (int i = 2; i <= model.n_split; i++) {
+            char num_buf[6], total_buf[6];
+            snprintf(num_buf,   sizeof(num_buf),   "%05d", i);
+            snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
+            std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
+
+            auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
+            if (!shard.has_value()) {
+                fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
+                return nullptr;
+            }
+
+            // Load tensors from shard and add to main gguf_context
+            const std::string shard_path = get_cache_file_path(cdir, repo_part, shard_name);
+            ggml_context_ptr shard_ggml_ctx_ptr;
+            ggml_context * shard_ggml_ctx{};
+            gguf_init_params shard_params{true, &shard_ggml_ctx};
+            gguf_context_ptr shard_ctx{gguf_init_from_file(shard_path.c_str(), shard_params)};
+            shard_ggml_ctx_ptr.reset(shard_ggml_ctx);
+
+            if (shard_ctx == nullptr) {
+                fprintf(stderr, "gguf_fetch: shard gguf_init_from_file failed\n");
+                return nullptr;
+            }
+
+            for (ggml_tensor * t = ggml_get_first_tensor(shard_ggml_ctx); t; t = ggml_get_next_tensor(shard_ggml_ctx, t)) {
+                gguf_add_tensor(ctx.get(), t);
+            }
+        }
+
+        gguf_set_val_u16(ctx.get(), "split.count", 1);
+    }
+
+    return ctx;
+}
--- a/tests/gguf-model-data.h
+++ b/tests/gguf-model-data.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "ggml-cpp.h"
+#include "gguf.h"
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <vector>
+
+struct gguf_remote_tensor {
+    std::string  name;
+    ggml_type    type    = GGML_TYPE_F32;
+    int64_t      ne[4]   = {1, 1, 1, 1};  // dimensions, unused dims = 1
+    uint32_t     n_dims  = 0;
+};
+
+struct gguf_remote_model {
+    // Selected KV metadata
+    std::string architecture;               // general.architecture
+    uint32_t    n_embd          = 0;        // <arch>.embedding_length
+    uint32_t    n_ff            = 0;        // <arch>.feed_forward_length
+    uint32_t    n_vocab         = 0;        // inferred from token_embd.weight ne[1]
+    uint32_t    n_layer         = 0;        // <arch>.block_count
+    uint32_t    n_head          = 0;        // <arch>.attention.head_count
+    uint32_t    n_head_kv       = 0;        // <arch>.attention.head_count_kv
+    uint32_t    n_expert        = 0;        // <arch>.expert_count (0 if absent)
+    uint32_t    n_embd_head_k   = 0;        // <arch>.attention.key_length
+    uint32_t    n_embd_head_v   = 0;        // <arch>.attention.value_length
+    uint16_t    n_split         = 0;        // split.count (0 = not split)
+    uint32_t    n_split_tensors = 0;        // split.tensors.count (0 if not split)
+
+    std::vector<gguf_remote_tensor> tensors;
+};
+
+// Fetch model metadata from HuggingFace with local caching.
+// repo: e.g., "ggml-org/Qwen3-32B-GGUF"
+// quant: e.g., "Q8_0" -- auto-detects filename (including first shard of split models)
+// Returns nullopt if download fails or network is unavailable.
+std::optional<gguf_remote_model> gguf_fetch_model_meta(
+    const std::string & repo,
+    const std::string & quant = "Q8_0",
+    const std::string & cache_dir = "",  // empty = default
+    bool verbose = true);
+
+gguf_context_ptr gguf_fetch_gguf_ctx(
+    const std::string & repo,
+    const std::string & quant = "Q8_0",
+    const std::string & cache_dir = "",
+    bool verbose = true);
--- a/tests/peg-parser/simple-tokenize.cpp
+++ b/tests/peg-parser/simple-tokenize.cpp
@@ -0,0 +1,37 @@
+#include "simple-tokenize.h"
+
+std::vector<std::string> simple_tokenize(const std::string & input) {
+    std::vector<std::string> result;
+    std::string              current;
+
+    for (size_t i = 0; i < input.size(); i++) {
+        switch (input[i]) {
+            case ' ':
+            case '\n':
+            case '\t':
+            case '{':
+            case '}':
+            case ',':
+            case '[':
+            case '"':
+            case ']':
+            case '.':
+            case '<':
+            case '>':
+            case '=':
+            case '/':
+                if (!current.empty()) {
+                    result.push_back(current);
+                    current.clear();
+                }
+            default:;
+        }
+        current += input[i];
+    }
+
+    if (!current.empty()) {
+        result.push_back(current);
+    }
+
+    return result;
+}
--- a/tests/peg-parser/simple-tokenize.h
+++ b/tests/peg-parser/simple-tokenize.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+std::vector<std::string> simple_tokenize(const std::string &);
--- a/tests/peg-parser/test-basic.cpp
+++ b/tests/peg-parser/test-basic.cpp
@@ -0,0 +1,471 @@
+#include "peg-parser.h"
+#include "tests.h"
+
+void test_basic(testing & t) {
+    t.test("chars", [](testing & t) {
+        // Test common escape sequences - newline
+        t.test("escape_sequence_newline", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("\n");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escape_sequence_newline", true, result.success());
+        });
+
+        // Test common escape sequences - tab
+        t.test("escape_sequence_tab", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("\t");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escape_sequence_tab", true, result.success());
+        });
+
+        // Test common escape sequences - backslash
+        t.test("escape_sequence_backslash", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("\\");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escape_sequence_backslash", true, result.success());
+        });
+
+        // Test common escape sequences - space (should ())
+        t.test("escape_sequence_space_fail", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context(" ");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escape_sequence_space_fail", true, result.fail());
+        });
+
+        // Test escaped dash - 'a' should succeed
+        t.test("escaped_dash_a", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("a");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escaped_dash_a", true, result.success());
+        });
+
+        // Test escaped dash - '-' should succeed (literal dash)
+        t.test("escaped_dash_literal", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("-");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escaped_dash_literal", true, result.success());
+        });
+
+        // Test escaped dash - 'z' should succeed
+        t.test("escaped_dash_z", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("z");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escaped_dash_z", true, result.success());
+        });
+
+        // Test escaped dash - 'b' should NOT match (since \- is literal dash, not range)
+        t.test("escaped_dash_b_fail", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("b");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escaped_dash_b_fail", true, result.fail());
+        });
+    });
+
+
+    t.test("optional", [](testing & t) {
+        // Full match with optional part present
+        t.test("optional_present", [](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.literal("hello") + p.optional(p.literal(" world"));
+            });
+
+            auto ctx    = common_peg_parse_context("hello world");
+            auto result = parser.parse(ctx);
+            t.assert_equal("optional_present", true, result.success());
+            t.assert_equal("optional_present_end", 11u, result.end);
+        });
+
+        // Full match with optional part absent
+        t.test("optional_absent", [](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.literal("hello") + p.optional(p.literal(" world"));
+            });
+
+            auto ctx    = common_peg_parse_context("hello");
+            auto result = parser.parse(ctx);
+            t.assert_equal("optional_absent", true, result.success());
+            t.assert_equal("optional_absent_end", 5u, result.end);
+        });
+
+        // Partial match - waiting for more input to determine if optional matches
+        t.test("partial_match_need_more", [](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.literal("hello") + p.optional(p.literal(" world"));
+            });
+
+            auto ctx    = common_peg_parse_context("hello ", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("partial_match_need_more", true, result.need_more_input());
+        });
+    });
+
+    t.test("partial parsing", [](testing & t) {
+        // Literals - Basic Success
+        t.test("literal_success", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("hello");
+            result = parser.parse(ctx);
+            t.assert_equal("literal_success", true, result.success());
+        });
+
+        // Char Classes - Basic Lowercase Success
+        t.test("char_class_lowercase_success", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("a");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_lowercase_success", true, result.success());
+        });
+
+        // Char Classes - Uppercase Fail
+        t.test("char_class_uppercase_fail", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("A");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_uppercase_fail", true, result.fail());
+        });
+
+        // Char Classes with Dash - Lowercase Success
+        t.test("char_class_with_dash_lowercase", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("f");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_with_dash_lowercase", true, result.success());
+        });
+
+        // Char Classes with Dash - Literal Dash Success
+        t.test("char_class_with_dash_literal_dash", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("-");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_with_dash_literal_dash", true, result.success());
+        });
+
+        // Char Classes with Dash - Uppercase Fail
+        t.test("char_class_with_dash_uppercase_fail", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("A");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_with_dash_uppercase_fail", true, result.fail());
+        });
+
+        // Sequences - Partial Match 1
+        t.test("sequence_partial_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+
+            auto ctx    = common_peg_parse_context("<thi", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("sequence_partial_match_1", true, result.need_more_input());
+        });
+
+        // Sequences - Partial Match 2
+        t.test("sequence_partial_match_2", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("begin") + p.literal("end"); });
+
+            auto ctx    = common_peg_parse_context("begin", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("sequence_partial_match_2", true, result.need_more_input());
+        });
+
+        // Sequences - Partial Match 3
+        t.test("sequence_partial_match_3", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+
+            auto ctx    = common_peg_parse_context("<think></", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("sequence_partial_match_3", true, result.need_more_input());
+        });
+
+        // Sequences - Full Match
+        t.test("sequence_full_match", [&](testing & t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello") + p.literal("world"); });
+
+            auto ctx    = common_peg_parse_context("helloworld");
+            auto result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("sequence_full_match", true, result.success());
+        });
+
+        // Sequences - No Match
+        t.test("sequence_no_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+
+            auto ctx    = common_peg_parse_context("<think>I am common_chat_combinator_parser", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("sequence_no_match", true, result.fail());
+        });
+
+        // Choices - Partial Match 1
+        t.test("choices_partial_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("option1") | p.literal("option2"); });
+
+            auto ctx    = common_peg_parse_context("opt", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_partial_match_1", true, result.need_more_input());
+        });
+
+        // Choices - Partial Match 2
+        t.test("choices_partial_match_2", [&](testing & t) {
+            auto parser =
+                build_peg_parser([](common_peg_parser_builder & p) { return p.literal("choice_a") | p.literal("choice_b"); });
+
+            auto ctx    = common_peg_parse_context("choice", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_partial_match_2", true, result.need_more_input());
+        });
+
+        // Choices - Full Match 1
+        t.test("choices_full_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("first") | p.literal("second"); });
+
+            auto ctx    = common_peg_parse_context("first");
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_full_match_1", true, result.success());
+        });
+
+        // Choices - Full Match 2
+        t.test("choices_full_match_2", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("alpha") | p.literal("beta"); });
+
+            auto ctx    = common_peg_parse_context("beta");
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_full_match_2", true, result.success());
+        });
+
+        // Choices - No Match
+        t.test("choices_no_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("good") | p.literal("better"); });
+
+            auto ctx    = common_peg_parse_context("best");
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_no_match", true, result.fail());
+        });
+
+        // Zero or More - Partial Match 1
+        t.test("zero_or_more_partial_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("ab")); });
+
+            auto ctx    = common_peg_parse_context("a", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("zero_or_more_partial_match_1", true, result.need_more_input());
+        });
+
+        // Zero or More - Partial Match 2
+        t.test("zero_or_more_partial_match_2", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("xy")); });
+
+            auto ctx    = common_peg_parse_context("xyx", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("zero_or_more_partial_match_2", true, result.need_more_input());
+        });
+
+        // Zero or More - Full Match
+        t.test("zero_or_more_full_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("test")); });
+
+            auto ctx    = common_peg_parse_context("test");
+            auto result = parser.parse(ctx);
+            t.assert_equal("zero_or_more_full_match", true, result.success());
+        });
+
+        // One or More - Partial Match 1
+        t.test("one_or_more_partial_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("repeat")); });
+
+            auto ctx    = common_peg_parse_context("rep", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("one_or_more_partial_match_1", true, result.need_more_input());
+        });
+
+        // One or More - Partial Match 2
+        t.test("one_or_more_partial_match_2", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("ab")); });
+
+            auto ctx    = common_peg_parse_context("aba", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto result = parser.parse(ctx);
+            t.assert_equal("one_or_more_partial_match_2", true, result.need_more_input());
+        });
+
+        // One or More - Full Match
+        t.test("one_or_more_full_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("single")); });
+
+            auto ctx    = common_peg_parse_context("single");
+            auto result = parser.parse(ctx);
+            t.assert_equal("one_or_more_full_match", true, result.success());
+        });
+
+        // One or More - No Match
+        t.test("one_or_more_no_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("()")); });
+
+            auto ctx    = common_peg_parse_context("success");
+            auto result = parser.parse(ctx);
+            t.assert_equal("one_or_more_no_match", true, result.fail());
+        });
+    });
+
+
+    t.test("recursive rules", [](testing &t) {
+        // Test simple number
+        t.test("simple_number", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("1");
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_success", true, result.success());
+        });
+
+        // Test simple list
+        t.test("simple_list", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[1]");
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_success", true, result.success());
+        });
+
+        // Test nested list
+        t.test("nested_list", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[[2]]");
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_success", true, result.success());
+        });
+
+        // Test deeply nested list
+        t.test("deeply_nested_list", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[[[3]]]");
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_success", true, result.success());
+        });
+
+        // Test need_more_input match
+        t.test("need_more_input_match", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[[", COMMON_PEG_PARSE_FLAG_LENIENT);
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+        });
+
+        // Test no match
+        t.test("no_match", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[a]");
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_fail", true, result.fail());
+        });
+
+        // Test markers
+        t.test("marker", [](testing &t) {
+            auto bracket_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.marker();
+            });
+
+            common_peg_parse_context ctx_square("[marker]");
+            common_peg_parse_context ctx_sharp("<marker>");
+
+            auto result_square = bracket_parser.parse(ctx_square);
+            auto result_sharp = bracket_parser.parse(ctx_sharp);
+
+            t.assert_true("result_square_is_success", result_square.success());
+            t.assert_true("result_sharp_is_success", result_sharp.success());
+        });
+    });
+}
--- a/tests/peg-parser/test-gbnf-generation.cpp
+++ b/tests/peg-parser/test-gbnf-generation.cpp
@@ -0,0 +1,370 @@
+#include "tests.h"
+
+#include "json-schema-to-grammar.h"
+
+#include <regex>
+
+static std::string trim_leading_space(const std::string & s) {
+    static const std::regex leading_ws_re = std::regex(R"((^|\n)\s+)");
+    return std::regex_replace(s, leading_ws_re, "$1");
+}
+
+static void assert_gbnf_equal(testing & t, const std::string & expected, const std::string & actual) {
+    t.assert_equal("gbnf are equal", trim_leading_space(expected), trim_leading_space(actual));
+}
+
+void test_gbnf_generation(testing &t) {
+    t.test("literal grammar generation", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("char class grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.chars("[a-z]", 1, 1);
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= [a-z]
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("sequence grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") + p.literal(" ") + p.literal("world");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello" " " "world"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("choice grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("cat") | p.literal("dog");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "cat" | "dog"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("one_or_more grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.one_or_more(p.literal("a"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a"+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("zero_or_more grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.zero_or_more(p.literal("a"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a"*
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("optional grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") + p.optional(p.literal(" world"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello" " world"?
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("until grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p)  {
+            return p.until("</tag>");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])*
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("complex expressions with parentheses", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.one_or_more(p.literal("a") | p.literal("b"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= ("a" | "b")+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("rule references", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            auto digit = p.rule("digit", p.chars("[0-9]", 1, 1));
+            return p.one_or_more(digit);
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            digit ::= [0-9]
+            root ::= digit+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("escaping in literals", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello\nworld\n!");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello\nworld\n!"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("operator<< (whitespace insertion)", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") << p.literal("world");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello" space "world"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("emit only reachable rules", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            p.rule("orphan", p.literal("orphan"));
+            return p.literal("hello") + p.rule("child", p.literal(" world"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            child ::= " world"
+            root ::= "hello" child
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("tagged choice inside sequence gets parenthesized", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("a") + p.tag("t", p.literal("b") | p.literal("c"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a" ("b" | "c")
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("tagged sequence inside choice gets parenthesized", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.tag("t", p.literal("a") + p.literal("b")) | p.literal("c");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a" "b" | "c"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("atomic choice inside repetition gets parenthesized", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.one_or_more(p.atomic(p.literal("a") | p.literal("b")));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= ("a" | "b")+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("silent parser emits nothing in gbnf", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") + p.gbnf(p.literal("world"), "");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("silent choice inside sequence emits nothing", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("a") + p.gbnf(p.literal("b") | p.literal("c"), "") + p.literal("d");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a" "d"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("silent wrapped in tag emits nothing", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("a") + p.tag("t", p.gbnf(p.literal("b"), ""));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("gbnf parser emits custom grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("a") + p.gbnf(p.literal("b"), "[a-z]+");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a" [a-z]+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("nested transparent wrappers get parenthesized", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("x") + p.tag("outer", p.atomic(p.literal("a") | p.literal("b")));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "x" ("a" | "b")
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("emit only trigger rules (and references)", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            auto rule1 = p.rule("rule-1", p.literal("a") + p.ref("rule-2"));
+            p.rule("rule-2", p.literal("b") + p.ref("rule-3"), true);
+            p.rule("rule-3", p.literal("c") + p.ref("rule-4"));
+            p.rule("rule-4", p.literal("d"), true);
+            return rule1;
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= rule-1
+            rule-1 ::= "a" rule-2
+            rule-2 ::= "b" rule-3
+            rule-3 ::= "c" rule-4
+            rule-4 ::= "d"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+
+        auto gbnf_lazy = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder, true);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= rule-2 | rule-4
+            rule-2 ::= "b" rule-3
+            rule-3 ::= "c" rule-4
+            rule-4 ::= "d"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf_lazy);
+    });
+}
--- a/tests/peg-parser/test-json-parser.cpp
+++ b/tests/peg-parser/test-json-parser.cpp
@@ -0,0 +1,109 @@
+#include "tests.h"
+
+void test_json_parser(testing &t) {
+    // Test parsing a simple JSON object
+    t.test("simple JSON object parsing", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"({"name": "test", "value": 42, "flag": true})";
+        common_peg_parse_context ctx(input);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test parsing a JSON array with mixed types
+    t.test("JSON array with mixed types", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"([1, "hello", true, null, 3.14])";
+        common_peg_parse_context ctx(input);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test parsing nested JSON with objects and arrays
+    t.test("nested JSON with objects and arrays", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string input =
+            R"({"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], "count": 2, "metadata": {"version": "1.0", "tags": ["admin", "user"]}})";
+        common_peg_parse_context ctx(input);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test need_more_input() parsing - incomplete object
+    t.test("need_more_input() parsing - incomplete object", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"({"name": "test", "value": )";
+        common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+    });
+
+    // Test need_more_input() parsing - incomplete array
+    t.test("need_more_input() parsing - incomplete array", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"([1, 2, 3, )";
+        common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+    });
+
+    // Test need_more_input() parsing - incomplete nested structure
+    t.test("need_more_input() parsing - incomplete nested structure", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"({"data": {"nested": )";
+        common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+    });
+
+    t.test("object member", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.json_member("name", "\"" + p.chars("[a-z]") + "\"");
+        });
+
+        t.test("success", [&](testing &t) {
+            std::string input = R"("name": "bob")";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+        });
+
+        t.test("partial", [&](testing &t) {
+            std::string input = R"("name": "bo)";
+            common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("need more input", result.need_more_input());
+        });
+
+        t.test("failed", [&](testing &t) {
+            std::string input = R"([])";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("fail", result.fail());
+        });
+    });
+}
--- a/tests/peg-parser/test-json-serialization.cpp
+++ b/tests/peg-parser/test-json-serialization.cpp
@@ -0,0 +1,28 @@
+#include "tests.h"
+
+void test_json_serialization(testing &t) {
+    auto original = build_peg_parser([](common_peg_parser_builder & p) {
+        return "<tool_call>" + p.json() + "</tool_call>";
+    });
+
+    auto json_serialized = original.to_json().dump();
+
+    t.test("compare before/after", [&](testing &t) {
+        auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized));
+
+        // Test complex JSON
+        std::string input = R"({"name": "test", "values": [1, 2, 3], "nested": {"a": true}})";
+        common_peg_parse_context ctx1(input);
+        common_peg_parse_context ctx2(input);
+
+        auto result1 = original.parse(ctx1);
+        auto result2 = deserialized.parse(ctx2);
+
+        t.assert_equal("both_succeed", result1.success(), result2.success());
+        t.assert_equal("same_end_pos", result1.end, result2.end);
+    });
+
+    t.bench("deserialize", [&]() {
+        auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized));
+    }, 100);
+}
--- a/tests/peg-parser/test-python-dict-parser.cpp
+++ b/tests/peg-parser/test-python-dict-parser.cpp
@@ -0,0 +1,318 @@
+#include "tests.h"
+
+void test_python_dict_parser(testing &t) {
+    // Test parsing a simple Python dict object with single quotes
+    t.test("simple Python dict object parsing", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{'name': 'test', 'value': 42, 'flag': True}";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test parsing a Python array with mixed types
+    t.test("Python array with mixed types", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "[1, 'hello', True, None, 3.14]";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test parsing nested Python dict with objects and arrays
+    t.test("nested Python dict with objects and arrays", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string input =
+            "{'users': [{'id': 1, 'name': 'Alice'}, {'id': 2, 'name': 'Bob'}], 'count': 2, 'metadata': {'version': '1.0', 'tags': ['admin', 'user']}}";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test parsing Python dict with escaped single quotes
+    t.test("Python dict with escaped single quotes", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{'message': 'It\\'s working!'}";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test parsing Python dict with double quotes inside single quotes
+    t.test("Python dict with double quotes inside single quotes", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{'quote': 'He said \"Hello\"'}";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test the example from the requirements
+    t.test("complex Python dict example from requirements", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{ 'obj' : { 'something': 1, 'other \"something\"' : 'foo\\'s bar' } }";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test need_more_input() parsing - incomplete object
+    t.test("need_more_input() parsing - incomplete object", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{'name': 'test', 'value': ";
+        common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+    });
+
+    // Test need_more_input() parsing - incomplete single-quoted string
+    t.test("need_more_input() parsing - incomplete single-quoted string", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{'name': 'test";
+        common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+    });
+
+    // Test unicode in Python dict strings
+    t.test("unicode in Python dict strings", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{'message': 'Hello, 世界!'}";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test Python dict with unicode escapes
+    t.test("Python dict with unicode escapes", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{'unicode': 'Hello\\u0041'}";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test that Python parser accepts double-quoted strings too
+    t.test("Python parser accepts double-quoted strings", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{\"name\": \"test\"}";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test Python parser with mixed quote styles
+    t.test("Python parser with mixed quote styles", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        std::string    input = "{\"name\": 'test', 'value': \"hello\"}";
+        common_peg_parse_context ctx(input);
+
+        auto result = parser.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test Python True/False/None
+    t.test("Python True/False/None", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });
+
+        t.test("True", [&](testing &t) {
+            std::string input = "True";
+            common_peg_parse_context ctx(input);
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+
+        t.test("False", [&](testing &t) {
+            std::string input = "False";
+            common_peg_parse_context ctx(input);
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+
+        t.test("None", [&](testing &t) {
+            std::string input = "None";
+            common_peg_parse_context ctx(input);
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+
+        t.test("rejects JSON-style true/false/null", [&](testing &t) {
+            for (const auto & kw : {"true", "false", "null"}) {
+                std::string input = kw;
+                common_peg_parse_context ctx(input);
+                auto result = parser.parse(ctx);
+                t.assert_true(std::string("rejects ") + kw, result.fail());
+            }
+        });
+    });
+
+    // Test single-quoted string content parser directly
+    t.test("single-quoted string content parser", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.sequence({ p.literal("'"), p.string_content('\''), p.literal("'"), p.space() });
+        });
+
+        t.test("simple string", [&](testing &t) {
+            std::string input = "'hello'";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+
+        t.test("string with escaped single quote", [&](testing &t) {
+            std::string input = "'it\\'s'";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+
+        t.test("string with double quotes", [&](testing &t) {
+            std::string input = "'say \"hello\"'";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+
+        t.test("incomplete string", [&](testing &t) {
+            std::string input = "'hello";
+            common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("need_more_input", result.need_more_input());
+        });
+    });
+
+    // Test json() with pre-registered flexible json-string rule (python dict support)
+    t.test("json() parser with flexible json-string rule", [](testing &t) {
+        t.test("json() rejects single quotes by default", [&](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.json();
+            });
+
+            std::string input = "{'name': 'test'}";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("fail", result.fail());
+        });
+
+        t.test("json() accepts single quotes with pre-registered flexible json-string rule", [&](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                // Pre-register json-string rule with both quote styles
+                p.rule("json-string", [&]() {
+                    return p.choice({ p.double_quoted_string(), p.single_quoted_string() });
+                });
+                return p.json();
+            });
+
+            std::string input = "{'name': 'test'}";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+
+        t.test("json() still accepts double quotes with flexible json-string rule", [&](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("json-string", [&]() {
+                    return p.choice({ p.double_quoted_string(), p.single_quoted_string() });
+                });
+                return p.json();
+            });
+
+            std::string input = "{\"name\": \"test\"}";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+
+        t.test("json() accepts mixed quote styles with flexible json-string rule", [&](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("json-string", [&]() {
+                    return p.choice({ p.double_quoted_string(), p.single_quoted_string() });
+                });
+                return p.json();
+            });
+
+            std::string input = "{\"name\": 'test', 'value': \"hello\"}";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+
+        t.test("complex nested structure with flexible json-string rule", [&](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("json-string", [&]() {
+                    return p.choice({ p.double_quoted_string(), p.single_quoted_string() });
+                });
+                return p.json();
+            });
+
+            std::string input = "{ 'obj' : { 'something': 1, 'other \"something\"' : 'foo\\'s bar' } }";
+            common_peg_parse_context ctx(input);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+            t.assert_equal("end", input.size(), result.end);
+        });
+    });
+}
--- a/tests/peg-parser/test-unicode.cpp
+++ b/tests/peg-parser/test-unicode.cpp
@@ -0,0 +1,446 @@
+#include "tests.h"
+
+#include "peg-parser.h"
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <cctype>
+
+static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
+    t.assert_equal(common_peg_parse_result_type_name(expected), common_peg_parse_result_type_name(actual));
+}
+
+static std::string hex_dump(const std::string& str) {
+    std::ostringstream oss;
+    for (unsigned char c : str) {
+        if (std::isprint(c)) {
+            oss << c;
+        } else {
+            oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(c);
+        }
+    }
+    return oss.str();
+}
+
+void test_unicode(testing &t) {
+    struct test_case {
+        std::string input;
+        std::string expected_text;
+        common_peg_parse_result_type expected_result;
+    };
+
+    t.test("any", [](testing &t) {
+        std::vector<test_case> test_cases {
+            // Valid UTF-8 sequences
+            {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+            // Incomplete UTF-8 sequences (partial bytes at end)
+            {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+            // Invalid/malformed UTF-8 sequences
+            {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
+            {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+        };
+
+        auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+            return p.sequence({p.one_or_more(p.any()), p.end()});
+        });
+
+        for (size_t i = 0; i < test_cases.size(); i++) {
+            const auto & tc = test_cases[i];
+            std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+            t.test(test_name, [&](testing &t) {
+                common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
+                auto result = parser.parse(ctx);
+
+                // Assert result type matches
+                assert_result_equal(t, tc.expected_result, result.type);
+
+                // Assert matched text if success or need_more_input
+                if (result.success() || result.need_more_input()) {
+                    std::string matched = tc.input.substr(result.start, result.end - result.start);
+                    t.assert_equal(tc.expected_text, matched);
+                }
+            });
+        }
+    });
+
+    t.test("char classes", [](testing &t) {
+        t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Within range - CJK Unified Ideographs
+                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
+                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
+                {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
+                {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
+
+                // Outside range - should fail
+                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},                                                     // ASCII
+                {std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL},                            // U+4DFF (before range)
+                {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},                            // U+A000 (after range)
+
+                // Incomplete sequences in range
+                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},                     // Incomplete U+4E00
+                {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},                     // Incomplete U+597D
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
+                    auto result = parser.parse(ctx);
+
+                    // Assert result type matches
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    // Assert matched text if success or need_more_input
+                    if (result.success() || result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Within range - Emoticons (all 4-byte UTF-8)
+                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
+                {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
+                {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
+
+                // Outside range
+                {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
+                {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
+                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
+
+                // Incomplete sequences
+                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
+                {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},     // Very incomplete
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
+                    auto result = parser.parse(ctx);
+
+                    // Assert result type matches
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    // Assert matched text if success or need_more_input
+                    if (result.success() || result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("mixed unicode ranges", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Match CJK
+                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
+                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
+
+                // Match emoticons
+                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
+
+                // Match ASCII digits
+                {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Don't match outside any range
+                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
+                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
+
+                // Incomplete
+                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
+                    auto result = parser.parse(ctx);
+
+                    // Assert result type matches
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    // Assert matched text if success or need_more_input
+                    if (result.success() || result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+    });
+
+    t.test("until parser", [](testing &t) {
+        t.test("ASCII delimiter with Unicode content", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // CJK characters before delimiter
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Emoji before delimiter
+                {std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mixed content
+                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.success()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("incomplete UTF-8 at end", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Incomplete emoji at end, no delimiter
+                {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete CJK at end, no delimiter
+                {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Complete content, no delimiter (should consume all valid UTF-8)
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.success() || result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("malformed UTF-8", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Invalid UTF-8 bytes
+                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Continuation byte without lead byte
+                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Invalid continuation byte
+                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+                });
+            }
+        });
+    });
+
+    t.test("json_string parser", [](testing &t) {
+        t.test("valid UTF-8 characters", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // ASCII only
+                {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 2-byte UTF-8 (accented characters)
+                {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 3-byte UTF-8 (CJK)
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 4-byte UTF-8 (emoji)
+                {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mixed content
+                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                        return p.sequence({p.string_content('"'), p.literal("\"")});
+                    });
+
+                    common_peg_parse_context ctx(tc.input);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.success()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start - 1);  // -1 to exclude closing quote
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("incomplete UTF-8", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Incomplete 2-byte sequence
+                {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete 3-byte sequence
+                {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete 4-byte sequence
+                {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete at very start
+                {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                        return p.string_content('"');
+                    });
+
+                    common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("malformed UTF-8", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Invalid UTF-8 bytes
+                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Continuation byte without lead byte
+                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Invalid continuation byte
+                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            };
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                        return p.string_content('"');
+                    });
+
+                    common_peg_parse_context ctx(tc.input);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+                });
+            }
+        });
+
+        t.test("escape sequences with UTF-8", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Unicode escape sequence
+                {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mix of UTF-8 and escape sequences
+                {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Escaped quote in UTF-8 string
+                {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                        return p.sequence({p.string_content('"'), p.literal("\"")});
+                    });
+
+                    common_peg_parse_context ctx(tc.input);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.success()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start - 1);  // -1 to exclude closing quote
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+    });
+}
--- a/tests/peg-parser/tests.h
+++ b/tests/peg-parser/tests.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// Common includes for all test files
+#include <nlohmann/json.hpp>
+#include <string>
+#include <vector>
+
+#include "../testing.h"
+#include "peg-parser.h"
+#include "chat-peg-parser.h"
+#include "simple-tokenize.h"
+
+struct bench_tool_call {
+    std::string            id;
+    std::string            name;
+    nlohmann::ordered_json args;
+};
+
+// Test function declarations
+void test_basic(testing &t);
+void test_json_parser(testing &t);
+void test_gbnf_generation(testing &t);
+void test_unicode(testing &t);
+void test_json_serialization(testing &t);
+void test_python_dict_parser(testing &t);
--- a/tests/snapshots/deepseek-v3.1.schema
+++ b/tests/snapshots/deepseek-v3.1.schema
--- a/tests/snapshots/gemma-3-4b-it.schema
+++ b/tests/snapshots/gemma-3-4b-it.schema
--- a/tests/snapshots/glm-4.6v.schema
+++ b/tests/snapshots/glm-4.6v.schema
--- a/tests/snapshots/gpt-oss-120b.schema
+++ b/tests/snapshots/gpt-oss-120b.schema
--- a/tests/snapshots/meta-llama-3.1-70b-instruct.schema
+++ b/tests/snapshots/meta-llama-3.1-70b-instruct.schema
--- a/tests/snapshots/nemotron-nano-3-30b-a3b.schema
+++ b/tests/snapshots/nemotron-nano-3-30b-a3b.schema
--- a/tests/snapshots/qwen3-0.6b.schema
+++ b/tests/snapshots/qwen3-0.6b.schema
--- a/tests/snapshots/qwen3-14b.schema
+++ b/tests/snapshots/qwen3-14b.schema
--- a/tests/snapshots/qwen3-coder-next.schema
+++ b/tests/snapshots/qwen3-coder-next.schema
--- a/tests/snapshots/qwen3.5-27b.schema
+++ b/tests/snapshots/qwen3.5-27b.schema
--- a/tests/snapshots/qwen3.5-397b-a17b.schema
+++ b/tests/snapshots/qwen3.5-397b-a17b.schema
--- a/tests/snapshots/step-3.5-flash.schema
+++ b/tests/snapshots/step-3.5-flash.schema
--- a/tests/test-alloc.cpp
+++ b/tests/test-alloc.cpp
@@ -0,0 +1,608 @@
+#include <ggml-alloc.h>
+#include <ggml-backend-impl.h>
+#include <ggml-cpp.h>
+#include <ggml-impl.h>
+#include <ggml.h>
+
+#include <algorithm>
+#include <exception>
+#include <memory>
+#include <vector>
+
+//
+// dummy backend with configurable max_buffer_size, tracks allocations
+
+uint8_t * const alloc_base = (uint8_t *) 16;
+
+struct dummy_backend_context {
+    size_t max_buffer_size = 64;
+    size_t alignment       = 8;
+
+    ggml_backend_buffer_i              buffer_interface;
+    std::vector<ggml_backend_buffer_t> buffers;
+
+    size_t allocated_total() const {
+        size_t n = 0;
+        for (ggml_backend_buffer_t buf : buffers) {
+            n += ggml_backend_buffer_get_size(buf);
+        }
+        return n;
+    }
+};
+
+// ggml_backend_buffer_type interface
+
+static const char * dummy_backend_buffer_type_get_name(ggml_backend_buffer_type_t) {
+    return "dummy_buffer_type";
+}
+
+static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    dummy_backend_context * ctx    = (dummy_backend_context *) buft->context;
+    ggml_backend_buffer_t & buffer = ctx->buffers.emplace_back();
+    buffer                         = ggml_backend_buffer_init(buft, ctx->buffer_interface, ctx, size);
+    return buffer;
+}
+
+static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
+    return ctx->alignment;
+}
+
+static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
+    return ctx->max_buffer_size;
+}
+
+static bool dummy_backend_buffer_type_is_host(ggml_backend_buffer_type_t) {
+    return true;
+}
+
+// ggml_backend_buffer interface
+
+static void dummy_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buffer->context;
+
+    auto i = std::find(ctx->buffers.begin(), ctx->buffers.end(), buffer);
+    GGML_ASSERT(i != ctx->buffers.end());
+    ctx->buffers.erase(i);
+}
+
+static void * dummy_backend_buffer_get_base(ggml_backend_buffer_t) {
+    return alloc_base;
+}
+
+static ggml_status dummy_backend_buffer_init_tensor(ggml_backend_buffer_t, ggml_tensor *) {
+    return GGML_STATUS_SUCCESS;
+}
+
+static void dummy_backend_buffer_memset_tensor(ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {}
+
+static void dummy_backend_buffer_set_tensor(ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) {}
+
+static void dummy_backend_buffer_get_tensor(ggml_backend_buffer_t, const ggml_tensor *, void *, size_t, size_t) {}
+
+static void dummy_backend_buffer_clear(ggml_backend_buffer_t, uint8_t) {}
+
+// dummy_backend (not really a full backend, just provides what gallocr needs)
+
+struct dummy_backend {
+    std::unique_ptr<dummy_backend_context> context;
+    ggml_backend_buffer_type               buffer_type;
+};
+
+static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = 8) {
+    dummy_backend b{};
+    b.context                  = std::make_unique<dummy_backend_context>();
+    b.context->alignment       = alignment;
+    b.context->max_buffer_size = max_buffer_size;
+
+    b.context->buffer_interface.free_buffer   = dummy_backend_buffer_free_buffer;
+    b.context->buffer_interface.get_base      = dummy_backend_buffer_get_base;
+    b.context->buffer_interface.init_tensor   = dummy_backend_buffer_init_tensor;
+    b.context->buffer_interface.memset_tensor = dummy_backend_buffer_memset_tensor;
+    b.context->buffer_interface.set_tensor    = dummy_backend_buffer_set_tensor;
+    b.context->buffer_interface.get_tensor    = dummy_backend_buffer_get_tensor;
+    b.context->buffer_interface.clear         = dummy_backend_buffer_clear;
+
+    b.buffer_type.context             = b.context.get();
+    b.buffer_type.iface.get_name      = dummy_backend_buffer_type_get_name;
+    b.buffer_type.iface.alloc_buffer  = dummy_backend_buffer_type_alloc_buffer;
+    b.buffer_type.iface.get_alignment = dummy_backend_buffer_type_get_alignment;
+    b.buffer_type.iface.get_max_size  = dummy_backend_buffer_type_get_max_size;
+    b.buffer_type.iface.is_host       = dummy_backend_buffer_type_is_host;
+    return b;
+}
+
+//
+// test utilities
+
+struct test_context_with_graph {
+    ggml_context *   ctx;
+    ggml_cgraph *    graph;
+    ggml_context_ptr ctx_ptr;
+};
+
+static test_context_with_graph make_context() {
+    ggml_init_params params{};
+    params.mem_size = 48 * ggml_tensor_overhead() + ggml_graph_overhead();
+    params.no_alloc = true;
+
+    ggml_context *   ctx     = ggml_init(params);
+    ggml_context_ptr ctx_ptr = ggml_context_ptr(ctx);
+    ggml_cgraph *    graph   = ggml_new_graph(ctx);
+    return { ctx, graph, std::move(ctx_ptr) };
+}
+
+static ggml_tensor * make_input_1d(ggml_context * ctx, int64_t n_elements) {
+    ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+    ggml_set_input(t);
+    return t;
+}
+
+static ggml_tensor * make_input_with_size(ggml_context * ctx, size_t size_bytes) {
+    GGML_ASSERT(size_bytes % 4 == 0);
+    return make_input_1d(ctx, size_bytes / 4);
+}
+
+static void assign_names(ggml_context * ctx, const char * prefix = "x") {
+    int i = 0;
+    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
+        ggml_format_name(t, "%s%d", prefix, i++);
+    }
+}
+
+static int get_leaf_id(ggml_cgraph * graph, const char * tensor_name) {
+    for (int i = 0; i < graph->n_leafs; ++i) {
+        if (strncmp(graph->leafs[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
+            return i;
+        }
+    }
+    fprintf(stderr, "leaf not found: %s\n", tensor_name);
+    return -1;
+}
+
+static int get_node_id(ggml_cgraph * graph, const char * tensor_name) {
+    for (int i = 0; i < graph->n_nodes; ++i) {
+        if (strncmp(graph->nodes[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
+            return i;
+        }
+    }
+    fprintf(stderr, "node not found: %s", tensor_name);
+    return -1;
+}
+
+static ggml_gallocr_ptr allocate_graph(ggml_cgraph * graph, ggml_tensor * out, ggml_backend_buffer_type_t buft) {
+    ggml_set_output(out);
+    ggml_build_forward_expand(graph, out);
+
+    ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new(buft));
+    bool             result = ggml_gallocr_alloc_graph(galloc.get(), graph);
+    GGML_ASSERT(result);
+    return galloc;
+}
+
+//
+// correctness checks for result allocations
+
+static void check_all_allocated(ggml_cgraph * graph) {
+    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
+        ggml_tensor * t = ggml_graph_node(graph, i);
+        GGML_ASSERT(t->buffer != nullptr);
+        GGML_ASSERT(t->data != nullptr);
+    }
+}
+
+static void check_max_size(ggml_context * ctx) {
+    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
+        auto   buft     = ggml_backend_buffer_get_type(t->buffer);
+        size_t max_size = ggml_backend_buft_get_max_size(buft);
+        size_t offset   = (char *) t->data - (char *) ggml_backend_buffer_get_base(t->buffer);
+        GGML_ASSERT(t->data >= ggml_backend_buffer_get_base(t->buffer));
+        GGML_ASSERT((size_t) offset + ggml_nbytes(t) <= max_size);
+    }
+}
+
+static bool can_reuse_memory(ggml_cgraph * graph, int current_i, ggml_tensor * current, ggml_tensor * other) {
+    if (other->flags & GGML_TENSOR_FLAG_OUTPUT) {
+        return false;
+    }
+    // Check if `other` is still "alive", ie. an input to any node after the `current` op
+    for (int i = current_i; i < ggml_graph_n_nodes(graph); ++i) {
+        ggml_tensor * t = ggml_graph_node(graph, i);
+        for (int s = 0; s < GGML_MAX_SRC; s++) {
+            if (t == current && ggml_op_can_inplace(t->op)) {
+                continue;
+            }
+            if (t->src[s] == other) {
+                return false;
+            }
+            if (t->src[s] && t->src[s]->view_src == other) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+static bool memory_overlap(ggml_tensor * a, ggml_tensor * b) {
+    if (a->buffer != b->buffer) {
+        return false;
+    }
+    int64_t a0 = (int64_t) a->data;
+    int64_t a1 = a0 + ggml_nbytes(a);
+    int64_t b0 = (int64_t) b->data;
+    int64_t b1 = b0 + ggml_nbytes(b);
+    return a1 > b0 && b1 > a0;
+}
+
+static ggml_tensor * get_view_source(ggml_tensor * t) {
+    while (t->view_src) {
+        t = t->view_src;
+    }
+    return t;
+}
+
+static void check_no_overlap(ggml_cgraph * graph) {
+    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
+        for (int j = 0; j < i; ++j) {
+            ggml_tensor * t = ggml_graph_node(graph, i);
+            ggml_tensor * o = ggml_graph_node(graph, j);
+            GGML_ASSERT(t != o);
+
+            if (get_view_source(t) == get_view_source(o)) {
+                continue;
+            }
+            if (memory_overlap(t, o)) {
+                GGML_ASSERT(can_reuse_memory(graph, i, t, o));
+            }
+        }
+    }
+}
+
+//
+// test cases
+
+// Scenario where the first backend buffer is completely exhausted and there are further
+// tensors which require a second buffer
+static void test_max_size_too_many_tensors() {
+    dummy_backend backend      = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[7];
+    x[0] = make_input_with_size(ctx, 8);
+    x[1] = make_input_with_size(ctx, 8);
+    x[2] = make_input_with_size(ctx, 8);
+    x[3] = ggml_mul(ctx, x[0], x[1]);
+    x[4] = ggml_add(ctx, x[1], x[2]);
+    x[5] = ggml_add(ctx, x[3], x[0]);
+    x[6] = ggml_add(ctx, x[4], x[5]);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[6], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);
+}
+
+// Scenario where there is some space left in the first buffer, but not enough to accommodate
+// a larger tensor, so a second buffer is required
+static void test_max_size_tensor_too_large() {
+    dummy_backend backend      = dummy_backend_init(32);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[3];
+    x[0] = make_input_with_size(ctx, 16);    // chunk 0, [0 , 16)
+    x[1] = make_input_with_size(ctx, 8);     // chunk 0, [16, 24)
+    x[2] = ggml_concat(ctx, x[0], x[1], 0);  // chunk 1, [0 , 24)
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 32 + 24);
+}
+
+// Scenario where a single tensor exceeds the max buffer size - in this case the allocator
+// should try to create a bigger buffer anyway, and wait for the backend to throw an error.
+// Backends may report an artificially lower max size in some cases for compatibility reasons.
+static void test_tensor_larger_than_max_size() {
+    dummy_backend backend      = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[2];
+    x[0] = make_input_with_size(ctx, 24);
+    x[1] = ggml_scale(ctx, x[0], 2.0f);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[1], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() == 24);
+}
+
+// This test assumes a max of 16 buffer chunks, and tries to allocate tensors that would
+// require more. Expectation is that the last buffer should grow to fit everything,
+// leaving it to the backend to error out if it can't allocate that much.
+static void test_not_enough_chunks() {
+    const int max_chunks = 16;
+    const int max_size   = 8;
+
+    dummy_backend backend      = dummy_backend_init(max_size);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[max_chunks + 1];
+    for (int i = 0; i < max_chunks + 1; ++i) {
+        x[i] = make_input_with_size(ctx, max_size);
+    }
+    ggml_tensor * acc = x[0];
+    for (int i = 0; i < max_chunks; ++i) {
+        acc = ggml_add(ctx, acc, x[i + 1]);
+    }
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, acc, &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() > max_chunks * max_size);
+}
+
+// Fill up leftover unallocated space of a chunk after allocating a large tensor that
+// requires a new chunk.
+static void test_fill_leftover_space() {
+    dummy_backend backend      = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[4];
+    x[0] = make_input_with_size(ctx, 8);
+    x[1] = ggml_pad(ctx, x[0], 2, 0, 0, 0);
+    x[3] = ggml_mean(ctx, x[1]);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[3], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 12 + 16);
+}
+
+// Check that views don't require any extra memory
+static void test_view_inplace() {
+    dummy_backend backend      = dummy_backend_init(32);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[6];
+    x[0] = make_input_1d(ctx, 4);                // chunk 0, [0, 16)
+    x[1] = ggml_reshape_2d(ctx, x[0], 2, 2);     // view of x0
+    x[2] = ggml_permute(ctx, x[1], 1, 0, 2, 3);  // view of x0
+    x[3] = ggml_view_1d(ctx, x[2], 2, 4);        // view of x0
+    x[4] = make_input_1d(ctx, 2);                // chunk 0, [16, 24)
+    x[5] = ggml_add(ctx, x[3], x[4]);            // reuse (inplace add)
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[5], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 24);
+}
+
+static void test_reuse_and_free() {
+    dummy_backend backend      = dummy_backend_init(40);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[9];
+    x[0] = make_input_with_size(ctx, 24);
+    x[1] = make_input_with_size(ctx, 8);
+    x[2] = make_input_with_size(ctx, 8);
+    x[3] = ggml_add(ctx, x[1], x[2]);        // reuse, free x2
+    x[4] = ggml_pad(ctx, x[0], 2, 0, 0, 0);  // alloc new buffer, free x0
+    x[5] = ggml_scale(ctx, x[4], 2.0f);      // alloc from free block
+    x[6] = ggml_add(ctx, x[4], x[5]);        // reuse, free x5
+    x[7] = ggml_view_1d(ctx, x[6], 2, 8);    // view
+    x[8] = ggml_add(ctx, x[3], x[7]);        // reuse
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 40 + 32 + 32);
+}
+
+static void test_merge_free_block(size_t max_buffer_size) {
+    dummy_backend backend      = dummy_backend_init(max_buffer_size);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[9];
+    x[0] = make_input_with_size(ctx, 16);
+    x[1] = make_input_with_size(ctx, 16);
+    x[2] = make_input_with_size(ctx, 16);
+    x[3] = ggml_mean(ctx, x[0]);
+    x[4] = ggml_mean(ctx, x[1]);
+    x[5] = ggml_pad(ctx, x[2], 2, 0, 0, 0);
+    x[6] = ggml_add(ctx, x[3], x[4]);
+    x[7] = ggml_pad(ctx, x[6], 5, 0, 0, 0);
+    x[8] = ggml_add(ctx, x[5], x[7]);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24);
+}
+
+// Check that previously allocated but freed memory is preferred over allocating
+// additional memory, even if the remaining space in a chunk would match tensor size better
+static void test_prefer_already_allocated_memory() {
+    dummy_backend backend      = dummy_backend_init(32, /*align*/ 4);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[3];
+    x[0] = make_input_with_size(ctx, 24);  // [24b][8b unused]
+    x[1] = ggml_mean(ctx, x[0]);           // [24b free][4b][4b unused]
+    x[2] = ggml_mean(ctx, x[1]);           // should be allocated in the 24b block
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() <= 28);
+}
+
+// test for allocating on multiple devices with some tensors in the graph
+// allocated externally (not by gallocr).
+static void test_multiple_buffer_types() {
+    dummy_backend backend_a = dummy_backend_init(32);
+    dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
+
+    auto [ctx_a, _a, ctx_a_ptr] = make_context();
+    auto [ctx_b, _b, ctx_b_ptr] = make_context();
+    auto [ctx, graph, ctx_ptr]  = make_context();
+
+    ggml_tensor * a[2];
+    a[0] = make_input_with_size(ctx_a, 16);
+    a[1] = make_input_with_size(ctx_a, 16);
+    assign_names(ctx_a, "a");
+
+    ggml_tensor * b[2];
+    b[0] = make_input_with_size(ctx_b, 24);
+    b[1] = make_input_with_size(ctx_b, 4);
+    assign_names(ctx_b, "b");
+
+    ggml_tensor * x[9];
+    x[0] = make_input_with_size(ctx, 16);
+    x[1] = ggml_mul(ctx, x[0], a[0]);
+    x[2] = ggml_pad(ctx, x[1], 2, 0, 0, 0);
+    x[3] = ggml_mul(ctx, x[2], b[0]);
+    x[4] = ggml_mean(ctx, x[3]);
+    x[5] = ggml_add(ctx, x[4], b[1]);
+    x[6] = ggml_pad(ctx, x[5], 3, 0, 0, 0);
+    x[7] = ggml_add(ctx, x[6], a[1]);
+    x[8] = ggml_scale(ctx, x[7], 2.0f);
+    assign_names(ctx, "x");
+
+    ggml_backend_buffer_ptr    buf_a(ggml_backend_alloc_ctx_tensors_from_buft(ctx_a, &backend_a.buffer_type));
+    ggml_backend_buffer_ptr    buf_b(ggml_backend_alloc_ctx_tensors_from_buft(ctx_b, &backend_b.buffer_type));
+    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
+
+    // assign buffer types manually to avoid extra complexity from backend scheduler
+    ggml_set_output(x[8]);
+    ggml_build_forward_expand(graph, x[8]);
+
+    GGML_ASSERT(graph->n_leafs == 5);
+    int leaf_buffer_ids[5];
+    leaf_buffer_ids[get_leaf_id(graph, "a0")] = 0;
+    leaf_buffer_ids[get_leaf_id(graph, "a1")] = 0;
+    leaf_buffer_ids[get_leaf_id(graph, "b0")] = 1;
+    leaf_buffer_ids[get_leaf_id(graph, "b1")] = 1;
+    leaf_buffer_ids[get_leaf_id(graph, "x0")] = 0;
+
+    GGML_ASSERT(graph->n_nodes == 8);
+    int node_buffer_ids[8];
+    node_buffer_ids[get_node_id(graph, "x1")] = 0;
+    node_buffer_ids[get_node_id(graph, "x2")] = 0;
+    node_buffer_ids[get_node_id(graph, "x3")] = 1;
+    node_buffer_ids[get_node_id(graph, "x4")] = 1;
+    node_buffer_ids[get_node_id(graph, "x5")] = 1;
+    node_buffer_ids[get_node_id(graph, "x6")] = 1;
+    node_buffer_ids[get_node_id(graph, "x7")] = 0;
+    node_buffer_ids[get_node_id(graph, "x8")] = 0;
+
+    ggml_gallocr_ptr galloc(ggml_gallocr_new_n(bufts, 2));
+    ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    ggml_gallocr_alloc_graph(galloc.get(), graph);
+
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend_a.context->allocated_total() <= 32 + 32 + 24);
+    GGML_ASSERT(backend_b.context->allocated_total() <= 32 + 24);
+}
+
+static void test_buffer_size_zero() {
+    dummy_backend backend_a    = dummy_backend_init(SIZE_MAX);
+    dummy_backend backend_b    = dummy_backend_init(SIZE_MAX);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[2];
+    x[0] = make_input_with_size(ctx, 16);
+    x[1] = ggml_scale(ctx, x[0], 2.0f);
+
+    ggml_set_output(x[1]);
+    ggml_build_forward_expand(graph, x[1]);
+
+    int leaf_buffer_ids[1] = { 0 };
+    int node_buffer_ids[1] = { 0 };
+
+    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
+    ggml_gallocr_ptr           galloc   = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
+    bool                       res1     = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    bool                       res2     = ggml_gallocr_alloc_graph(galloc.get(), graph);
+    GGML_ASSERT(res1 && res2);
+
+    check_all_allocated(graph);
+    GGML_ASSERT(backend_a.context->allocated_total() == 16);
+    GGML_ASSERT(backend_b.context->allocated_total() == 0);
+}
+
+// Test re-using gallocr for a different graph. The new graph has the same
+// total size, but one of the chunks is larger, so reallocation is required.
+static void test_reallocation() {
+    dummy_backend    backend = dummy_backend_init(32, /*align*/ 4);
+    ggml_gallocr_ptr galloc;
+    {
+        auto [ctx, graph, ctx_ptr] = make_context();
+        ggml_tensor * x[4];
+        x[0] = make_input_with_size(ctx, 24);
+        x[1] = make_input_with_size(ctx, 16);
+        x[2] = ggml_view_1d(ctx, x[0], 4, 0);
+        x[3] = ggml_add(ctx, x[2], x[1]);
+        assign_names(ctx);
+
+        galloc = allocate_graph(graph, x[3], &backend.buffer_type);
+        check_all_allocated(graph);
+        GGML_ASSERT(backend.context->allocated_total() == 40);
+    }
+    {
+        auto [ctx, graph, ctx_ptr] = make_context();
+        ggml_tensor * x[3];
+        x[0] = make_input_with_size(ctx, 20);
+        x[1] = make_input_with_size(ctx, 20);
+        x[2] = ggml_add(ctx, x[0], x[1]);
+        assign_names(ctx);
+        ggml_set_output(x[2]);
+        ggml_build_forward_expand(graph, x[2]);
+
+        bool result = ggml_gallocr_alloc_graph(galloc.get(), graph);
+        GGML_ASSERT(result);
+        check_all_allocated(graph);
+        GGML_ASSERT(backend.context->allocated_total() == 40);
+    }
+}
+
+static void run(const char * name, void (*f)()) {
+    printf("%s ", name);
+    fflush(stdout);
+    f();
+    printf("PASSED\n");
+}
+
+int main() {
+    run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
+    run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
+    run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
+    run("test_not_enough_chunks", test_not_enough_chunks);
+    run("test_fill_leftover_space", test_fill_leftover_space);
+    run("test_view_inplace", test_view_inplace);
+    run("test_reuse_and_free", test_reuse_and_free);
+    run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
+    run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); });
+    run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory);
+    run("test_multiple_buffer_types", test_multiple_buffer_types);
+    run("test_buffer_size_zero", test_buffer_size_zero);
+    run("test_reallocation", test_reallocation);
+    return 0;
+}
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -0,0 +1,212 @@
+#include "arg.h"
+#include "common.h"
+#include "download.h"
+
+#include <string>
+#include <vector>
+#include <sstream>
+#include <unordered_set>
+
+#undef NDEBUG
+#include <cassert>
+
+int main(void) {
+    common_params params;
+
+    printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
+    for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
+        try {
+            auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
+            common_params_add_preset_options(ctx_arg.options);
+            std::unordered_set<std::string> seen_args;
+            std::unordered_set<std::string> seen_env_vars;
+            for (const auto & opt : ctx_arg.options) {
+                // check for args duplications
+                for (const auto & arg : opt.get_args()) {
+                    if (seen_args.find(arg) == seen_args.end()) {
+                        seen_args.insert(arg);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str());
+                        exit(1);
+                    }
+                }
+                // check for env var duplications
+                for (const auto & env : opt.get_env()) {
+                    if (seen_env_vars.find(env) == seen_env_vars.end()) {
+                        seen_env_vars.insert(env);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str());
+                        exit(1);
+                    }
+                }
+
+                // exclude spec args from this check
+                // ref: https://github.com/ggml-org/llama.cpp/pull/22397
+                const bool skip = opt.is_spec;
+
+                // ensure shorter argument precedes longer argument
+                if (!skip && opt.args.size() > 1) {
+                    const std::string first(opt.args.front());
+                    const std::string last(opt.args.back());
+
+                    if (first.length() > last.length()) {
+                        fprintf(stderr, "test-arg-parser: shorter argument should come before longer one: %s, %s\n",
+                                first.c_str(), last.c_str());
+                        assert(false);
+                    }
+                }
+
+                // same check for negated arguments
+                if (opt.args_neg.size() > 1) {
+                    const std::string first(opt.args_neg.front());
+                    const std::string last(opt.args_neg.back());
+
+                    if (first.length() > last.length()) {
+                        fprintf(stderr, "test-arg-parser: shorter negated argument should come before longer one: %s, %s\n",
+                                first.c_str(), last.c_str());
+                        assert(false);
+                    }
+                }
+            }
+        } catch (std::exception & e) {
+            printf("%s\n", e.what());
+            assert(false);
+        }
+    }
+
+    auto list_str_to_char = [](std::vector<std::string> & argv) -> std::vector<char *> {
+        std::vector<char *> res;
+        for (auto & arg : argv) {
+            res.push_back(const_cast<char *>(arg.data()));
+        }
+        return res;
+    };
+
+    std::vector<std::string> argv;
+
+    printf("test-arg-parser: test invalid usage\n\n");
+
+    // missing value
+    argv = {"binary_name", "-m"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // wrong value (int)
+    argv = {"binary_name", "-ngl", "hello"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // wrong value (enum)
+    argv = {"binary_name", "-sm", "hello"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
+    argv = {"binary_name", "--draft", "123"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING));
+
+    // negated arg
+    argv = {"binary_name", "--no-mmap"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+
+    printf("test-arg-parser: test valid usage\n\n");
+
+    argv = {"binary_name", "-m", "model_file.gguf"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.model.path == "model_file.gguf");
+
+    argv = {"binary_name", "-t", "1234"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.cpuparams.n_threads == 1234);
+
+    argv = {"binary_name", "--verbose"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.verbosity > 1);
+
+    argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.model.path == "abc.gguf");
+    assert(params.n_predict == 6789);
+    assert(params.n_batch == 9090);
+
+    // --draft cannot be used outside llama-speculative
+    argv = {"binary_name", "--spec-draft-n-max", "123"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+    assert(params.speculative.draft.n_max == 123);
+
+    // multi-value args (CSV)
+    argv = {"binary_name", "--lora", "file1.gguf,\"file2,2.gguf\",\"file3\"\"3\"\".gguf\",file4\".gguf"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.lora_adapters.size() == 4);
+    assert(params.lora_adapters[0].path == "file1.gguf");
+    assert(params.lora_adapters[1].path == "file2,2.gguf");
+    assert(params.lora_adapters[2].path == "file3\"3\".gguf");
+    assert(params.lora_adapters[3].path == "file4\".gguf");
+
+// skip this part on windows, because setenv is not supported
+#ifdef _WIN32
+    printf("test-arg-parser: skip on windows build\n");
+#else
+    printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n");
+
+    setenv("LLAMA_ARG_THREADS", "blah", true);
+    argv = {"binary_name"};
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
+    setenv("LLAMA_ARG_THREADS", "1010", true);
+    argv = {"binary_name"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.model.path == "blah.gguf");
+    assert(params.cpuparams.n_threads == 1010);
+
+    printf("test-arg-parser: test negated environment variables\n\n");
+
+    setenv("LLAMA_ARG_MMAP", "0", true);
+    setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format
+    argv = {"binary_name"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.use_mmap == false);
+    assert(params.no_perf == true);
+
+    printf("test-arg-parser: test environment variables being overwritten\n\n");
+
+    setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
+    setenv("LLAMA_ARG_THREADS", "1010", true);
+    argv = {"binary_name", "-m", "overwritten.gguf"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.model.path == "overwritten.gguf");
+    assert(params.cpuparams.n_threads == 1010);
+#endif // _WIN32
+
+    printf("test-arg-parser: test download functions\n\n");
+    const char * GOOD_URL = "http://ggml.ai/";
+    const char * BAD_URL  = "http://ggml.ai/404";
+
+    {
+        printf("test-arg-parser: test good URL\n\n");
+        auto res = common_remote_get_content(GOOD_URL, {});
+        assert(res.first == 200);
+        assert(res.second.size() > 0);
+        std::string str(res.second.data(), res.second.size());
+        assert(str.find("llama.cpp") != std::string::npos);
+    }
+
+    {
+        printf("test-arg-parser: test bad URL\n\n");
+        auto res = common_remote_get_content(BAD_URL, {});
+        assert(res.first == 404);
+    }
+
+    {
+        printf("test-arg-parser: test max size error\n");
+        common_remote_params params;
+        params.max_size = 1;
+        try {
+            common_remote_get_content(GOOD_URL, params);
+            assert(false && "it should throw an error");
+        } catch (std::exception & e) {
+            printf("  expected error: %s\n\n", e.what());
+        }
+    }
+
+    printf("test-arg-parser: all tests OK\n\n");
+}
--- a/tests/test-autorelease.cpp
+++ b/tests/test-autorelease.cpp
@@ -0,0 +1,24 @@
+// ref: https://github.com/ggml-org/llama.cpp/issues/4952#issuecomment-1892864763
+
+#include <cstdio>
+#include <string>
+#include <thread>
+
+#include "llama.h"
+#include "get-model.h"
+
+// This creates a new context inside a pthread and then tries to exit cleanly.
+int main(int argc, char ** argv) {
+    auto * model_path = get_model_or_exit(argc, argv);
+
+    std::thread([&model_path]() {
+        llama_backend_init();
+        auto * model = llama_model_load_from_file(model_path, llama_model_default_params());
+        auto * ctx = llama_init_from_model(model, llama_context_default_params());
+        llama_free(ctx);
+        llama_model_free(model);
+        llama_backend_free();
+    }).join();
+
+    return 0;
+}
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
--- a/tests/test-backend-sampler.cpp
+++ b/tests/test-backend-sampler.cpp
--- a/tests/test-barrier.cpp
+++ b/tests/test-barrier.cpp
@@ -0,0 +1,236 @@
+#include "ggml.h"
+#include "ggml-cpu.h"
+
+#include <chrono>
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <vector>
+#include <thread>
+
+#define MAX_NARGS 2
+
+static void test_barrier(int n_threads, int n_rounds) {
+    struct ggml_init_params params = {
+        /* .mem_size   = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    // Create graph
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    // Lots of small, parallel ops where barriers in between will dominate
+    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
+    for (int i = 0; i < 1000; i++) {
+        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
+        out = ggml_mul_mat(ctx, a, out);
+
+        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
+        out = ggml_mul_mat(ctx, d, out);
+    }
+
+    ggml_build_forward_expand(gf, out);
+    int n_nodes = ggml_graph_n_nodes(gf);
+
+    // Create threadpool
+    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
+    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
+        exit(1);
+    }
+
+    // The test runs with constant number of threads
+    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
+
+    std::vector<uint8_t> work_data(cplan.work_size);
+    cplan.work_data = work_data.data();
+
+    std::cerr << "graph-compute with"
+              << "\n n_threads: " << n_threads
+              << "\n   n_nodes: " << n_nodes
+              << "\n  n_rounds: " << n_rounds
+              << "\n";
+    // ggml_graph_print(gf);
+
+    // Warmup
+    ggml_graph_compute(gf, &cplan);
+
+    auto t0 = std::chrono::high_resolution_clock::now();
+
+    for (int i=0; i < n_rounds; i++) {
+        ggml_graph_compute(gf, &cplan);
+    }
+
+    auto t1 = std::chrono::high_resolution_clock::now();
+
+    auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
+    auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
+    std::cerr << "graph-compute took " << usec << " usec "
+              << "\n " << (float) usec / n_rounds << " usec per-iter"
+              << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
+              << "\n";
+
+    ggml_threadpool_free(threadpool);
+    ggml_free(ctx);
+}
+
+static void test_active(int n_threads, int n_rounds) {
+    struct ggml_init_params params = {
+        /* .mem_size   = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    // Create graph
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    // Small graph with, parallel ops with barriers
+    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
+    for (int i = 0; i < 2; i++) {
+        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
+        out = ggml_mul_mat(ctx, a, out);
+
+        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
+        out = ggml_mul_mat(ctx, d, out);
+    }
+
+    ggml_build_forward_expand(gf, out);
+    int n_nodes = ggml_graph_n_nodes(gf);
+
+    // Create threadpool
+    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
+    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
+        exit(1);
+    }
+
+    std::cerr << "graph-compute with"
+              << "\n n_threads: " << n_threads
+              << "\n   n_nodes: " << n_nodes
+              << "\n  n_rounds: " << n_rounds
+              << "\n";
+    // ggml_graph_print(gf);
+
+    // In this test we keep changing the number of threads every 4th iteration
+    // to test for race conditions in that path
+
+    for (int i=0; i < n_rounds; i++) {
+        struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
+
+        std::vector<uint8_t> work_data(cplan.work_size);
+        cplan.work_data = work_data.data();
+
+        ggml_graph_compute(gf, &cplan);
+    }
+
+    ggml_threadpool_free(threadpool);
+    ggml_free(ctx);
+}
+
+static void test_multi_graph(int n_threads, int n_rounds) {
+    struct ggml_init_params params = {
+        /* .mem_size   = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    // Create graphs
+    struct ggml_cgraph * gf0 = ggml_new_graph(ctx);
+    {
+        // Small graph with parallel ops with barriers
+        struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
+        for (int i = 0; i < 2; i++) {
+            struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
+            out = ggml_mul_mat(ctx, a, out);
+
+            struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
+            out = ggml_mul_mat(ctx, d, out);
+        }
+
+        ggml_build_forward_expand(gf0, out);
+    }
+
+    struct ggml_cgraph * gf1 = ggml_new_graph(ctx);
+    {
+        // Small graph with parallel ops with barriers
+        // Use larger tensors to make sure work_data size is larger than gf0
+        struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  256);
+        for (int i = 0; i < 4; i++) {
+            struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 256, 128);
+            out = ggml_mul_mat(ctx, a, out);
+
+            struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 256);
+            out = ggml_mul_mat(ctx, d, out);
+        }
+
+        ggml_build_forward_expand(gf1, out);
+    }
+
+
+    // Create threadpool
+    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
+    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
+        exit(1);
+    }
+
+    std::cerr << "graph-compute with"
+              << "\n gf0 n_nodes: " << ggml_graph_n_nodes(gf0)
+              << "\n gf1 n_nodes: " << ggml_graph_n_nodes(gf1)
+              << "\n   n_threads: " << n_threads
+              << "\n    n_rounds: " << n_rounds
+              << "\n";
+
+    // In this test we keep changing the number of threads every 4th iteration
+    // and we compute two graphs back to back to test graph frequent graph switching
+
+    for (int i=0; i < n_rounds; i++) {
+        struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
+        std::vector<uint8_t> work_data0(cplan0.work_size);
+        cplan0.work_data = work_data0.data();
+
+        struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
+        std::vector<uint8_t> work_data1(cplan1.work_size);
+        cplan1.work_data = work_data1.data();
+
+        ggml_graph_compute(gf0, &cplan0);
+        ggml_graph_compute(gf1, &cplan1);
+    }
+
+    ggml_threadpool_free(threadpool);
+    ggml_free(ctx);
+}
+
+
+int main(int argc, char *argv[]) {
+
+    int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
+    int n_rounds  = 100;
+
+    if (argc > 1) {
+        n_threads = std::atoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        n_rounds  = std::atoi(argv[2]);
+    }
+
+    test_barrier(n_threads, n_rounds);
+
+    test_active(n_threads,  n_rounds * 100);
+
+    test_multi_graph(n_threads,  n_rounds * 10);
+
+    return 0;
+}
--- a/tests/test-c.c
+++ b/tests/test-c.c
@@ -0,0 +1,3 @@
+#include "llama.h"
+
+int main(void) {}
--- a/tests/test-chat-auto-parser.cpp
+++ b/tests/test-chat-auto-parser.cpp
--- a/tests/test-chat-peg-parser.cpp
+++ b/tests/test-chat-peg-parser.cpp
@@ -0,0 +1,983 @@
+#include "chat-peg-parser.h"
+#include "chat.h"
+#include "common.h"
+#include "json-schema-to-grammar.h"
+#include "peg-parser.h"
+#include "testing.h"
+#include "peg-parser/simple-tokenize.h"
+
+#include <iostream>
+#include <numeric>
+#include <string>
+
+#include "nlohmann/json.hpp"
+
+using json = nlohmann::ordered_json;
+
+static json create_tools();
+static void test_example_native(testing & t);
+static void test_example_qwen3_coder(testing & t);
+static void test_example_qwen3_non_coder(testing & t);
+static void test_command7_parser_compare(testing & t);
+static void test_prefix_tool_names(testing & t);
+static void test_tagged_peg_parser(testing & t);
+
+int main(int argc, char * argv[]) {
+    testing t(std::cout);
+    if (argc >= 2) {
+        t.set_filter(argv[1]);
+    }
+
+    const char * verbose = getenv("LLAMA_TEST_VERBOSE");
+    if (verbose) {
+        t.verbose = std::string(verbose) == "1";
+    }
+
+    t.test("native", test_example_native);
+    t.test("qwen3 coder", test_example_qwen3_coder);
+    t.test("qwen3 non-coder", test_example_qwen3_non_coder);
+    t.test("comparison", test_command7_parser_compare);
+    t.test("prefix tool names", test_prefix_tool_names);
+    t.test("tagged peg parser", test_tagged_peg_parser);
+
+    return t.summary();
+}
+
+static json create_tools() {
+    json tools = json::array();
+
+    json tool_weather = {
+        { "type",     "function" },
+        { "function",
+         {
+              { "name", "get_current_weather" },
+              { "description", "Get the current weather in a given location" },
+              { "parameters",
+                {
+                    { "type", "object" },
+                    { "properties",
+                      { { "location",
+                          { { "type", "string" }, { "description", "The city and state, e.g. San Francisco, CA" } } },
+                        { "unit",
+                          { { "type", "string" },
+                            { "enum", { "celsius", "fahrenheit" } },
+                            { "description",
+                              "The temperature unit to use. Infer this from the users location." } } } } },
+                    { "required", { "location", "unit" } },
+                } },
+          }                      }
+    };
+    tools.push_back(tool_weather);
+
+    json tool_forecast = {
+        { "type",     "function" },
+        { "function",
+         {
+              { "name", "get_forecast" },
+              { "description", "Get the weather forecast for a given location" },
+              { "parameters",
+                {
+                    { "type", "object" },
+                    { "properties",
+                      { { "location",
+                          { { "type", "string" }, { "description", "The city and state, e.g. San Francisco, CA" } } },
+                        { "unit",
+                          { { "type", "string" },
+                            { "enum", { "celsius", "fahrenheit" } },
+                            { "description", "The temperature unit to use. Infer this from the users location." } } },
+                        { "days",
+                          { { "type", "integer" },
+                            { "description", "Number of days to forecast (1-10)" },
+                            { "minimum", 1 },
+                            { "maximum", 10 } } } } },
+                    { "required", { "location", "unit" } },
+                } },
+          }                      }
+    };
+    tools.push_back(tool_forecast);
+
+    json tool_search = {
+        { "type",     "function" },
+        { "function",
+         { { "name", "search_knowledge_base" },
+            { "description", "Search the internal technical documentation knowledge base." },
+            { "parameters",
+              { { "type", "object" },
+                { "properties",
+                  { { "query", { { "type", "string" }, { "description", "The search query string." } } },
+                    { "max_results",
+                      { { "type", "integer" },
+                        { "description", "The maximum number of results to return." },
+                        { "default", 5 } } },
+                    { "category",
+                      { { "type", "string" },
+                        { "enum", { "api", "troubleshooting", "billing", "general" } },
+                        { "description", "Filter search by specific category." } } } } },
+                { "required", { "query", "category" } },
+                { "additionalProperties", false } } },
+            { "strict", true } } }
+    };
+    tools.push_back(tool_search);
+
+    return tools;
+}
+
+struct tool_argument {
+    std::string name;
+    std::string type;
+    bool        is_required;
+    json        schema;
+};
+
+struct tool_definition {
+    std::string                name;
+    std::vector<tool_argument> arguments;
+    json                       schema;
+};
+
+// Test fictitious model output that emits arguments as JSON.
+static void test_example_native(testing & t) {
+    struct test_case {
+        // Parameters
+        std::string             name;
+        json                    tools;
+        common_chat_tool_choice tool_choice;
+        common_reasoning_format reasoning_format;
+        json                    json_schema;
+        bool                    parallel_tool_calls;
+        std::string             generation_prompt;
+        std::string             input;
+
+        // Expect
+        std::string                        expect_reasoning;
+        std::string                        expect_content;
+        std::vector<common_chat_tool_call> expect_tool_calls;
+    };
+
+    auto build_parser = [](const test_case & tc) {
+        return build_chat_peg_parser([&](common_chat_peg_builder & p) {
+            auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
+            // Always use optional TAG_BASED pattern; generation_prompt is prepended to input
+            auto reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
+
+            // tool calling parser
+            if (tc.tools.is_array() && !tc.tools.empty()) {
+                auto tool_call =
+                    p.standard_json_tools("<tool_call>[", "]</tool_call>", tc.tools, tc.parallel_tool_calls,
+                                          tc.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED);
+
+                return p.sequence({ (reasoning_in_content ? p.eps() : reasoning), p.content(p.until("<tool_call>")),
+                                    p.optional(p.space() + tool_call), p.space(), p.end() });
+            }
+
+            // response_format parser
+            if (tc.json_schema.is_object() && !tc.json_schema.empty()) {
+                return p.sequence({ (reasoning_in_content ? p.eps() : reasoning),
+                                    p.content(p.schema(p.json(), "response-output", tc.json_schema)), p.space(),
+                                    p.end() });
+            }
+
+            // Content-only parser
+            return p.sequence({ (reasoning_in_content ? p.eps() : reasoning), p.content(p.rest()), p.end() });
+        });
+    };
+
+    std::vector<test_case> test_cases = std::vector<test_case>{
+        {
+         /* .name =                 */ "content with reasoning (no generation_prompt)",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .generation_prompt =    */ "",
+         /* .input =                */ ("<think>The user said hello, I must say hello back</think>\nHello"),
+         /* .expect_reasoning =     */ "The user said hello, I must say hello back",
+         /* .expect_content =       */ "Hello",
+         /* .expect_tool_calls =    */ {},
+         },
+        {
+         /* .name =                 */ "content without reasoning (no generation_prompt)",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .generation_prompt =    */ "",
+         /* .input =                */ ("Hello"),
+         /* .expect_reasoning =     */ "",
+         /* .expect_content =       */ "Hello",
+         /* .expect_tool_calls =    */ {},
+         },
+        {
+         /* .name =                 */ "content with reasoning_format = none (tags appear in content)",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .generation_prompt =    */ "",
+         /* .input =                */ ("<think>The user said hello, I must say hello back</think>\nHello"),
+         /* .expect_reasoning =     */ "",
+         /* .expect_content =       */ "<think>The user said hello, I must say hello back</think>\nHello",
+         /* .expect_tool_calls =    */ {},
+         },
+        {
+         /* .name =                 */ "content with reasoning generation_prompt",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .generation_prompt =    */ "<think>",
+         /* .input =                */ ("The user said hello, I must say hello back</think>\nHello"),
+         /* .expect_reasoning =     */ "The user said hello, I must say hello back",
+         /* .expect_content =       */ "Hello",
+         /* .expect_tool_calls =    */ {},
+         },
+        {
+         /* .name =                 */ "content with reasoning generation_prompt and reasoning_format = none",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .generation_prompt =    */ "",
+         /* .input =                */ ("The user said hello, I must say hello back</think>\nHello"),
+         /* .expect_reasoning =     */ "",
+         /* .expect_content =       */ "The user said hello, I must say hello back</think>\nHello",
+         /* .expect_tool_calls =    */ {},
+         },
+        {
+         /* .name =                 */ "content with closed reasoning generation_prompt (empty reasoning discarded)",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .generation_prompt =    */ "<think></think>",
+         /* .input =                */ ("Hello"),
+         /* .expect_reasoning =     */ "",
+         /* .expect_content =       */ "Hello",
+         /* .expect_tool_calls =    */ {},
+         },
+        {
+         /* .name =                 */ "tools with reasoning generation_prompt",
+         /* .tools =                */ create_tools(),
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .generation_prompt =    */ "<think>",
+         /* .input =                */
+            ("I must get the weather in New York</think>\n"
+             "<tool_call>["
+             R"({"name": "get_current_weather", "arguments": {"location": "New York City, NY", "unit": "fahrenheit"}})"
+             "]</tool_call>"),
+         /* .expect_reasoning =     */ "I must get the weather in New York",
+         /* .expect_content =       */ "",
+         /* .expect_tool_calls =    */
+            { {
+                /* .name =      */ "get_current_weather",
+                /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit"})",
+                /* .id =        */ "",
+            } },
+         },
+        {
+         /* .name =                 */ "parallel tools with reasoning generation_prompt",
+         /* .tools =                */ create_tools(),
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ true,
+         /* .generation_prompt =    */ "<think>",
+         /* .input =                */
+            ("I must get the weather in New York and San Francisco and a 3 day forecast of each.</think>\nLet me "
+             "search that for you."
+             "<tool_call>["
+             R"({"name": "get_current_weather", "arguments": {"location": "New York City, NY", "unit": "fahrenheit"}})"
+             ", "
+             R"({"name": "get_current_weather", "arguments": {"location": "San Francisco, CA", "unit": "fahrenheit"}})"
+             ", "
+             R"({"name": "get_forecast", "arguments": {"location": "New York City, NY", "unit": "fahrenheit", "days": 3}})"
+             ", "
+             R"({"name": "get_forecast", "arguments": {"location": "San Francisco, CA", "unit": "fahrenheit", "days": 3}})"
+             "]</tool_call>"),
+         /* .expect_reasoning =     */
+            "I must get the weather in New York and San Francisco and a 3 day forecast of each.",                                                                     /* .expect_content =       */ "Let me search that for you.",
+         /* .expect_tool_calls =    */
+            { {
+                  /* .name =      */ "get_current_weather",
+                  /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit"})",
+                  /* .id =        */ "",
+              },
+              {
+                  /* .name =      */ "get_current_weather",
+                  /* .arguments = */ R"({"location": "San Francisco, CA", "unit": "fahrenheit"})",
+                  /* .id =        */ "",
+              },
+              {
+                  /* .name =      */ "get_forecast",
+                  /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit", "days": 3})",
+                  /* .id =        */ "",
+              },
+              {
+                  /* .name =      */ "get_forecast",
+                  /* .arguments = */ R"({"location": "San Francisco, CA", "unit": "fahrenheit", "days": 3})",
+                  /* .id =        */ "",
+              } },
+         },
+        {
+         /* .name =                 */ "response_format with reasoning generation_prompt",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */
+            { { "type", "object" },
+              { "properties",
+                { { "invoice_number", { { "type", "string" } } },
+                  { "amount", { { "type", "number" } } },
+                  { "due_date", { { "type", "string" } } } } },
+              { "required", { "invoice_number", "amount", "due_date" } } },
+         /* .parallel_tool_calls =  */ false,
+         /* .generation_prompt =    */ "<think>",
+         /* .input =                */
+            ("I must produce the invoice in the requested format</think>\n"
+             R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"),
+         /* .expect_reasoning =     */ "I must produce the invoice in the requested format",
+         /* .expect_content =       */
+            R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})", /* .expect_tool_calls =    */ {},
+         },
+    };
+
+    for (const auto & tc : test_cases) {
+        t.test(tc.name, [&](testing & t) {
+            auto parser  = build_parser(tc);
+            auto lazy    = !tc.tools.empty() && tc.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+            auto grammar = build_grammar([&](const common_grammar_builder & builder) {
+                for (const auto & def : tc.tools) {
+                    auto function   = def.at("function");
+                    auto parameters = function.at("parameters");
+                    builder.resolve_refs(parameters);
+                };
+                parser.build_grammar(builder, lazy);
+            });
+
+            t.log("Grammar:");
+            for (const auto & line : string_split(grammar, "\n")) {
+                t.log(line);
+            }
+
+            std::string              effective_input = tc.generation_prompt + tc.input;
+            common_peg_parse_context ctx(effective_input);
+            auto                     result = parser.parse(ctx);
+
+            t.assert_true("success", result.success());
+
+            common_chat_msg msg;
+            auto            mapper = common_chat_peg_mapper(msg);
+            mapper.from_ast(ctx.ast, result);
+
+            t.assert_equal("content equal", tc.expect_content, msg.content);
+            t.assert_equal("reasoning equal", tc.expect_reasoning, msg.reasoning_content);
+            t.assert_equal("number of tool calls", tc.expect_tool_calls.size(), msg.tool_calls.size());
+            for (auto i = 0u; i < std::min(tc.expect_tool_calls.size(), msg.tool_calls.size()); i++) {
+                t.assert_equal("tool name", tc.expect_tool_calls[i].name, msg.tool_calls[i].name);
+                t.assert_equal("tool args", tc.expect_tool_calls[i].arguments, msg.tool_calls[i].arguments);
+            }
+        });
+    }
+}
+
+static void test_example_qwen3_coder(testing & t) {
+    auto tools  = create_tools();
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        auto content = p.rule("content", p.content(p.until("<tool_call>")));
+
+        std::vector<common_peg_parser> tool_parsers;
+        for (const auto & def : tools) {
+            auto        function   = def.at("function");
+            std::string name       = function.at("name");
+            auto        parameters = function.at("parameters");
+            auto        properties = parameters.at("properties");
+
+            std::set<std::string> required_properties;
+            if (function.contains("required")) {
+                function.at("required").get_to(required_properties);
+            }
+
+            std::vector<common_peg_parser> arg_parsers;
+            for (const auto & [param_name, param_schema] : properties.items()) {
+                bool is_required = required_properties.find(param_name) != required_properties.end();
+                auto type        = param_schema.value("type", "object");
+
+                auto arg = p.tool_arg(
+                    p.sequence({ p.tool_arg_open("<parameter=" + p.tool_arg_name(p.literal(param_name)) + ">"),
+                                 (type == "string" ?
+                                      p.tool_arg_string_value(p.schema(
+                                          p.until_one_of({ "</parameter>\n<parameter=", "</parameter>\n</function>" }),
+                                          "tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) :
+                                      p.tool_arg_json_value(p.schema(
+                                          p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema))),
+                                 p.tool_arg_close("</parameter>\n" +
+                                                  p.peek(p.literal("<parameter=") | p.literal("</function>"))) }));
+
+                arg_parsers.push_back(is_required ? p.rule("tool-" + name + "-arg-" + param_name, arg) :
+                                                    p.optional(p.rule("tool-" + name + "-arg-" + param_name, arg)));
+            }
+
+            tool_parsers.push_back(p.rule("tool-" + name, p.tool_open("<function=" + p.tool_name(p.literal(name)) + ">")
+                                                              << p.sequence(arg_parsers)
+                                                              << p.tool_close(p.literal("</function>"))));
+        };
+
+        auto tool_call = p.trigger_rule("tool-call", "<tool_call>" << p.choice(tool_parsers) << "</tool_call>");
+
+        return content + p.zero_or_more(p.space() + tool_call) + p.end();
+    });
+
+    auto grammar = build_grammar([&](const common_grammar_builder & builder) {
+        for (const auto & def : tools) {
+            auto function   = def.at("function");
+            auto parameters = function.at("parameters");
+            builder.resolve_refs(parameters);
+        };
+        parser.build_grammar(builder);
+    });
+
+    t.log("Grammar:");
+    for (const auto & line : string_split(grammar, "\n")) {
+        t.log(line);
+    }
+
+    t.test("incremental parsing", [&](testing & t) {
+        std::string input =
+            "Let me search the knowledge base for cat pictures."
+            "<tool_call>\n"
+            "<function=search_knowledge_base>\n"
+            "<parameter=query>cat pictures</parameter>\n"
+            "<parameter=category>general</parameter>\n"
+            "</function>\n"
+            "</tool_call>";
+
+        std::vector<std::string> tokens = simple_tokenize(input);
+
+        common_chat_msg prev;
+        for (auto it = tokens.begin(); it != tokens.end(); it++) {
+            std::string in = std::accumulate(tokens.begin(), it + 1, std::string());
+
+            common_peg_parse_context ctx(in, (it + 1 < tokens.end()) ? COMMON_PEG_PARSE_FLAG_LENIENT : COMMON_PEG_PARSE_FLAG_NONE);
+
+            auto result = parser.parse(ctx);
+            if (!t.assert_equal("not fail", false, result.fail())) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+            }
+
+            common_chat_msg msg;
+            auto            mapper = common_chat_peg_mapper(msg);
+            mapper.from_ast(ctx.ast, result);
+
+            //t.log("Input: " + input);
+            t.log("===========================================");
+            t.log("Iteration " + std::to_string(in.size()));
+            t.log("Reasoning: " + msg.reasoning_content);
+            t.log("Content  : " + msg.content);
+            for (const auto & tc : msg.tool_calls) {
+                t.log("Tool name: " + tc.name);
+                t.log("Tool args: " + tc.arguments);
+            }
+
+            try {
+                // This shouldn't emit any runtime errors
+                auto diffs = common_chat_msg_diff::compute_diffs(prev, msg);
+            } catch (const std::exception & e) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+                t.assert_true(std::string("failed with ") + e.what(), false);
+            }
+
+            prev = msg;
+        }
+    });
+}
+
+static void test_example_qwen3_non_coder(testing & t) {
+    auto tools  = create_tools();
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        // tool calling parser using standard JSON format
+        auto tool_call = p.standard_json_tools("<tool_call>", "</tool_call>", tools, true, false);
+
+        return p.sequence({ p.content(p.until("<tool_call>")), p.optional(p.space() + tool_call), p.end() });
+    });
+
+    auto grammar = build_grammar([&](const common_grammar_builder & builder) {
+        for (const auto & def : tools) {
+            auto function   = def.at("function");
+            auto parameters = function.at("parameters");
+            builder.resolve_refs(parameters);
+        };
+        parser.build_grammar(builder);
+    });
+
+    t.log("Grammar:");
+    for (const auto & line : string_split(grammar, "\n")) {
+        t.log(line);
+    }
+
+    t.test("tool call parsing", [&](testing & t) {
+        std::string input =
+            "I need to get the weather.\n"
+            "<tool_call>"
+            "{\"name\": \"get_current_weather\", \"arguments\": {\"location\": \"New York City, NY\", \"unit\": "
+            "\"fahrenheit\"}}"
+            "</tool_call>";
+
+        common_peg_parse_context ctx(input);
+        auto                     result = parser.parse(ctx);
+
+        t.assert_true("success", result.success());
+
+        common_chat_msg msg;
+        auto            mapper = common_chat_peg_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+
+        t.assert_equal("content", "I need to get the weather.\n", msg.content);
+        t.assert_equal("reasoning", "", msg.reasoning_content);
+        t.assert_equal("tool calls count", 1u, msg.tool_calls.size());
+        if (!msg.tool_calls.empty()) {
+            t.assert_equal("tool name", "get_current_weather", msg.tool_calls[0].name);
+            t.assert_equal("tool args", "{\"location\": \"New York City, NY\", \"unit\": \"fahrenheit\"}",
+                           msg.tool_calls[0].arguments);
+        }
+    });
+
+    t.test("incremental parsing", [&](testing & t) {
+        std::string input =
+            "I need to get the weather.\n"
+            "<tool_call>"
+            "{\"name\": \"get_current_weather\", \"arguments\": {\"location\": \"New York City, NY\", \"unit\": "
+            "\"fahrenheit\"}}"
+            "</tool_call>";
+
+        std::vector<std::string> tokens = simple_tokenize(input);
+
+        common_chat_msg prev;
+        for (auto it = tokens.begin(); it != tokens.end(); it++) {
+            std::string in = std::accumulate(tokens.begin(), it + 1, std::string());
+
+            common_peg_parse_context ctx(in, (it + 1 < tokens.end()) ? COMMON_PEG_PARSE_FLAG_LENIENT : COMMON_PEG_PARSE_FLAG_NONE);
+
+            auto result = parser.parse(ctx);
+            if (!t.assert_equal("not fail", false, result.fail())) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+            }
+
+            common_chat_msg msg;
+            auto            mapper = common_chat_peg_mapper(msg);
+            mapper.from_ast(ctx.ast, result);
+
+            //t.log("Input: " + input);
+            t.log("===========================================");
+            t.log("Iteration " + std::to_string(in.size()));
+            t.log("Reasoning: " + msg.reasoning_content);
+            t.log("Content  : " + msg.content);
+            for (const auto & tc : msg.tool_calls) {
+                t.log("Tool name: " + tc.name);
+                t.log("Tool args: " + tc.arguments);
+            }
+
+            try {
+                // This shouldn't emit any runtime errors
+                auto diffs = common_chat_msg_diff::compute_diffs(prev, msg);
+            } catch (const std::exception & e) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+                t.assert_true(std::string("failed with ") + e.what(), false);
+            }
+
+            prev = msg;
+        }
+    });
+}
+
+void test_command7_parser_compare(testing & t) {
+    auto parser = build_chat_peg_parser([](common_chat_peg_builder & p) {
+        auto thinking =
+            p.reasoning_block("<|START_THINKING|>" << p.reasoning(p.until("<|END_THINKING|>")) << "<|END_THINKING|>");
+
+        auto response = "<|START_RESPONSE|>" << p.content(p.until("<|END_RESPONSE|>")) << "<|END_RESPONSE|>";
+
+        auto tool_call_id = p.atomic("\"tool_call_id\"" << (":" << ("\"" + p.tool_id(p.string_content('"')) + "\"")));
+        auto tool_call_name =
+            p.atomic("\"tool_name\"" << (":" << ("\"" + p.tool_name(p.string_content('"')) + "\"")));
+        auto tool_call_args = "\"parameters\"" << (":" << p.tool_args(p.json()));
+
+        auto tool_call_fields = p.rule("tool-call-fields", tool_call_id | tool_call_name | tool_call_args);
+        auto tool_call =
+            p.rule("tool-call", p.tool(p.tool_open(p.literal("{"))
+                                       << tool_call_fields << p.zero_or_more(p.literal(",") << tool_call_fields)
+                                       << p.tool_close(p.literal("}"))));
+
+        auto tool_calls = p.rule(
+            "tool-calls", "<|START_ACTION|>" << ("[" << tool_call << p.zero_or_more(p.literal(",") << tool_call) << "]")
+                                             << "<|END_ACTION|>");
+
+        return p.optional(thinking) << (tool_calls | response) + p.end();
+    });
+
+    auto test_current = [&](const common_peg_arena & p, const std::string & input, bool is_partial,
+                            bool print_results) {
+        common_peg_parse_context ctx(input, is_partial ? COMMON_PEG_PARSE_FLAG_LENIENT : COMMON_PEG_PARSE_FLAG_NONE);
+        auto                     result = p.parse(ctx);
+
+        common_chat_msg msg;
+        auto            mapper = common_chat_peg_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+
+        if (print_results) {
+            std::cout << "== Parsed (new) ==\n";
+            std::cout << "=== Reasoning ===\n";
+            std::cout << msg.reasoning_content << "\n";
+            std::cout << "\n\n=== Content ===\n";
+            std::cout << msg.content << "\n";
+            std::cout << "\n\n=== Tool Calls ===\n";
+            for (const auto & tc : msg.tool_calls) {
+                std::cout << "id: " << tc.id << "\n";
+                std::cout << "name: " << tc.name << "\n";
+                std::cout << "args: " << tc.arguments << "\n";
+            }
+        }
+    };
+
+    std::string reasoning =
+        "To plan an effective trip to Japan that includes both historical sites and modern attractions within a "
+        "budget of $4000 for a two-week stay, we need to:\n\n"
+        "1. Identify key historical sites and modern attractions in Japan.\n"
+        "2. Find affordable accommodation options that provide a balance between comfort and cost.\n"
+        "3. Determine the best modes of transportation for getting around Japan.\n"
+        "4. Create a day-by-day itinerary that ensures the user gets to see a variety of attractions without "
+        "overspending.\n"
+        "5. Provide a detailed cost breakdown that includes accommodation, transportation, meals, and entry fees "
+        "to attractions.";
+
+    std::vector<std::tuple<std::string, std::string, nlohmann::json>> tool_calls = {
+        { "call_0", "plan_trip", nlohmann::json::parse(R"({
+            "destination": "Japan",
+            "duration": 14,
+            "budget": 4000,
+            "interests": ["historical sites", "modern attractions"],
+            "accommodation_preferences": "affordable",
+            "transportation_preferences": "efficient",
+            "meal_preferences": "local cuisine"
+        })") }
+    };
+
+    std::vector<std::string> tokens;
+
+    // Build tokens
+    if (!reasoning.empty()) {
+        auto tokenized = simple_tokenize(reasoning);
+        tokens.emplace_back("<|START_THINKING|>");
+        tokens.insert(tokens.end(), tokenized.begin(), tokenized.end());
+        tokens.emplace_back("<|END_THINKING|>");
+    }
+
+    if (!tool_calls.empty()) {
+        tokens.emplace_back("<|START_ACTION|>");
+
+        auto json = nlohmann::json::array();
+        for (const auto & tc : tool_calls) {
+            auto tc_json            = nlohmann::json::object();
+            tc_json["tool_call_id"] = std::get<0>(tc);
+            tc_json["tool_name"]    = std::get<1>(tc);
+            tc_json["parameters"]   = std::get<2>(tc);
+            json.push_back(tc_json);
+        }
+
+        auto tokenized = simple_tokenize(json.dump(-1, ' ', true));
+        tokens.insert(tokens.end(), tokenized.begin(), tokenized.end());
+
+        tokens.emplace_back("<|END_ACTION|>");
+    }
+
+    std::string input = std::accumulate(tokens.begin(), tokens.end(), std::string());
+
+    t.test("current_parse", [&](testing & /* t */) { test_current(parser, input, false, false); });
+    t.bench("current_parse_benchmark complete", [&]() { test_current(parser, input, false, false); }, 100);
+    t.bench(
+        "current_parse_benchmark incremental",
+        [&]() {
+            std::string in;
+            for (auto i = 0u; i < tokens.size(); i++) {
+                in += tokens[i];
+                test_current(parser, in, i + 1 < tokens.size(), false);
+            }
+        },
+        20);
+}
+
+// Test that tool names that are proper prefixes of other tool names don't cause
+// premature matching during incremental parsing.
+// For example, "special_function" should not match when parsing "special_function_with_opt".
+static void test_prefix_tool_names(testing & t) {
+    // Create tools where one name is a proper prefix of another
+    json tools = json::array();
+
+    json tool_short = {
+        { "type", "function" },
+        { "function",
+          {
+              { "name", "special_function" },
+              { "description", "A special function" },
+              { "parameters",
+                {
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "arg1", { { "type", "integer" } } },
+                      } },
+                    { "required", { "arg1" } },
+                } },
+          } }
+    };
+    tools.push_back(tool_short);
+
+    json tool_long = {
+        { "type", "function" },
+        { "function",
+          {
+              { "name", "special_function_with_opt" },
+              { "description", "A special function with optional params" },
+              { "parameters",
+                {
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "arg1", { { "type", "integer" } } },
+                          { "arg2", { { "type", "integer" } } },
+                      } },
+                    { "required", { "arg1" } },
+                } },
+          } }
+    };
+    tools.push_back(tool_long);
+
+    // Use standard_constructed_tools which had the prefix matching bug
+    std::map<std::string, std::string> markers = {
+        { "tool_call_start_marker", "<tool_call>" },
+        { "tool_call_end_marker", "</tool_call>" },
+        { "function_opener", "<function=" },
+        { "function_closer", "</function>" },
+        { "function_name_suffix", ">" },
+        { "parameter_key_prefix", "<param=" },
+        { "parameter_key_suffix", ">" },
+        { "parameter_closer", "</param>" },
+    };
+
+    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        auto content   = p.rule("content", p.content(p.until("<tool_call>")));
+        auto tool_call = p.standard_constructed_tools(markers, tools, false, false);
+        return content + p.zero_or_more(p.space() + tool_call) + p.end();
+    });
+
+    // Test parsing the long tool name - this should NOT trigger the short tool name
+    t.test("parse long tool name", [&](testing & t) {
+        std::string input =
+            "Let me call the function."
+            "<tool_call>"
+            "<function=special_function_with_opt>"
+            "<param=arg1>42</param>"
+            "</function>"
+            "</tool_call>";
+
+        common_peg_parse_context ctx(input);
+        auto                     result = parser.parse(ctx);
+
+        t.assert_true("success", result.success());
+
+        common_chat_msg msg;
+        auto            mapper = common_chat_peg_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+
+        t.assert_equal("content", "Let me call the function.", msg.content);
+        t.assert_equal("tool calls count", 1u, msg.tool_calls.size());
+        if (!msg.tool_calls.empty()) {
+            t.assert_equal("tool name", "special_function_with_opt", msg.tool_calls[0].name);
+        }
+    });
+
+    // Test incremental parsing - the key test case
+    // This ensures that when incrementally parsing "special_function_with_opt",
+    // we don't prematurely emit "special_function" as a tool call
+    t.test("incremental parse long tool name", [&](testing & t) {
+        std::string input =
+            "Let me call the function."
+            "<tool_call>"
+            "<function=special_function_with_opt>"
+            "<param=arg1>42</param>"
+            "</function>"
+            "</tool_call>";
+
+        std::vector<std::string> tokens = simple_tokenize(input);
+
+        common_chat_msg prev;
+        for (auto it = tokens.begin(); it != tokens.end(); it++) {
+            std::string in = std::accumulate(tokens.begin(), it + 1, std::string());
+
+            common_peg_parse_context ctx(in, (it + 1 < tokens.end()) ? COMMON_PEG_PARSE_FLAG_LENIENT : COMMON_PEG_PARSE_FLAG_NONE);
+            auto                     result = parser.parse(ctx);
+
+            if (!t.assert_equal("not fail", false, result.fail())) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+                return;
+            }
+
+            common_chat_msg msg;
+            auto            mapper = common_chat_peg_mapper(msg);
+            mapper.from_ast(ctx.ast, result);
+
+            // The critical check: during incremental parsing, we should never
+            // see "special_function" as the tool name when parsing "special_function_with_opt"
+            for (const auto & tc : msg.tool_calls) {
+                if (!t.assert_equal("tool name should not be short prefix", false,
+                                    tc.name == "special_function")) {
+                    t.log("Premature tool name match at input: " + in);
+                    return;
+                }
+            }
+
+            try {
+                auto diffs = common_chat_msg_diff::compute_diffs(prev, msg);
+            } catch (const std::exception & e) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+                t.assert_true(std::string("diff failed with ") + e.what(), false);
+                return;
+            }
+
+            prev = msg;
+        }
+
+        // Final check: the complete parse should have the correct tool name
+        t.assert_equal("final tool calls count", 1u, prev.tool_calls.size());
+        if (!prev.tool_calls.empty()) {
+            t.assert_equal("final tool name", "special_function_with_opt", prev.tool_calls[0].name);
+        }
+    });
+
+    // Test parsing the short tool name still works
+    t.test("parse short tool name", [&](testing & t) {
+        std::string input =
+            "Let me call the function."
+            "<tool_call>"
+            "<function=special_function>"
+            "<param=arg1>42</param>"
+            "</function>"
+            "</tool_call>";
+
+        common_peg_parse_context ctx(input);
+        auto                     result = parser.parse(ctx);
+
+        t.assert_true("success", result.success());
+
+        common_chat_msg msg;
+        auto            mapper = common_chat_peg_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+
+        t.assert_equal("content", "Let me call the function.", msg.content);
+        t.assert_equal("tool calls count", 1u, msg.tool_calls.size());
+        if (!msg.tool_calls.empty()) {
+            t.assert_equal("tool name", "special_function", msg.tool_calls[0].name);
+        }
+    });
+}
+
+static void test_tagged_peg_parser(testing & t) {
+    t.test("basic tag extraction", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.tag("greeting", p.until(" ")) + " " + p.tag("name", p.rest()) + p.end();
+        });
+
+        auto result = parser.parse_and_extract("Hello World");
+        t.assert_true("success", result.result.success());
+        t.assert_equal("greeting tag", "Hello", result.tags.at("greeting"));
+        t.assert_equal("name tag", "World", result.tags.at("name"));
+    });
+
+    t.test("duplicate tags overwrite", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.tag("item", p.until(",")) + "," + p.tag("item", p.rest()) + p.end();
+        });
+
+        auto result = parser.parse_and_extract("first,second");
+        t.assert_true("success", result.result.success());
+        t.assert_equal("item tag", "second", result.tags.at("item"));
+    });
+
+    t.test("no tags extracted", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.rest() + p.end();
+        });
+
+        auto result = parser.parse_and_extract("Hello");
+        t.assert_true("success", result.result.success());
+        t.assert_equal("empty tags", 0u, result.tags.size());
+    });
+
+    t.test("structured extraction", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            auto header = p.tag("header", p.until("\n"));
+            auto body = p.tag("body", p.rest());
+            return header + "\n" + body + p.end();
+        });
+
+        auto result = parser.parse_and_extract("Title\nBody content here");
+        t.assert_true("success", result.result.success());
+        t.assert_equal("header", "Title", result.tags.at("header"));
+        t.assert_equal("body", "Body content here", result.tags.at("body"));
+    });
+
+    t.test("partial parse", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.tag("prefix", p.until(":")) + ":" + p.tag("value", p.rest()) + p.end();
+        });
+
+        auto result = parser.parse_and_extract("key:val", COMMON_PEG_PARSE_FLAG_LENIENT);
+        t.assert_true("not fail", !result.result.fail());
+        t.assert_equal("prefix tag", "key", result.tags.at("prefix"));
+        t.assert_equal("value tag", "val", result.tags.at("value"));
+    });
+
+    t.test("find in the middle", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.choice({ p.literal("{"), p.literal(":") }) + p.space() + p.literal("\"") + p.atomic(p.literal("fun_name"));
+        });
+
+        std::string tpl = "This is a very long jinja template string. We have tools. We will try to call them now: <tool_call>{ \"fun_name\" : { \"arg\" : 1 }</tool_call>";
+        auto result = parser.parse_anywhere_and_extract(tpl);
+        t.assert_true("success", result.result.success());
+    });
+
+    t.test("fail find in the middle", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.choice({ p.literal("{"), p.literal(":") }) + p.space() + p.literal("\"") + p.atomic(p.literal("fun_name"));
+        });
+
+        std::string tpl = "This is a very long jinja template string. We have tools. We will try to call them now: <tool_call><fun=fun_name><arg name=arg>1</arg></tool_call>";
+        auto result = parser.parse_anywhere_and_extract(tpl);
+        t.assert_true("failure", result.result.fail());
+    });
+
+    t.test("find function tag with name", [&](testing &t) {
+        std::string haystack = "\n<tool_call>\n<function=foofoo>\n<parameter=first>\nXXXX\n</parameter>\n<parameter=second>\nYYYY\n</parameter>\n</function>\n</tool_call>\n";
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            std::string needle = "foofoo";
+            return p.tag("fun_marker", p.choice({
+            p.tag("fun_pre", p.literal("<") + p.until_one_of({ ">", needle })) + p.literal(needle) +
+                p.tag("fun_post", p.negate(p.space() + p.literal("<")) + p.until(">") + p.literal(">")) + p.space(),
+            p.tag("fun_pre", p.literal("[") + p.until_one_of({ "]", needle })) + p.literal(needle) +
+                p.tag("fun_post", p.negate(p.space() + p.literal("[") + p.until("]") + p.literal("]")) + p.space()) }));
+        });
+        auto result = parser.parse_anywhere_and_extract(haystack);
+        t.assert_true("success", result.result.success());
+        t.assert_equal("fun_pre should be '<function='", "<function=", result.tags["fun_pre"]);
+        t.assert_equal("fun_post should be '>'", ">", result.tags["fun_post"]);
+    });
+}
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -0,0 +1,712 @@
+#include <string>
+#include <utility>
+#include <vector>
+#include <sstream>
+#include <regex>
+#include <iostream>
+#include <fstream>
+#include <filesystem>
+
+#include <nlohmann/json.hpp>
+
+#undef NDEBUG
+#include <cassert>
+
+#include "llama.h"
+#include "common.h"
+#include "chat.h"
+#include "jinja/runtime.h"
+#include "jinja/parser.h"
+#include "jinja/lexer.h"
+#include "jinja/caps.h"
+
+using json = nlohmann::ordered_json;
+
+static int main_automated_tests(void);
+
+static void run_multiple(const std::string& dir_path, bool stop_on_first_failure, const json& input, bool use_common = false);
+static void run_single(const std::string& contents, json input, bool use_common = false, const std::string & output_path = "");
+
+static std::string HELP = R"(
+Usage: test-chat-template [OPTIONS] PATH_TO_TEMPLATE
+Options:
+  -h, --help               Show this help message and exit.
+  --with-tools             Add a tool and a tool call to the default JSON input
+  --json <path>            Path to the JSON input file.
+  --stop-on-first-fail     Stop testing on the first failure (default: false).
+  --no-common              Use direct Jinja engine instead of common chat templates (default: use common).
+  --output <path>          Path to output results (only for single template runs).
+If PATH_TO_TEMPLATE is a file, runs that single template.
+If PATH_TO_TEMPLATE is a directory, runs all .jinja files in that directory.
+If PATH_TO_TEMPLATE is omitted, runs automated tests (default CI mode).
+)";
+
+static std::string DEFAULT_JSON = R"({
+    "messages": [
+        {
+            "role": "user",
+            "content": "Hello, how are you?"
+        },
+        {
+            "role": "assistant",
+            "content": "I am fine, thank you!"
+        }
+    ],
+    "bos_token": "<s>",
+    "eos_token": "</s>",
+    "add_generation_prompt": true
+})";
+
+static std::string DEFAULT_JSON_WITH_TOOLS = R"({
+    "messages": [
+        {
+            "role": "user",
+            "content": "Hello, how are you?"
+        },
+        {
+            "role": "assistant",
+            "content": "I am fine, thank you!"
+        },
+        {
+            "role": "user",
+            "content": "Call a tool!"
+        },
+        {
+            "role": "assistant",
+            "tool_calls": [
+                {
+                    "id": "call00001",
+                    "type": "function",
+                    "function": {
+                        "name": "test",
+                        "arguments": { "arg": "hello" }
+                    }
+                }
+            ]
+        }
+    ],
+    "tools": [
+        {
+            "type": "function",
+            "function": {
+                "name": "test",
+                "description": "Test",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "arg": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "required": ["arg"]
+            }
+        }
+    ],
+    "bos_token": "<s>",
+    "eos_token": "</s>",
+    "add_generation_prompt": true
+})";
+
+
+int main(int argc, char ** argv) {
+    std::vector<std::string> args(argv, argv + argc);
+
+    std::string tmpl_path;
+    std::string json_path;
+    std::string output_path;
+    std::string & json_to_use = DEFAULT_JSON;
+    bool stop_on_first_fail = false;
+    bool use_common = true;
+
+    for (size_t i = 1; i < args.size(); i++) {
+        if (args[i] == "--help" || args[i] == "-h") {
+            std::cout << HELP << "\n";
+            return 0;
+        }
+        if (args[i] == "--json" && i + 1 < args.size()) {
+            json_path = args[i + 1];
+            i++;
+        } else if (args[i] == "--with-tools") {
+            json_to_use = DEFAULT_JSON_WITH_TOOLS;
+        } else if (args[i] == "--stop-on-first-fail") {
+            stop_on_first_fail = true;
+        } else if (args[i] == "--output" && i + 1 < args.size()) {
+            output_path = args[i + 1];
+            i++;
+        } else if (args[i] == "--no-common") {
+            use_common = true;
+        } else if (tmpl_path.empty()) {
+            tmpl_path = args[i];
+        } else {
+            std::cerr << "Unknown argument: " << args[i] << "\n";
+            std::cout << HELP << "\n";
+            return 1;
+        }
+    }
+
+    if (tmpl_path.empty()) {
+        return main_automated_tests();
+    }
+
+    json input_json;
+    if (!json_path.empty()) {
+        std::ifstream json_file(json_path);
+        if (!json_file) {
+            std::cerr << "Error: Could not open JSON file: " << json_path << "\n";
+            return 1;
+        }
+        std::string content = std::string(
+            std::istreambuf_iterator<char>(json_file),
+            std::istreambuf_iterator<char>());
+        input_json = json::parse(content);
+    } else {
+        input_json = json::parse(json_to_use);
+    }
+
+    std::filesystem::path p(tmpl_path);
+    if (std::filesystem::is_directory(p)) {
+        run_multiple(tmpl_path, stop_on_first_fail, input_json, use_common);
+    } else if (std::filesystem::is_regular_file(p)) {
+        std::ifstream infile(tmpl_path);
+        std::string contents = std::string(
+            std::istreambuf_iterator<char>(infile),
+            std::istreambuf_iterator<char>());
+        run_single(contents, input_json, use_common, output_path);
+    } else {
+        std::cerr << "Error: PATH_TO_TEMPLATE is not a valid file or directory: " << tmpl_path << "\n";
+        return 1;
+    }
+
+    return 0;
+}
+
+void run_multiple(const std::string& dir_path, bool stop_on_first_fail, const json& input, bool use_common) {
+    std::vector<std::string> failed_tests;
+
+    // list all files in models/templates/ and run each
+    size_t test_count = 0;
+
+    for (const auto & entry : std::filesystem::directory_iterator(dir_path)) {
+        // only process .jinja files
+        if (entry.path().extension() == ".jinja" && entry.is_regular_file()) {
+            test_count++;
+            std::cout << "\n\n=== RUNNING TEMPLATE FILE: " << entry.path().string() << " ===\n";
+            std::ifstream infile(entry.path());
+            std::string contents((std::istreambuf_iterator<char>(infile)), std::istreambuf_iterator<char>());
+            try {
+                run_single(contents, input, use_common);
+            } catch (const std::exception & e) {
+                std::cout << "Exception: " << e.what() << "\n";
+                std::cout << "=== ERROR WITH TEMPLATE FILE: " << entry.path().string() << " ===\n";
+                failed_tests.push_back(entry.path().string());
+                if (stop_on_first_fail) {
+                    break;
+                }
+            }
+        }
+    }
+
+    std::cout << "\n\n=== TEST SUMMARY ===\n";
+    std::cout << "Total tests run: " << test_count << "\n";
+    std::cout << "Total failed tests: " << failed_tests.size() << "\n";
+    for (const auto & test : failed_tests) {
+        std::cout << "FAILED TEST: " << test << "\n";
+    }
+}
+
+
+static std::string normalize_newlines(const std::string & s) {
+#ifdef _WIN32
+  static const std::regex nl_regex("\r\n");
+  return std::regex_replace(s, nl_regex, "\n");
+#else
+  return s;
+#endif
+}
+
+
+static std::string format_using_common(
+            const std::string & template_str,
+            const std::string & bos_token,
+            const std::string & eos_token,
+            std::vector<common_chat_msg> & messages,
+            std::vector<common_chat_tool> tools = {}) {
+    auto tmpls = common_chat_templates_init(/* model= */ nullptr, template_str, bos_token, eos_token);
+    common_chat_templates_inputs inputs;
+    inputs.use_jinja = true;
+    inputs.messages = messages;
+    inputs.tools = std::move(tools);
+    inputs.add_generation_prompt = true;
+    auto output = common_chat_templates_apply(tmpls.get(), inputs).prompt;
+    output = normalize_newlines(output);
+    return output;
+}
+
+
+// skip libcommon, use direct jinja engine
+static jinja::value_string format_using_direct_engine(
+            const std::string & template_str,
+            json & input) {
+    // lexing
+    jinja::lexer lexer;
+    auto lexer_res = lexer.tokenize(template_str);
+
+    // compile to AST
+    jinja::program ast = jinja::parse_from_tokens(lexer_res);
+
+    // check caps for workarounds
+    jinja::caps_get(ast);
+
+    std::cout << "\n=== RUN ===\n";
+    jinja::context ctx(template_str);
+
+    jinja::global_from_json(ctx, input, true);
+
+    jinja::runtime runtime(ctx);
+    const jinja::value results = runtime.execute(ast);
+    auto parts = jinja::runtime::gather_string_parts(results);
+
+    std::cout << "\n=== RESULTS ===\n";
+    for (const auto & part : parts->as_string().parts) {
+        std::cout << (part.is_input ? "DATA" : "TMPL") << ": " << part.val << "\n";
+    }
+
+    return parts;
+}
+
+
+void run_single(const std::string& contents, json input, bool use_common, const std::string & output_path) {
+    jinja::enable_debug(true);
+
+    jinja::value_string output_parts;
+
+    if (use_common) {
+        std::string bos_token = "<s>";
+        std::string eos_token = "</s>";
+        if (input.contains("bos_token")) {
+            bos_token = input["bos_token"].get<std::string>();
+        }
+        if (input.contains("eos_token")) {
+            eos_token = input["eos_token"].get<std::string>();
+        }
+        nlohmann::ordered_json msgs_json = input["messages"];
+        nlohmann::ordered_json tools_json = input["tools"];
+        auto messages = common_chat_msgs_parse_oaicompat(msgs_json);
+        auto tools = common_chat_tools_parse_oaicompat(tools_json);
+        auto output = format_using_common(contents, bos_token, eos_token, messages, tools);
+        std::cout << "\n=== OUTPUT ===\n";
+        std::cout << output << "\n";
+        output_parts = jinja::mk_val<jinja::value_string>(output);
+
+    } else {
+        output_parts = format_using_direct_engine(contents, input);
+        std::cout << "\n=== OUTPUT ===\n";
+        std::cout << output_parts->as_string().str() << "\n";
+    }
+
+    if (!output_path.empty()) {
+        std::ofstream outfile(output_path);
+        if (!outfile) {
+            throw std::runtime_error("Could not open output file: " + output_path);
+        }
+        outfile << output_parts->as_string().str();
+        outfile.close();
+        std::cout << "\n=== OUTPUT WRITTEN TO " << output_path << " ===\n";
+    }
+}
+
+
+
+
+
+//
+// Automated tests for chat templates
+//
+
+#define U8C(x) (const char*)(u8##x)
+
+static common_chat_msg simple_msg(const std::string & role, const std::string & content) {
+    common_chat_msg msg;
+    msg.role = role;
+    msg.content = content;
+    return msg;
+}
+
+int main_automated_tests(void) {
+    // jinja::enable_debug(true);
+
+    std::vector<llama_chat_message> conversation {
+        {"system", "You are a helpful assistant"},
+        {"user", "Hello"},
+        {"assistant", "Hi there"},
+        {"user", "Who are you"},
+        {"assistant", "   I am an assistant   "},
+        {"user", "Another question"},
+    };
+
+    // std::string wrong = /* .template_str= */ u8"[gMASK]<sop>{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}";
+    struct TestCase {
+        std::string name;
+        std::string template_str;
+        std::string expected_output;
+        std::string expected_output_jinja;
+        std::string bos_token = "";
+        std::string eos_token = "";
+        bool supported_with_jinja = true;
+        std::vector<llama_chat_message> extra_conversation = {};
+    };
+    std::vector<TestCase> test_cases {
+        {
+            /* .name= */ "teknium/OpenHermes-2.5-Mistral-7B",
+            /* .template_str= */ "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+            /* .expected_output= */ "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n   I am an assistant   <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "mistralai/Mistral-7B-Instruct-v0.2 (NOTE: Old pre-v1 without a system prompt)",
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+            /* .expected_output= */ "[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "TheBloke/FusionNet_34Bx2_MoE-AWQ",
+            /* .template_str= */ "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}",
+            /* .expected_output= */       "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST]Hi there</s><s>[INST] Who are you [/INST]   I am an assistant   </s><s>[INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
+            /* .bos_token= */ "<s>",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "bofenghuang/vigogne-2-70b-chat",
+            /* .template_str= */ "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+            /* .expected_output= */       "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST]Hi there</s>[INST] Who are you [/INST]I am an assistant</s>[INST] Another question [/INST]",
+            /* .expected_output_jinja= */ "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "mlabonne/AlphaMonarch-7B",
+            /* .template_str= */ "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
+            /* .expected_output= */ "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
+            /* .expected_output_jinja= */ "<s>system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
+            /* .bos_token= */ "<s>",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "google/gemma-7b-it",
+            /* .template_str= */ "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
+            /* .expected_output= */       "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+            /* .expected_output_jinja= */ "<start_of_turn>user\nYou are a helpful assistant\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+        },
+        {
+            /* .name= */ "OrionStarAI/Orion-14B-Chat",
+            /* .template_str= */ "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
+            /* .expected_output= */       "Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s>   I am an assistant   </s>Human: Another question\n\nAssistant: </s>",
+            /* .expected_output_jinja= */ "Human: You are a helpful assistant\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s>   I am an assistant   </s>Human: Another question\n\nAssistant: </s>",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "openchat/openchat-3.5-0106",
+            // The included chat_template differs from the author's suggestions here: https://huggingface.co/openchat/openchat_3.5/discussions/5#65448109b4a3f3a2f486fd9d
+            // So we match against the included template but implement the suggested version.
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
+            /* .expected_output= */                            "You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant:    I am an assistant   <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:",
+            /* .expected_output_jinja= */ "GPT4 Correct System: You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant:    I am an assistant   <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:",
+        },
+        {
+            /* .name= */ "deepseek-ai/deepseek-coder-33b-instruct",
+            /* .template_str= */ "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
+            /* .expected_output= */ "You are a helpful assistant### Instruction:\nHello\n### Response:\nHi there\n<|EOT|>\n### Instruction:\nWho are you\n### Response:\n   I am an assistant   \n<|EOT|>\n### Instruction:\nAnother question\n### Response:\n",
+            /* .expected_output_jinja= */ "",
+        },
+        {
+            /* .name= */ "eachadea/vicuna-13b-1.1",
+            // No template included in tokenizer_config.json, so this template likely needs to be manually set.
+            /* .template_str= */ "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '' + message['content'] + '\n\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
+            /* .expected_output= */ "You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT:    I am an assistant   </s>\nUSER: Another question\nASSISTANT:",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "Orca-Vicuna",
+            // No template included in tokenizer_config.json, so this template likely needs to be manually set.
+            /* .template_str= */ "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
+            /* .expected_output= */ "SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT:    I am an assistant   </s>\nUSER: Another question\nASSISTANT:",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "CohereForAI/c4ai-command-r-plus",
+            /* .template_str= */ "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
+            /* .expected_output= */ "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+            /* .expected_output_jinja= */ "",
+        },
+        {
+            /* .name= */ "Llama-3",
+            /* .template_str= */ "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
+            /* .expected_output= */ "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+            /* .expected_output_jinja= */ "",
+        },
+        {
+            /* .name= */ "Phi-3-mini",
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+            /* .expected_output= */     "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+            /* .expected_output_jinja= */ "<|user|>\nYou are a helpful assistant\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        },
+        {
+            /* .name= */ "Phi-3-small",
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+            /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+            /* .expected_output_jinja= */ "",
+        },
+        {
+            /* .name= */ "Phi-3-medium",
+            /* .template_str= */ "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+            /* .expected_output= */     "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+            /* .expected_output_jinja= */ "<|user|>\nYou are a helpful assistant\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        },
+        {
+            /* .name= */ "Phi-3-vision",
+            /* .template_str= */ "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
+            /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "ChatGLM3",
+            /* .template_str= */ "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
+            /* .expected_output= */       "[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n    I am an assistant   <|user|>\n Another question<|assistant|>",
+            /* .expected_output_jinja= */ "[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n    I am an assistant   <|user|>\n Another question<|assistant|>",
+        },
+        {
+            /* .name= */ "ChatGLM4",
+            /* .template_str= */ U8C("[gMASK]<sop>{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}"),
+            /* .expected_output= */ "[gMASK]<sop><|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n   I am an assistant   <|user|>\nAnother question<|assistant|>\n",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "GLMEdge",
+            /* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>",
+            /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n   I am an assistant   <|user|>\nAnother question<|assistant|>",
+            /* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n   I am an assistant   <|user|>\nAnother question<|assistant|>",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "MiniCPM-3B-OpenHermes-2.5-v2-GGUF",
+            /* .template_str= */ U8C("{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"),
+            /* .expected_output= */ U8C("You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>"),
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "DeepSeek-V2",
+            /* .template_str= */ "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+            /* .expected_output= */ U8C("You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<｜end▁of▁sentence｜>User: Who are you\n\nAssistant:    I am an assistant   <｜end▁of▁sentence｜>User: Another question\n\nAssistant:"),
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "<｜end▁of▁sentence｜>",
+        },
+        {
+            /* .name= */ "ibm-granite/granite-3.0-8b-instruct",
+            /* .template_str= */ "{%- if tools %}\n    {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n    {%- for tool in tools %}\n    {{- tool | tojson(indent=4) }}\n    {%- if not loop.last %}\n        {{- '\n\n' }}\n    {%- endif %}\n    {%- endfor %}\n    {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n    {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'user' %}\n    {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'assistant' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>'  + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'assistant_tool_call' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- elif message['role'] == 'tool_response' %}\n    {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n    {%- endif %}\n    {%- if loop.last and add_generation_prompt %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n    {%- endif %}\n{%- endfor %}",
+            /* .expected_output= */       "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>   I am an assistant   <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>",
+            /* .expected_output_jinja= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>   I am an assistant   <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>",
+        },
+        {
+            /* .name= */ "mistralai/Mistral-7B-Instruct-v0.2 (mistralai 'v1' template with a system prompt)",
+            /* .template_str= */ "{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n",
+            /* .expected_output= */ " [INST] You are a helpful assistant\n\nHello [/INST] Hi there</s> [INST] Who are you [/INST]    I am an assistant   </s> [INST] Another question [/INST]",
+            /* .expected_output_jinja= */ " [INST] You are a helpful assistant\n\nHello [/INST] Hi there</s> [INST] Who are you [/INST]    I am an assistant   </s> [INST] Another question [/INST]",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "Mistral-Large-Instruct-2407 (mistralai 'v3' template; modified to have system prompt at start)",
+            /* .template_str= */ "{%- if messages[0][\"role\"] == \"system\" %}\n    {%- set system_message = messages[0][\"content\"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n            {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message[\"role\"] == \"user\" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- \"[AVAILABLE_TOOLS] [\" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- '{\"type\": \"function\", \"function\": {' }}\n                {%- for key, val in tool.items() if key != \"return\" %}\n                    {%- if val is string %}\n                        {{- '\"' + key + '\": \"' + val + '\"' }}\n                    {%- else %}\n                        {{- '\"' + key + '\": ' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- \", \" }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- \"}}\" }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- else %}\n                    {{- \"]\" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- \"[/AVAILABLE_TOOLS]\" }}\n            {%- endif %}\n        {%- if loop.last and system_message is defined %}\n            {{- \"[INST] \" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n        {%- else %}\n            {{- \"[INST] \" + message[\"content\"] + \"[/INST]\" }}\n        {%- endif %}\n    {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n        {{- \"[TOOL_CALLS] [\" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n            {%- endif %}\n            {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n            {%- if not loop.last %}\n                {{- \", \" }}\n            {%- else %}\n                {{- \"]\" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message[\"role\"] == \"assistant\" %}\n        {{- \" \" + message[\"content\"]|trim + eos_token}}\n    {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- '[TOOL_RESULTS] {\"content\": ' + content|string + \", \" }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n        {%- endif %}\n        {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n    {%- else %}\n        {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n    {%- endif %}\n{%- endfor %}\n",
+            /* .expected_output= */       "[INST] You are a helpful assistant\n\nHello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant</s>[INST] Another question[/INST]",
+            /* .expected_output_jinja= */ "[INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant</s>[INST] You are a helpful assistant\n\nAnother question[/INST]",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "Mistral-Nemo-Instruct-2407 (mistralai 'v3-tekken' template; modified to have system prompt at start)",
+            /* .template_str= */ "{%- if messages[0][\"role\"] == \"system\" %}\n    {%- set system_message = messages[0][\"content\"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n            {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message[\"role\"] == \"user\" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- \"[AVAILABLE_TOOLS][\" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- '{\"type\": \"function\", \"function\": {' }}\n                {%- for key, val in tool.items() if key != \"return\" %}\n                    {%- if val is string %}\n                        {{- '\"' + key + '\": \"' + val + '\"' }}\n                    {%- else %}\n                        {{- '\"' + key + '\": ' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- \", \" }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- \"}}\" }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- else %}\n                    {{- \"]\" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- \"[/AVAILABLE_TOOLS]\" }}\n            {%- endif %}\n        {%- if loop.last and system_message is defined %}\n            {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n        {%- else %}\n            {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n        {%- endif %}\n    {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n        {{- \"[TOOL_CALLS][\" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n            {%- endif %}\n            {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n            {%- if not loop.last %}\n                {{- \", \" }}\n            {%- else %}\n                {{- \"]\" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message[\"role\"] == \"assistant\" %}\n        {{- message[\"content\"] + eos_token}}\n    {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n        {%- endif %}\n        {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n    {%- else %}\n        {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n    {%- endif %}\n{%- endfor %}\n",
+            /* .expected_output= */       "[INST]You are a helpful assistant\n\nHello[/INST]Hi there</s>[INST]Who are you[/INST]   I am an assistant   </s>[INST]Another question[/INST]",
+            /* .expected_output_jinja= */ "[INST]Hello[/INST]Hi there</s>[INST]Who are you[/INST]   I am an assistant   </s>[INST]You are a helpful assistant\n\nAnother question[/INST]",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template)",
+            /* .template_str= */ "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
+            /* .expected_output= */ "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST]    I am an assistant   </s>[INST] Another question[/INST]",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "</s>",
+        },
+        {
+            /* .name= */ "ai-sage/GigaChat-20B-A3B-instruct",
+            /* .template_str= */ "{% if messages[0]['role'] == 'system' -%}\n    {%- set loop_messages = messages[1:] -%}\n    {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n    {%- set loop_messages = messages -%}\n    {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n    {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n    {% endif %}\n    \n    {%- if loop.index0 == 0 -%}\n        {{ system_message -}}\n    {%- endif -%}\n    {%- if message['role'] == 'user' -%}\n        {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n        {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3]  + additional_special_tokens[1] -}}\n    {%- endif -%}\n    {%- if message['role'] == 'assistant' -%}\n        {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n    {%- endif -%}\n    {%- if loop.last and add_generation_prompt -%}\n        {{ 'assistant' + additional_special_tokens[0] -}}\n    {%- endif -%}\n{%- endfor %}",
+            /* .expected_output= */ "<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>   I am an assistant   <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+            /* .supported_with_jinja= */ false, // Requires additional_special_tokens as extra context
+        },
+        {
+            /* .name= */ "Infinigence/Megrez-3B-Instruct",
+            /* .template_str= */ U8C("{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|role_start|>system<|role_end|>你是Megrez-3B-Instruct，将针对用户的问题给出详细的、积极的回答。<|turn_end|>' }}{% endif %}{{ '<|role_start|>' + message['role'] + '<|role_end|>' + message['content'] + '<|turn_end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|role_start|>assistant<|role_end|>' }}{% endif %}"),
+            /* .expected_output= */ "<|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|><|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>Hi there<|turn_end|><|role_start|>user<|role_end|>Who are you<|turn_end|><|role_start|>assistant<|role_end|>   I am an assistant   <|turn_end|><|role_start|>user<|role_end|>Another question<|turn_end|><|role_start|>assistant<|role_end|>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "phi-4",
+            /* .template_str= */ "{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|><|im_start|>assistant<|im_sep|>'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}",
+            /* .expected_output= */ "<|im_start|>system<|im_sep|>You are a helpful assistant<|im_end|><|im_start|>user<|im_sep|>Hello<|im_end|><|im_start|>assistant<|im_sep|>Hi there<|im_end|><|im_start|>user<|im_sep|>Who are you<|im_end|><|im_start|>assistant<|im_sep|>   I am an assistant   <|im_end|><|im_start|>user<|im_sep|>Another question<|im_end|><|im_start|>assistant<|im_sep|>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "yandex/YandexGPT-5-Lite-8B-instruct",
+            /* .template_str= */ "<s>{%- set names = {'assistant': ' Ассистент:', 'user': ' Пользователь:'} %}\n{%- set tools_prefix = 'Тебе доступны следующие функции:' %}\n{%- macro __render_tool(tool) %}\n    {%- set name = tool.function.name %}\n    {%- set description = tool.function.description|default('') %}\n    {%- set parameters = tool.function.parameters|tojson %}\n    {{- '\\n' }}function {{ '{' }}'name':'{{ name }}',\n    {%- if tool.function.description %}'description':'{{ description }}',{% endif %}\n'parameters':{{ parameters }}\n    {{- '}' }}\n{%- endmacro %}\n{%- macro __render_tools(tools) %}\n    {{- tools_prefix }}\n    {%- for tool in tools %}\n        {{- __render_tool(tool) }}\n    {%- endfor %}\n    {{- '\\n\\n' }}\n{%- endmacro %}\n{%- macro __render_tool_message(message) %}\n    {{- '\\n\\nРезультат вызова' }} {{ message.name }}: {{ message.content }} {{ '\\n\\n' }}\n{%- endmacro %}\n{%- if tools -%}\n    {{- __render_tools(tools) }}\n{%- endif -%}\n{%- macro __render_user_message(message) %}\n{{ names.user }} {{ message.content + '\\n\\n' }}\n{%- endmacro %}\n{%- macro __render_assistant_message(message) %}\n    {{- names.assistant }}\n    {%- set call = message['function_call'] %}\n    {%- if call %}\n        {{- '\\n[TOOL_CALL_START]' }}{{ call.name }}{{ '\\n' }}{{ call.arguments|tojson }}\n    {%- else %}\n        {{- ' ' + message.content + '\\n\\n' }}\n    {%- endif %}\n{%- endmacro %}\n{%- if not add_generation_prompt is defined %}\n{%- set add_generation_prompt = false %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'user' %}\n        {{- __render_user_message(message) }}\n    {%- endif %}\n    {%- if message.role == 'assistant' and not loop.last %}\n        {{- __render_assistant_message(message) }}\n    {%- endif %}\n    {%- if message.role == 'tool' %}\n        {{- __render_tool_message(message) }}\n    {%- endif %}\n    {%- if loop.last %}\n        {{- ' Ассистент:[SEP]' }}\n    {%- endif %}\n{%- endfor %}\n",
+            /* .expected_output= */ " Пользователь: Hello\n\n Ассистент: Hi there\n\n Пользователь: Who are you\n\n Ассистент:    I am an assistant   \n\n Пользователь: Another question\n\n Ассистент:[SEP]",
+            /* .expected_output_jinja= */ "<s> Пользователь: You are a helpful assistant\nHello\n\n Ассистент: Hi there\n\n Пользователь: Who are you\n\n Ассистент:    I am an assistant   \n\n Пользователь: Another question\n\n Ассистент:[SEP]",
+            /* .bos_token= */ "<s>",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "inclusionAI/Ling-lite",
+            /* .template_str */ "{% for message in messages %}{% set role = message['role'] | lower %}{% if role == 'user' %}{% set role = 'HUMAN' %}{% endif %}{% set role = role | upper %}{{ '<role>' + role + '</role>' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '<role>ASSISTANT</role>' }}{% endif %}",
+            /* .expected_output= */ "<role>SYSTEM</role>You are a helpful assistant<role>HUMAN</role>Hello<role>ASSISTANT</role>Hi there<role>HUMAN</role>Who are you<role>ASSISTANT</role>   I am an assistant   <role>HUMAN</role>Another question<role>ASSISTANT</role>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+        },
+        {
+            /* .name= */ "ByteDance-Seed/Seed-OSS-36B-Instruct",
+            /* .template_str */ "{# <seed:bos> #}{%- for message in messages %}{%- if message.role in [\"user\", \"system\"] %}{{ bos_token + message.role + \"\\n\" + message.content + eos_token }}{%- elif message.role == \"assistant\" %}{{ bos_token + message.role }}{%- if message.content is defined and message.content is string and message.content|trim|length > 0 %}{{ \"\\n\" + message.content|trim + eos_token }}{%- endif %}{%- else %}{{ bos_token + message.role + \"\\n\" + message.content + eos_token }}{%- endif %}{%- endfor %}{%- if add_generation_prompt %}{{ bos_token + \"assistant\\n\" }}{%- endif %}",
+            /* .expected_output= */ "<seed:bos>system\nYou are a helpful assistant<seed:eos><seed:bos>user\nHello<seed:eos><seed:bos>assistant\nHi there<seed:eos><seed:bos>user\nWho are you<seed:eos><seed:bos>assistant\nI am an assistant<seed:eos><seed:bos>user\nAnother question<seed:eos><seed:bos>assistant\n",
+            /* .expected_output_jinja= */ "<seed:bos>system\nYou are a helpful assistant<seed:eos><seed:bos>user\nHello<seed:eos><seed:bos>assistant\nHi there<seed:eos><seed:bos>user\nWho are you<seed:eos><seed:bos>assistant\nI am an assistant<seed:eos><seed:bos>user\nAnother question<seed:eos><seed:bos>assistant\n",
+            /* .bos_token= */ "<seed:bos>",
+            /* .eos_token= */ "<seed:eos>",
+        },
+        {
+            /* .name= */ "ibm-granite/granite-3.x (tool call)",
+            /* .template_str= */ "{%- for message in messages %}\n    {%- if message['role'] == 'assistant_tool_call' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\\n' }}\n    {%- else %}\n    {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\\n' }}\n    {%- endif %}\n    {%- if loop.last and add_generation_prompt %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n    {%- endif %}\n{%- endfor %}",
+            /* .expected_output= */       "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>   I am an assistant   <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What is the weather?<|end_of_text|>\n<|start_of_role|>assistant_tool_call<|end_of_role|><|tool_call|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}]<|end_of_text|>\n<|start_of_role|>tool_response<|end_of_role|>{\"temperature\": 72}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>",
+            /* .expected_output_jinja= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>   I am an assistant   <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What is the weather?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|><|tool_call|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}]<|end_of_text|>\n<|start_of_role|>tool_response<|end_of_role|>{\"temperature\": 72}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+            /* .supported_with_jinja= */ true,
+            /* .extra_conversation= */ {{"user", "What is the weather?"}, {"assistant_tool_call", "[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}]"}, {"tool_response", "{\"temperature\": 72}"}},
+        },
+        {
+            /* .name= */ "ibm-granite/granite-4.0 (tool call)",
+            /* .template_str= */ "{%- for message in messages %}\n    {%- if message['role'] == 'assistant_tool_call' %}\n    {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\\n' }}\n    {%- else %}\n    {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\\n' }}\n    {%- endif %}\n    {%- if loop.last and add_generation_prompt %}\n    {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n    {%- endif %}\n{%- endfor %}\n{# <tool_call> <tools> #}",
+            /* .expected_output= */       "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>   I am an assistant   <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What is the weather?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|><|tool_call|><tool_call>\n{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}\n</tool_call><|end_of_text|>\n<|start_of_role|>tool_response<|end_of_role|>{\"temperature\": 72}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+            /* .supported_with_jinja= */ true,
+            /* .extra_conversation= */ {{"user", "What is the weather?"}, {"assistant_tool_call", "<tool_call>\n{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}\n</tool_call>"}, {"tool_response", "{\"temperature\": 72}"}},
+        }
+    };
+    std::vector<char> formatted_chat(1024);
+    int32_t res;
+
+    // list all supported templates
+    std::vector<const char *> supported_tmpl;
+    res = llama_chat_builtin_templates(nullptr, 0);
+    assert(res > 0);
+    supported_tmpl.resize(res);
+    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+    std::cout << "Built-in chat templates:\n";
+    for (const auto *tmpl : supported_tmpl) {
+        std::cout << "  " << tmpl << "\n";
+    }
+
+    // test invalid chat template
+    res = llama_chat_apply_template("INVALID TEMPLATE", conversation.data(), conversation.size(), true, formatted_chat.data(), formatted_chat.size());
+    assert(res < 0);
+    const auto add_generation_prompt = true;
+
+    for (const auto & test_case : test_cases) {
+        std::cout << "\n\n=== " << test_case.name << " ===\n\n";
+        auto conv = conversation;
+        conv.insert(conv.end(), test_case.extra_conversation.begin(), test_case.extra_conversation.end());
+        formatted_chat.resize(2048);
+        res = llama_chat_apply_template(
+            test_case.template_str.c_str(),
+            conv.data(),
+            conv.size(),
+            add_generation_prompt,
+            formatted_chat.data(),
+            formatted_chat.size()
+        );
+        formatted_chat.resize(res);
+        std::string output(formatted_chat.data(), formatted_chat.size());
+        if (output != test_case.expected_output) {
+            std::cout << "Expected:\n" << test_case.expected_output << "\n";
+            std::cout << "-------------------------\n";
+            std::cout << "Actual:\n" << output << "\n";
+            std::cout.flush();
+            assert(output == test_case.expected_output);
+        }
+    }
+
+    std::vector<common_chat_msg> messages;
+    messages.reserve(conversation.size());
+    for (const auto & msg : conversation) {
+        messages.push_back(simple_msg(msg.role, msg.content));
+    }
+    for (const auto & test_case : test_cases) {
+        if (!test_case.supported_with_jinja) {
+            continue;
+        }
+        std::cout << "\n\n=== " << test_case.name << " (jinja) ===\n\n";
+        try {
+            auto msgs = messages;
+            for (const auto & msg : test_case.extra_conversation) {
+                msgs.push_back(simple_msg(msg.role, msg.content));
+            }
+            auto output = format_using_common(
+                                test_case.template_str,
+                                test_case.bos_token,
+                                test_case.eos_token,
+                                msgs);
+            auto expected_output = normalize_newlines(test_case.expected_output_jinja.empty() ? test_case.expected_output : test_case.expected_output_jinja);
+            if (output != expected_output) {
+                std::cout << "Template:```\n" << test_case.template_str << "\n```";
+                std::cout << "-------------------------\n";
+                std::cout << "Expected:```\n" << expected_output << "\n```";
+                std::cout << "-------------------------\n";
+                std::cout << "Actual:```\n" << output << "\n```";
+                std::cout.flush();
+                assert(output == expected_output);
+            }
+        } catch (const std::exception & e) {
+            std::cerr << "ERROR: " << e.what() << "\n";
+            assert(false);
+        }
+    }
+
+    std::cout << "\nOK: All tests passed successfully.\n";
+
+    return 0;
+}
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@@ -0,0 +1,57 @@
+// These tests may take a long time!
+// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
+// This is done by checking all finite (non-NaN, non-infinite) floats.
+
+#undef NDEBUG
+#include <cassert>
+#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
+#include <immintrin.h>
+#endif
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
+// ggml.c::quantize_row_q4_0_ref
+inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
+
+// ggml.c::ggml_silu_f32
+inline static float silu_orig(float x) {
+    return x/(1.0 + exp(-x));
+}
+
+#pragma GCC diagnostic pop
+
+// ggml.c::quantize_row_q4_0_ref
+inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
+
+// ggml.c::ggml_silu_f32
+inline static float silu_float(float x) {
+    return x/(1.0f + expf(-x));
+}
+
+int main(void) {
+    uint32_t x = UINT32_MAX;
+    do {
+        float f;
+        memcpy(&f, &x, sizeof(x));
+        assert(!std::isfinite(f) || (round_orig(f) == round_float(f)));
+    } while (x--);
+
+#ifdef __F16C__
+    // GELU and SILU implementations are used with a FP16 lookup table.
+    // The original and float-only results are not equal for all inputs after converting to FP16.
+    // GELU is an approximation anyway (tanh), not tested here.
+    // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
+    for (x = 0; x <= UINT16_MAX; x++) {
+        float f = _cvtsh_ss(x);
+        const float so = silu_orig(f);
+        const float sf = silu_float(f);
+        assert(   (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
+               || (nextafterf(so, sf) == sf)
+               || (nextafterf(sf, so) == so));
+    }
+#endif
+}
--- a/tests/test-gbnf-validator.cpp
+++ b/tests/test-gbnf-validator.cpp
@@ -0,0 +1,109 @@
+#include "../src/unicode.h"
+#include "../src/llama-grammar.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+    const auto cpts = unicode_cpts_from_utf8(input_str);
+
+    auto & stacks_cur = llama_grammar_get_stacks(grammar);
+
+    size_t pos = 0;
+    for (const auto & cpt : cpts) {
+        llama_grammar_accept(grammar, cpt);
+
+        if (stacks_cur.empty()) {
+            error_pos = pos;
+            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
+            return false;
+        }
+        ++pos;
+    }
+
+    for (const auto & stack : stacks_cur) {
+        if (stack.empty()) {
+            return true;
+        }
+    }
+
+    error_pos = pos;
+    error_msg = "Unexpected end of input";
+    return false;
+}
+
+static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
+    fprintf(stdout, "Input string is invalid according to the grammar.\n");
+    fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "Input string:\n");
+    fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
+    if (error_pos < input_str.size()) {
+        fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
+        if (error_pos+1 < input_str.size()) {
+            fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
+        }
+        fprintf(stdout, "\033[0m\n");
+    }
+}
+
+int main(int argc, char** argv) {
+    if (argc != 3) {
+        fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string grammar_filename = argv[1];
+    const std::string input_filename = argv[2];
+
+    // Read the GBNF grammar file
+    FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
+    if (!grammar_file) {
+        fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
+        return 1;
+    }
+
+    std::string grammar_str;
+    {
+        std::ifstream grammar_file(grammar_filename);
+        GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
+        std::stringstream buffer;
+        buffer << grammar_file.rdbuf();
+        grammar_str = buffer.str();
+    }
+
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
+    if (grammar == nullptr) {
+        fprintf(stdout, "Failed to initialize llama_grammar\n");
+        return 1;
+    }
+    // Read the input file
+    std::string input_str;
+    {
+        std::ifstream input_file(input_filename);
+        GGML_ASSERT(input_file.is_open() && "Failed to open input file");
+        std::stringstream buffer;
+        buffer << input_file.rdbuf();
+        input_str = buffer.str();
+    }
+
+    // Validate the input string against the grammar
+    size_t error_pos;
+    std::string error_msg;
+    bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
+
+    if (is_valid) {
+        fprintf(stdout, "Input string is valid according to the grammar.\n");
+    } else {
+        print_error_message(input_str, error_pos, error_msg);
+    }
+
+    // Clean up
+    llama_grammar_free_impl(grammar);
+
+    return 0;
+}
--- a/tests/test-gguf-model-data.cpp
+++ b/tests/test-gguf-model-data.cpp
@@ -0,0 +1,154 @@
+#include "gguf-model-data.h"
+
+#include <cstdio>
+
+#define TEST_ASSERT(cond, msg) \
+    do { \
+        if (!(cond)) { \
+            fprintf(stderr, "FAIL: %s (line %d): %s\n", #cond, __LINE__, msg); \
+            return 1; \
+        } \
+    } while (0)
+
+int main() {
+    fprintf(stderr, "=== test-gguf-model-data ===\n");
+
+    // Fetch Qwen3-0.6B Q8_0 metadata
+    auto result = gguf_fetch_model_meta("ggml-org/Qwen3-0.6B-GGUF", "Q8_0");
+
+    if (!result.has_value()) {
+        fprintf(stderr, "SKIP: could not fetch model metadata (no network or HTTP disabled)\n");
+        return 0;
+    }
+
+    const auto & model = result.value();
+
+    fprintf(stderr, "Architecture:  %s\n", model.architecture.c_str());
+    fprintf(stderr, "n_embd:        %u\n", model.n_embd);
+    fprintf(stderr, "n_ff:          %u\n", model.n_ff);
+    fprintf(stderr, "n_vocab:       %u\n", model.n_vocab);
+    fprintf(stderr, "n_layer:       %u\n", model.n_layer);
+    fprintf(stderr, "n_head:        %u\n", model.n_head);
+    fprintf(stderr, "n_head_kv:     %u\n", model.n_head_kv);
+    fprintf(stderr, "n_expert:      %u\n", model.n_expert);
+    fprintf(stderr, "n_embd_head_k: %u\n", model.n_embd_head_k);
+    fprintf(stderr, "n_embd_head_v: %u\n", model.n_embd_head_v);
+    fprintf(stderr, "tensors:       %zu\n", model.tensors.size());
+
+    // Verify architecture
+    TEST_ASSERT(model.architecture == "qwen3", "expected architecture 'qwen3'");
+
+    // Verify key dimensions (Qwen3-0.6B)
+    TEST_ASSERT(model.n_layer == 28, "expected n_layer == 28");
+    TEST_ASSERT(model.n_embd == 1024, "expected n_embd == 1024");
+    TEST_ASSERT(model.n_head == 16, "expected n_head == 16");
+    TEST_ASSERT(model.n_head_kv == 8, "expected n_head_kv == 8");
+    TEST_ASSERT(model.n_expert == 0, "expected n_expert == 0 (not MoE)");
+    TEST_ASSERT(model.n_vocab == 151936, "expected n_vocab == 151936");
+
+    // Verify tensor count
+    TEST_ASSERT(model.tensors.size() == 311, "expected tensor count == 311");
+
+    // Verify known tensor names exist
+    bool found_attn_q = false;
+    bool found_token_embd = false;
+    bool found_output_norm = false;
+    for (const auto & t : model.tensors) {
+        if (t.name == "blk.0.attn_q.weight") {
+            found_attn_q = true;
+        }
+        if (t.name == "token_embd.weight") {
+            found_token_embd = true;
+        }
+        if (t.name == "output_norm.weight") {
+            found_output_norm = true;
+        }
+    }
+    TEST_ASSERT(found_attn_q, "expected tensor 'blk.0.attn_q.weight'");
+    TEST_ASSERT(found_token_embd, "expected tensor 'token_embd.weight'");
+    TEST_ASSERT(found_output_norm, "expected tensor 'output_norm.weight'");
+
+    // Verify token_embd.weight shape
+    for (const auto & t : model.tensors) {
+        if (t.name == "token_embd.weight") {
+            TEST_ASSERT(t.ne[0] == 1024, "expected token_embd.weight ne[0] == 1024");
+            TEST_ASSERT(t.n_dims == 2, "expected token_embd.weight to be 2D");
+            break;
+        }
+    }
+
+    // Test that second call uses cache (just call again, it should work)
+    auto result2 = gguf_fetch_model_meta("ggml-org/Qwen3-0.6B-GGUF", "Q8_0");
+    TEST_ASSERT(result2.has_value(), "cached fetch should succeed");
+    TEST_ASSERT(result2->tensors.size() == model.tensors.size(), "cached result should match");
+
+    // Test a split MoE model without specifying quant (should default to Q8_0)
+    auto result3 = gguf_fetch_model_meta("ggml-org/GLM-4.6V-GGUF");
+    if (!result3.has_value()) {
+        fprintf(stderr, "SKIP: could not fetch GLM-4.6V metadata (no network?)\n");
+        return 0;
+    }
+    const auto & model3 = result3.value();
+
+    fprintf(stderr, "Architecture:  %s\n", model3.architecture.c_str());
+    fprintf(stderr, "n_embd:        %u\n", model3.n_embd);
+    fprintf(stderr, "n_ff:          %u\n", model3.n_ff);
+    fprintf(stderr, "n_vocab:       %u\n", model3.n_vocab);
+    fprintf(stderr, "n_layer:       %u\n", model3.n_layer);
+    fprintf(stderr, "n_head:        %u\n", model3.n_head);
+    fprintf(stderr, "n_head_kv:     %u\n", model3.n_head_kv);
+    fprintf(stderr, "n_expert:      %u\n", model3.n_expert);
+    fprintf(stderr, "n_embd_head_k: %u\n", model3.n_embd_head_k);
+    fprintf(stderr, "n_embd_head_v: %u\n", model3.n_embd_head_v);
+    fprintf(stderr, "tensors:       %zu\n", model3.tensors.size());
+
+    // Verify architecture
+    TEST_ASSERT(model3.architecture == "glm4moe", "expected architecture 'glm4moe'");
+
+    // Verify key dimensions (GLM-4.6V)
+    TEST_ASSERT(model3.n_layer == 46, "expected n_layer == 46");
+    TEST_ASSERT(model3.n_embd == 4096, "expected n_embd == 4096");
+    TEST_ASSERT(model3.n_head == 96, "expected n_head == 96");
+    TEST_ASSERT(model3.n_head_kv == 8, "expected n_head_kv == 8");
+    TEST_ASSERT(model3.n_expert == 128, "expected n_expert == 128 (MoE)");
+    TEST_ASSERT(model3.n_vocab == 151552, "expected n_vocab == 151552");
+
+    // Verify tensor count
+    TEST_ASSERT(model3.tensors.size() == 780, "expected tensor count == 780");
+
+    // Test a hybrid-attention model with array-valued head counts
+    auto result4 = gguf_fetch_model_meta("ggml-org/Step-3.5-Flash-GGUF", "Q4_K");
+    if (!result4.has_value()) {
+        fprintf(stderr, "FAIL: could not fetch Step-3.5-Flash metadata\n");
+        return 1;
+    }
+    const auto & model4 = result4.value();
+
+    fprintf(stderr, "Architecture:  %s\n", model4.architecture.c_str());
+    fprintf(stderr, "n_embd:        %u\n", model4.n_embd);
+    fprintf(stderr, "n_ff:          %u\n", model4.n_ff);
+    fprintf(stderr, "n_vocab:       %u\n", model4.n_vocab);
+    fprintf(stderr, "n_layer:       %u\n", model4.n_layer);
+    fprintf(stderr, "n_head:        %u\n", model4.n_head);
+    fprintf(stderr, "n_head_kv:     %u\n", model4.n_head_kv);
+    fprintf(stderr, "n_expert:      %u\n", model4.n_expert);
+    fprintf(stderr, "n_embd_head_k: %u\n", model4.n_embd_head_k);
+    fprintf(stderr, "n_embd_head_v: %u\n", model4.n_embd_head_v);
+    fprintf(stderr, "tensors:       %zu\n", model4.tensors.size());
+
+    TEST_ASSERT(model4.architecture == "step35", "expected architecture 'step35'");
+
+    TEST_ASSERT(model4.n_layer == 45, "expected n_layer == 45");
+    TEST_ASSERT(model4.n_embd == 4096, "expected n_embd == 4096");
+    TEST_ASSERT(model4.n_ff == 11264, "expected n_ff == 11264");
+    TEST_ASSERT(model4.n_head == 64, "expected n_head == 64 (first element of per-layer array)");
+    TEST_ASSERT(model4.n_head_kv == 8, "expected n_head_kv == 8 (first element of per-layer array)");
+    TEST_ASSERT(model4.n_expert == 288, "expected n_expert == 288");
+    TEST_ASSERT(model4.n_embd_head_k == 128, "expected n_embd_head_k == 128");
+    TEST_ASSERT(model4.n_embd_head_v == 128, "expected n_embd_head_v == 128");
+    TEST_ASSERT(model4.n_vocab == 128896, "expected n_vocab == 128896");
+    TEST_ASSERT(model4.tensors.size() == 754, "expected tensor count == 754");
+
+    fprintf(stderr, "=== ALL TESTS PASSED ===\n");
+    return 0;
+}
--- a/tests/test-gguf.cpp
+++ b/tests/test-gguf.cpp
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
--- a/tests/test-grammar-llguidance.cpp
+++ b/tests/test-grammar-llguidance.cpp
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@@ -0,0 +1,537 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include "llama.h"
+
+// TODO: shold not include libllama sources
+#include "../src/llama-grammar.h"
+
+#include <cassert>
+
+static const char * type_str(llama_gretype type) {
+    switch (type) {
+        case LLAMA_GRETYPE_CHAR: return "LLAMA_GRETYPE_CHAR";
+        case LLAMA_GRETYPE_CHAR_NOT: return "LLAMA_GRETYPE_CHAR_NOT";
+        case LLAMA_GRETYPE_CHAR_ALT: return "LLAMA_GRETYPE_CHAR_ALT";
+        case LLAMA_GRETYPE_CHAR_RNG_UPPER: return "LLAMA_GRETYPE_CHAR_RNG_UPPER";
+        case LLAMA_GRETYPE_RULE_REF: return "LLAMA_GRETYPE_RULE_REF";
+        case LLAMA_GRETYPE_ALT: return "LLAMA_GRETYPE_ALT";
+        case LLAMA_GRETYPE_END: return "LLAMA_GRETYPE_END";
+        default: return "?";
+    }
+}
+
+static void verify_parsing(const char *grammar_bytes, const std::vector<std::pair<std::string, uint32_t>> expected, const std::vector<llama_grammar_element> &expected_rules) {
+    uint32_t index = 0;
+    llama_grammar_parser parsed_grammar;
+    parsed_grammar.parse(grammar_bytes);
+
+    std::map<uint32_t, std::string> symbol_names;
+    for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) {
+        symbol_names[it->second] = it->first;
+    }
+
+    auto print_all = [&]() {
+        fprintf(stderr, "    verify_parsing(R\"\"\"(%s)\"\"\", {\n", grammar_bytes);
+        for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) {
+            fprintf(stderr, "        {\"%s\", %u},\n", it->first.c_str(), it->second);
+        }
+        fprintf(stderr, "    }, {\n");
+        for (size_t i_rule = 0; i_rule < parsed_grammar.rules.size(); i_rule++) {
+            fprintf(stderr, "        // %s (index %zu)\n", symbol_names[i_rule].c_str(), i_rule);
+            auto & rule = parsed_grammar.rules[i_rule];
+            for (uint32_t i = 0; i < rule.size(); i++) {
+                std::string rule_str;
+                fprintf(stderr, "        {%s, ", type_str(rule[i].type));
+                if (rule[i].type == LLAMA_GRETYPE_CHAR || rule[i].type == LLAMA_GRETYPE_CHAR_ALT ||
+                    rule[i].type == LLAMA_GRETYPE_CHAR_NOT || rule[i].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+                    char c = rule[i].value;
+                    if (c == '\n') {
+                        fprintf(stderr, "'\\n'");
+                    } else if (c == '\t') {
+                        fprintf(stderr, "'\\t'");
+                    } else if (c == '\r') {
+                        fprintf(stderr, "'\\r'");
+                    } else if (c == '\0') {
+                        fprintf(stderr, "'\\0'");
+                    } else {
+                        fprintf(stderr, "'%c'", c);
+                    }
+                } else if (rule[i].type == LLAMA_GRETYPE_RULE_REF) {
+                    fprintf(stderr, "/* %s */ %u", symbol_names[rule[i].value].c_str(), rule[i].value);
+                } else {
+                    fprintf(stderr, "%u", rule[i].value);
+                }
+                fprintf(stderr, "},\n");
+            }
+        }
+        fprintf(stderr, "    });\n");
+    };
+
+    if (getenv("TEST_GRAMMAR_PARSER_PRINT_ALL")) {
+        print_all();
+        fprintf(stderr, "\n");
+        return;
+    }
+
+    fprintf(stderr, "Testing grammar:%s\n", grammar_bytes);
+
+    if (parsed_grammar.symbol_ids.size() != expected.size()) {
+        fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
+        print_all();
+        assert(parsed_grammar.symbol_ids.size() == expected.size());
+    }
+
+    for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
+    {
+        std::string key = it->first;
+        uint32_t value = it->second;
+        std::pair<std::string, uint32_t> expected_pair = expected[index];
+
+        // pretty print error message before asserting
+        if (expected_pair.first != key || expected_pair.second != value)
+        {
+            fprintf(stderr, "index: %u\n", index);
+            fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
+            fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
+            fprintf(stderr, "expected_pair != actual_pair\n");
+            fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
+            print_all();
+        }
+
+        assert(expected_pair.first == key && expected_pair.second == value);
+
+        index++;
+    }
+
+    index = 0;
+    for (auto rule : parsed_grammar.rules)
+    {
+        // compare rule to expected rule
+        for (uint32_t i = 0; i < rule.size(); i++)
+        {
+            llama_grammar_element element = rule[i];
+            llama_grammar_element expected_element = expected_rules[index];
+
+            // pretty print error message before asserting
+            if (expected_element.type != element.type || expected_element.value != element.value)
+            {
+                fprintf(stderr, "index: %u\n", index);
+                fprintf(stderr, "expected_element: %s, %u\n", type_str(expected_element.type), expected_element.value);
+                fprintf(stderr, "actual_element: %s, %u\n", type_str(element.type), element.value);
+                fprintf(stderr, "expected_element != actual_element\n");
+                fprintf(stderr, "all elements:\n");
+                fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
+                print_all();
+            }
+
+            assert(expected_element.type == element.type && expected_element.value == element.value);
+            index++;
+        }
+    }
+}
+
+static void verify_failure(const char * grammar_bytes) {
+    fprintf(stderr, "Testing expected failure:%s\n", grammar_bytes);
+    llama_grammar_parser result;
+    result.parse(grammar_bytes);
+    assert(result.rules.empty() && "should have failed");
+}
+
+int main()
+{
+    verify_failure(R"""(
+        root ::= "a"{,}"
+    )""");
+
+    verify_failure(R"""(
+        root ::= (((((([^x]*){0,99}){0,99}){0,99}){0,99}){0,99}){0,99}
+    )""");
+
+    verify_failure(R"""(
+        root ::= "a"{,10}"
+    )""");
+
+    verify_parsing(R"""(
+        root  ::= "a"
+    )""", {
+        {"root", 0},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a" | [bdx-z] | [^1-3]
+    )""", {
+        {"root", 0},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_CHAR, 'b'},
+        {LLAMA_GRETYPE_CHAR_ALT, 'd'},
+        {LLAMA_GRETYPE_CHAR_ALT, 'x'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_CHAR_NOT, '1'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '3'},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= a+
+        a     ::= "a"
+    )""", {
+        {"a", 1},
+        {"root", 0},
+        {"root_2", 2},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_END, 0},
+        // a (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+        // root_2 (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"+
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= a?
+        a     ::= "a"
+    )""", {
+        {"a", 1},
+        {"root", 0},
+        {"root_2", 2},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_END, 0},
+        // a (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+        // root_2 (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"?
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= a*
+        a     ::= "a"
+    )""", {
+        {"a", 1},
+        {"root", 0},
+        {"root_2", 2},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_END, 0},
+        // a (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+        // root_2 (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"*
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"{2}
+    )""", {
+        {"root", 0},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"{2,}
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"{ 4}
+    )""", {
+        {"root", 0},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= "a"{2,4}
+    )""", {
+        {"root", 0},
+        {"root_1", 1},
+        {"root_2", 2},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // root_2 (index 2)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= (expr "=" term "\n")+
+        expr  ::= term ([-+*/] term)*
+        term  ::= [0-9]+
+    )""", {
+        {"expr", 2},
+        {"expr_5", 5},
+        {"expr_6", 6},
+        {"root", 0},
+        {"root_1", 1},
+        {"root_4", 4},
+        {"term", 3},
+        {"term_7", 7},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
+        {LLAMA_GRETYPE_CHAR, '='},
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 3},
+        {LLAMA_GRETYPE_CHAR, '\n'},
+        {LLAMA_GRETYPE_END, 0},
+        // expr (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 3},
+        {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
+        {LLAMA_GRETYPE_END, 0},
+        // term (index 3)
+        {LLAMA_GRETYPE_CHAR, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7},
+        {LLAMA_GRETYPE_END, 0},
+        // root_4 (index 4)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // expr_5 (index 5)
+        {LLAMA_GRETYPE_CHAR, '-'},
+        {LLAMA_GRETYPE_CHAR_ALT, '+'},
+        {LLAMA_GRETYPE_CHAR_ALT, '*'},
+        {LLAMA_GRETYPE_CHAR_ALT, '/'},
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 3},
+        {LLAMA_GRETYPE_END, 0},
+        // expr_6 (index 6)
+        {LLAMA_GRETYPE_RULE_REF, /* expr_5 */ 5},
+        {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // term_7 (index 7)
+        {LLAMA_GRETYPE_CHAR, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    verify_parsing(R"""(
+        root  ::= (expr "=" ws term "\n")+
+        expr  ::= term ([-+*/] term)*
+        term  ::= ident | num | "(" ws expr ")" ws
+        ident ::= [a-z] [a-z0-9_]* ws
+        num   ::= [0-9]+ ws
+        ws    ::= [ \t\n]*
+    )""", {
+        {"expr", 2},
+        {"expr_6", 6},
+        {"expr_7", 7},
+        {"ident", 8},
+        {"ident_10", 10},
+        {"num", 9},
+        {"num_11", 11},
+        {"root", 0},
+        {"root_1", 1},
+        {"root_5", 5},
+        {"term", 4},
+        {"ws", 3},
+        {"ws_12", 12},
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5},
+        {LLAMA_GRETYPE_END, 0},
+        // root_1 (index 1)
+        {LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
+        {LLAMA_GRETYPE_CHAR, '='},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 4},
+        {LLAMA_GRETYPE_CHAR, '\n'},
+        {LLAMA_GRETYPE_END, 0},
+        // expr (index 2)
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 4},
+        {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7},
+        {LLAMA_GRETYPE_END, 0},
+        // ws (index 3)
+        {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12},
+        {LLAMA_GRETYPE_END, 0},
+        // term (index 4)
+        {LLAMA_GRETYPE_RULE_REF, /* ident */ 8},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_RULE_REF, /* num */ 9},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_CHAR, '('},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
+        {LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
+        {LLAMA_GRETYPE_CHAR, ')'},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
+        {LLAMA_GRETYPE_END, 0},
+        // root_5 (index 5)
+        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
+        {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // expr_6 (index 6)
+        {LLAMA_GRETYPE_CHAR, '-'},
+        {LLAMA_GRETYPE_CHAR_ALT, '+'},
+        {LLAMA_GRETYPE_CHAR_ALT, '*'},
+        {LLAMA_GRETYPE_CHAR_ALT, '/'},
+        {LLAMA_GRETYPE_RULE_REF, /* term */ 4},
+        {LLAMA_GRETYPE_END, 0},
+        // expr_7 (index 7)
+        {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
+        {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // ident (index 8)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
+        {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
+        {LLAMA_GRETYPE_END, 0},
+        // num (index 9)
+        {LLAMA_GRETYPE_CHAR, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11},
+        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
+        {LLAMA_GRETYPE_END, 0},
+        // ident_10 (index 10)
+        {LLAMA_GRETYPE_CHAR, 'a'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
+        {LLAMA_GRETYPE_CHAR_ALT, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_CHAR_ALT, '_'},
+        {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // num_11 (index 11)
+        {LLAMA_GRETYPE_CHAR, '0'},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
+        {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        // ws_12 (index 12)
+        {LLAMA_GRETYPE_CHAR, ' '},
+        {LLAMA_GRETYPE_CHAR_ALT, '\t'},
+        {LLAMA_GRETYPE_CHAR_ALT, '\n'},
+        {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    // <[1000]> = "<think>"
+    // <[1001]> = "</think>"
+    verify_parsing(R"""(
+        root  ::= <[1000]> !<[1001]> <[1001]>
+    )""", {
+        {"root", 0}
+    }, {
+        // root (index 0)
+        {LLAMA_GRETYPE_TOKEN, 1000},
+        {LLAMA_GRETYPE_TOKEN_NOT, 1001},
+        {LLAMA_GRETYPE_TOKEN, 1001},
+        {LLAMA_GRETYPE_END, 0},
+    });
+
+    return 0;
+}
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
--- a/tests/test-json-partial.cpp
+++ b/tests/test-json-partial.cpp
@@ -0,0 +1,287 @@
+#include "common.h"
+#include "json-partial.h"
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+
+template <class T> static void assert_equals(const T & expected, const T & actual) {
+  if (expected != actual) {
+      std::cerr << "Expected: " << expected << std::endl;
+      std::cerr << "Actual: " << actual << std::endl;
+      std::cerr << std::flush;
+      throw std::runtime_error("Test failed");
+  }
+}
+
+static void test_json_healing() {
+  auto parse = [](const std::string & str) {
+      std::cerr << "# Parsing: " << str << '\n';
+      std::string::const_iterator it = str.begin();
+      const auto end = str.end();
+      common_json out;
+      std::string healing_marker = "$llama.cpp.json$";
+      if (common_json_parse(it, end, healing_marker, out)) {
+          auto dump = out.json.dump();
+          std::cerr << "Parsed: " << dump << '\n';
+          std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n';
+          std::string result;
+          if (!out.healing_marker.json_dump_marker.empty()) {
+              auto i = dump.find(out.healing_marker.json_dump_marker);
+              if (i == std::string::npos) {
+                  throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")");
+              }
+              result = dump.substr(0, i);
+          } else {
+            result = dump;
+          }
+          std::cerr << "Result: " << result << '\n';
+          if (string_starts_with(str, result)) {
+            std::cerr << "Failure!\n";
+          }
+        //   return dump;
+      } else {
+        throw std::runtime_error("Failed to parse: " + str);
+      }
+
+  };
+  auto parse_all = [&](const std::string & str) {
+      for (size_t i = 1; i < str.size(); i++) {
+          parse(str.substr(0, i));
+      }
+  };
+  parse_all("{\"a\": \"b\"}");
+  parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}");
+
+  parse_all("[{\"a\": \"b\"}]");
+
+  auto test = [&](const std::vector<std::string> & inputs, const std::string & expected, const std::string & expected_marker) {
+      for (const auto & input : inputs) {
+        common_json out;
+        assert_equals(true, common_json_parse(input, "$foo", out));
+        assert_equals<std::string>(expected, out.json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true));
+        assert_equals<std::string>(expected_marker, out.healing_marker.json_dump_marker);
+      }
+  };
+  // No healing needed:
+  test(
+    {
+      R"([{"a":"b"}, "y"])",
+    },
+    R"([{"a":"b"},"y"])",
+    ""
+  );
+  // Partial literals can't be healed:
+  test(
+    {
+      R"([1)",
+      R"([tru)",
+      R"([n)",
+      R"([nul)",
+      R"([23.2)",
+    },
+    R"(["$foo"])",
+    R"("$foo)"
+  );
+  test(
+    {
+      R"({"a": 1)",
+      R"({"a": tru)",
+      R"({"a": n)",
+      R"({"a": nul)",
+      R"({"a": 23.2)",
+    },
+    R"({"a":"$foo"})",
+    R"("$foo)"
+  );
+  test(
+    {
+      R"({)",
+    },
+    R"({"$foo":1})",
+    R"("$foo)"
+  );
+  test(
+    {
+      R"([)",
+    },
+    R"(["$foo"])",
+    R"("$foo)"
+  );
+  // Healing right after a full literal
+  test(
+    {
+      R"(1 )",
+    },
+    R"(1)",
+    ""
+  );
+  test(
+    {
+      R"(true)",
+      R"(true )",
+    },
+    R"(true)",
+    ""
+  );
+  test(
+    {
+      R"(null)",
+      R"(null )",
+    },
+    R"(null)",
+    ""
+  );
+  test(
+    {
+      R"([1 )",
+    },
+    R"([1,"$foo"])",
+    R"(,"$foo)"
+  );
+  test(
+    {
+      R"([{})",
+      R"([{} )",
+    },
+    R"([{},"$foo"])",
+    R"(,"$foo)"
+  );
+  test(
+    {
+      R"([true)",
+    },
+    // TODO: detect the true/false/null literal was complete
+    R"(["$foo"])",
+    R"("$foo)"
+  );
+  test(
+    {
+      R"([true )",
+    },
+    R"([true,"$foo"])",
+    R"(,"$foo)"
+  );
+  test(
+    {
+      R"([true,)",
+    },
+    R"([true,"$foo"])",
+    R"("$foo)"
+  );
+  // Test nesting
+  test(
+    {
+      R"([{"a": [{"b": [{)",
+    },
+    R"([{"a":[{"b":[{"$foo":1}]}]}])",
+    R"("$foo)"
+  );
+  test(
+    {
+      R"([{"a": [{"b": [)",
+    },
+    R"([{"a":[{"b":["$foo"]}]}])",
+    R"("$foo)"
+  );
+
+  test(
+    {
+      R"([{"a": "b"})",
+      R"([{"a": "b"} )",
+    },
+    R"([{"a":"b"},"$foo"])",
+    R"(,"$foo)"
+  );
+  test(
+    {
+      R"([{"a": "b"},)",
+      R"([{"a": "b"}, )",
+    },
+    R"([{"a":"b"},"$foo"])",
+    R"("$foo)"
+  );
+  test(
+    {
+      R"({ "code)",
+    },
+    R"({"code$foo":1})",
+    R"($foo)"
+  );
+  test(
+    {
+      R"({ "code\)",
+    },
+    R"({"code\\$foo":1})",
+    R"(\$foo)"
+  );
+  test(
+    {
+      R"({ "code")",
+    },
+    R"({"code":"$foo"})",
+    R"(:"$foo)"
+  );
+  test(
+    {
+      R"({ "key")",
+    },
+    R"({"key":"$foo"})",
+    R"(:"$foo)"
+  );
+  // Test unicode escape sequences
+  test(
+    {
+      R"({"a":"\u)",
+    },
+    R"({"a":"\u0000$foo"})",
+    R"(0000$foo)"
+  );
+  test(
+    {
+      R"({"a":"\u00)",
+    },
+    R"({"a":"\u0000$foo"})",
+    R"(00$foo)"
+  );
+  test(
+    {
+      R"({"a":"\ud300)",
+    },
+    R"({"a":"\ud300$foo"})",
+    R"($foo)"
+  );
+  test(
+    {
+      R"({"a":"\ud800)",
+    },
+    R"({"a":"\ud800\udc00$foo"})",
+    R"(\udc00$foo)"
+  );
+  test(
+    {
+      R"({"a":"\ud800\)",
+    },
+    R"({"a":"\ud800\udc00$foo"})",
+    R"(udc00$foo)"
+  );
+  test(
+    {
+      R"({"a":"\ud800\u)",
+    },
+    R"({"a":"\ud800\udc00$foo"})",
+    R"(dc00$foo)"
+  );
+  test(
+    {
+      R"({"a":"\ud800\udc00)",
+    },
+    R"({"a":"\ud800\udc00$foo"})",
+    R"($foo)"
+  );
+}
+
+int main() {
+    test_json_healing();
+    std::cerr << "All tests passed.\n";
+    return 0;
+}
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -0,0 +1,663 @@
+#include "common.h"
+#include "log.h"
+#include "ggml-backend.h"
+#include "ggml.h"
+#include "gguf.h"
+#include "ggml-cpp.h"
+#include "llama.h"
+#include "llama-cpp.h"
+
+// TODO: replace with #include "llama-ext.h" in the future
+#include "../src/llama-arch.h"
+#include "../src/llama-model-saver.h"
+
+#include <cinttypes>
+#include <cstdio>
+#include <cstring>
+#include <cstdint>
+#include <random>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+// normalized mean squared error = mse(a, b) / mse(a, 0)
+static double nmse(const std::vector<float> & a, const std::vector<float> & b) {
+    GGML_ASSERT(a.size() == b.size());
+    double mse_a_b = 0.0;
+    double mse_a_0 = 0.0;
+
+    for (size_t i = 0; i < a.size(); i++) {
+        float a_i = a[i];
+        float b_i = b[i];
+
+        mse_a_b += (a_i - b_i) * (a_i - b_i);
+        mse_a_0 += a_i * a_i;
+    }
+
+    return mse_a_b / mse_a_0;
+}
+
+static void set_tensor_data(struct ggml_tensor * tensor, void * userdata) {
+    std::hash<std::string> hasher;
+    std::mt19937 gen(hasher(tensor->name) + *(const size_t *) userdata);
+    std::normal_distribution<float> dis(0.0f, 1.0e-2f);
+
+    const int64_t ne = ggml_nelements(tensor);
+    if (tensor->type == GGML_TYPE_F32) {
+        std::vector<float> tmp(ne);
+        for (int64_t i = 0; i < ne; i++) {
+            tmp[i] = dis(gen);
+        }
+        ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor));
+    } else if (tensor->type == GGML_TYPE_F16) {
+        std::vector<ggml_fp16_t> tmp(ne);
+        for (int64_t i = 0; i < ne; i++) {
+            tmp[i] = ggml_fp32_to_fp16(dis(gen));
+        }
+        ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor));
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+static void usage(char ** argv) {
+    printf("Usage: %s [-a/--arch arch] [-s/--seed seed] [-v/--verbose]\n", argv[0]);
+}
+
+static std::vector<llama_token> get_tokens(const uint32_t n_tokens, const uint32_t n_vocab, const size_t seed){
+    std::mt19937 gen(seed);
+    std::uniform_int_distribution<> dis(0, n_vocab - 1);
+    std::vector<llama_token> ret;
+    ret.reserve(n_tokens);
+    for (uint32_t i = 0; i < n_tokens; i++) {
+        ret.push_back(dis(gen));
+    }
+    return ret;
+}
+
+static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
+    gguf_context_ptr ret(gguf_init_empty());
+    llama_model_saver ms(arch, ret.get());
+    const uint32_t n_ctx = 128;
+
+    uint32_t n_vocab = 128;
+    uint32_t n_embd  = 256;
+    uint32_t n_head  = 2;
+    uint32_t n_ff    = 384;
+    uint32_t n_layer = 2;
+    if (arch == LLM_ARCH_LLAMA4) {
+        n_layer = 4; // hparams.n_no_rope_layer_step is hard-coded to 4
+    } else if (arch == LLM_ARCH_GEMMA4) {
+        n_embd = 128;
+        n_head = 2;
+        n_ff   = 192;
+        n_layer = 5; // need at least 5 for swa_pattern (every 5th is full_attention)
+    } else if (arch == LLM_ARCH_GEMMA3N) {
+        n_embd = 64;
+        n_head = 1;
+        n_ff   = 96;
+        n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
+    } else if (arch == LLM_ARCH_DEEPSEEK2
+            || arch == LLM_ARCH_GLM_DSA
+            || arch == LLM_ARCH_KIMI_LINEAR
+            || arch == LLM_ARCH_MISTRAL4) {
+        n_embd = 128;
+        n_head = 1;
+        n_ff   = 192;
+    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
+        n_layer = 3;
+    } else if (arch == LLM_ARCH_CHAMELEON) {
+        n_vocab = 10240;
+    }
+
+    const uint32_t n_embd_head = n_embd / n_head;
+
+    ms.add_kv(LLM_KV_GENERAL_ARCHITECTURE,      llm_arch_name(arch));
+    ms.add_kv(LLM_KV_VOCAB_SIZE,                n_vocab);
+    ms.add_kv(LLM_KV_CONTEXT_LENGTH,            n_ctx);
+    ms.add_kv(LLM_KV_EMBEDDING_LENGTH,          n_embd);
+    ms.add_kv(LLM_KV_FEATURES_LENGTH,           n_embd);
+    ms.add_kv(LLM_KV_BLOCK_COUNT,               n_layer);
+    ms.add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, uint32_t(1));
+
+    if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
+        std::vector<uint32_t> n_ff_per_layer;
+        n_ff_per_layer.reserve(n_layer);
+        for (uint32_t il = 0; il < n_layer; il++) {
+            n_ff_per_layer.push_back(il <= 1 ? 0 : n_ff);
+        }
+        ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff_per_layer);
+    } else {
+        ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+    }
+
+    ms.add_kv(LLM_KV_USE_PARALLEL_RESIDUAL,   false);
+    ms.add_kv(LLM_KV_LOGIT_SCALE,             1.0f);
+    ms.add_kv(LLM_KV_TIME_MIX_EXTRA_DIM,      uint32_t(64));
+    ms.add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM,    uint32_t(128));
+    ms.add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, uint32_t(2));
+
+    if (arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_JAMBA || arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE ||
+            arch == LLM_ARCH_GRANITE_HYBRID || arch == LLM_ARCH_LFM2 || arch == LLM_ARCH_LFM2MOE || arch == LLM_ARCH_KIMI_LINEAR) {
+        GGML_ASSERT(n_layer >= 2);
+        std::vector<uint32_t> n_head_per_layer;
+        n_head_per_layer.reserve(n_layer);
+        for (uint32_t il = 0; il < n_layer; il++) {
+            n_head_per_layer.push_back(il == 1 ? 0 : n_head);
+        }
+        ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head_per_layer);
+        ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_per_layer);
+    } else {
+        ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+        ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head);
+    }
+
+    ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f);
+    if (arch == LLM_ARCH_DEEPSEEK2
+            || arch == LLM_ARCH_GLM_DSA
+            || arch == LLM_ARCH_KIMI_LINEAR
+            || arch == LLM_ARCH_MISTRAL4) {
+        ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH,       uint32_t(576));
+        ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH,     uint32_t(512));
+        ms.add_kv(LLM_KV_ROPE_DIMENSION_COUNT,       uint32_t(64));
+        ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   uint32_t(192));
+        ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, uint32_t(128));
+    }
+    ms.add_kv(LLM_KV_ATTENTION_CLAMP_KQV,              1.0f);
+    ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS,          1e-5f);
+    ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      1e-5f);
+    ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS,          1e-5f);
+    ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS,       uint32_t(8));
+    ms.add_kv(LLM_KV_ATTENTION_Q_LORA_RANK,            uint32_t(512));
+    ms.add_kv(LLM_KV_ATTENTION_KV_LORA_RANK,           uint32_t(512));
+    ms.add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, uint32_t(8));
+    ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW,         n_ctx/8);
+
+    if (arch == LLM_ARCH_GEMMA4) {
+        ms.add_kv(LLM_KV_EMBEDDING_LENGTH_PER_LAYER,      n_embd/2);
+        ms.add_kv(LLM_KV_ATTENTION_SHARED_KV_LAYERS,      uint32_t(0));
+        ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA,        n_embd_head);
+        ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA,      n_embd_head);
+        ms.add_kv(LLM_KV_ROPE_FREQ_BASE_SWA,              10000.0f);
+        // SWA pattern: every 5th layer is full attention (matches E2B layer_types)
+        ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(5));
+    } else if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
+        std::vector<uint32_t> pattern;
+        pattern.reserve(n_layer);
+        for (uint32_t il = 0; il < n_layer; il++) {
+            pattern.push_back(il % 2);
+        }
+        ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, pattern);
+    } else {
+        ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(2));
+    }
+
+    ms.add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, uint32_t(1));
+    ms.add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, uint32_t(64));
+    ms.add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K,      uint32_t(8));
+    ms.add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS, std::vector<uint32_t>({n_embd_head/4, n_embd_head/4, n_embd_head/4, n_embd_head/4}));
+    ms.add_kv(LLM_KV_TOKENIZER_MODEL,         "no_vocab");
+    // ms.add_kv(LLM_KV_DENSE_2_FEAT_OUT,     n_embd);
+    // ms.add_kv(LLM_KV_DENSE_3_FEAT_IN,      n_embd);
+
+    if (moe) {
+        ms.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff);
+        ms.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,  uint32_t(2));
+        ms.add_kv(LLM_KV_EXPERT_COUNT,               uint32_t(2));
+        ms.add_kv(LLM_KV_EXPERT_USED_COUNT,          uint32_t(1));
+        ms.add_kv(LLM_KV_EXPERT_SHARED_COUNT,        uint32_t(1));
+        ms.add_kv(LLM_KV_EXPERT_GATING_FUNC,         uint32_t(2)); // sigmoid
+        ms.add_kv(LLM_KV_EXPERT_GROUP_SCALE,         1.0f);
+        ms.add_kv(LLM_KV_EXPERTS_PER_GROUP,          uint32_t(1));
+    }
+
+    ms.add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH,   n_embd);
+    ms.add_kv(LLM_KV_POSNET_BLOCK_COUNT,        n_layer);
+    ms.add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, n_embd);
+    ms.add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT,      n_layer);
+    ms.add_kv(LLM_KV_XIELU_ALPHA_N,             1.0f);
+    ms.add_kv(LLM_KV_XIELU_ALPHA_P,             1.0f);
+    ms.add_kv(LLM_KV_XIELU_BETA,                1.0f);
+    ms.add_kv(LLM_KV_XIELU_EPS,                 1.0e-7f);
+    ms.add_kv(LLM_KV_SSM_INNER_SIZE,            arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 256 : 2*n_embd);
+    ms.add_kv(LLM_KV_SSM_CONV_KERNEL,           uint32_t(4));
+    ms.add_kv(LLM_KV_SSM_STATE_SIZE,            uint32_t(128));
+    ms.add_kv(LLM_KV_SSM_TIME_STEP_RANK,        n_head);
+    ms.add_kv(LLM_KV_SSM_GROUP_COUNT,           arch == LLM_ARCH_PLAMO2 ? 0 : uint32_t(2));
+    ms.add_kv(LLM_KV_KDA_HEAD_DIM,              uint32_t(128));
+    ms.add_kv(LLM_KV_WKV_HEAD_SIZE,             n_embd/n_head);
+    ms.add_kv(LLM_KV_SHORTCONV_L_CACHE,         uint32_t(3));
+
+    for (uint32_t il = 0; il < n_layer; il++) {
+        ggml_tensor t;
+        memset(&t, 0, sizeof(ggml_tensor));
+        t.type = GGML_TYPE_F16;
+        ggml_format_name(&t, "conv%" PRIu32 "d.weight", il);
+        gguf_add_tensor(ms.gguf_ctx, &t);
+        ggml_format_name(&t, "posnet.%" PRIu32 ".conv1.weight", il);
+        gguf_add_tensor(ms.gguf_ctx, &t);
+        ggml_format_name(&t, "posnet.%" PRIu32 ".conv2.weight", il);
+        gguf_add_tensor(ms.gguf_ctx, &t);
+        ggml_format_name(&t, "convnext.%" PRIu32 ".dw.weight", il);
+        gguf_add_tensor(ms.gguf_ctx, &t);
+    }
+    return ret;
+}
+
+static bool silent_model_load_progress(float /*progress*/, void * /*user_data*/) {
+    return true;
+}
+
+static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
+        struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector<ggml_backend_dev_t> & devs,
+        const llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER, bool encode = false) {
+    GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr));
+    llama_model_params model_params = llama_model_default_params();
+    model_params.progress_callback = silent_model_load_progress;
+    std::vector<ggml_backend_dev_t> devs_copy = devs;
+    devs_copy.push_back(nullptr);
+    model_params.devices = devs_copy.data();
+    model_params.split_mode = split_mode;
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 0;
+    ctx_params.n_threads = 4;
+    ctx_params.n_threads_batch = 4;
+    if (!encode) {
+        ctx_params.n_ubatch = 64;
+    }
+
+    size_t tmp = seed;
+    llama_model_ptr model(gguf_ctx != nullptr ?
+        llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params) :
+        llama_model_load_from_file_ptr(file, model_params));
+    if (!model) {
+        throw std::runtime_error("failed to create llama model");
+    }
+    llama_context_ptr lctx(llama_init_from_model(model.get(), ctx_params));
+    if (!lctx) {
+        throw std::runtime_error("failed to create llama context");
+    }
+    return std::make_pair(std::move(model), std::move(lctx));
+}
+
+static std::vector<float> get_logits(
+        llama_model * model, llama_context * lctx, const std::vector<llama_token> & tokens, bool encode = false) {
+    const uint32_t n_vocab  = llama_vocab_n_tokens(llama_model_get_vocab(model));
+    const uint32_t n_ctx    = llama_n_ctx(lctx);
+    const uint32_t n_tokens = tokens.size();
+    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
+    GGML_ASSERT(n_tokens <= n_ctx);
+    for (uint32_t pos = 0; pos < n_tokens; pos++) {
+        common_batch_add(batch, tokens[pos], pos, {0}, true);
+    }
+    batch.n_tokens = n_tokens;
+    if (encode) {
+        if (llama_encode(lctx, batch)) {
+            llama_batch_free(batch);
+            throw std::runtime_error("failed to encode batch");
+        }
+    }
+    if (llama_decode(lctx, batch)) {
+        llama_batch_free(batch);
+        throw std::runtime_error("failed to decode batch");
+    }
+
+    std::vector<float> ret;
+    ret.reserve(n_tokens*n_vocab);
+    for (uint32_t i = 0; i < n_tokens; i++) {
+        const float * logits_ith = llama_get_logits_ith(lctx, i);
+        for (uint32_t j = 0; j < n_vocab; j++) {
+            ret.push_back(logits_ith[j]);
+        }
+    }
+    llama_batch_free(batch);
+    return ret;
+}
+
+static bool moe_mandatory(const llm_arch arch) {
+    switch (arch) {
+        case LLM_ARCH_LLAMA4:
+        case LLM_ARCH_GROK:
+        case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_QWEN3MOE:
+        case LLM_ARCH_QWEN3NEXT:
+        case LLM_ARCH_QWEN3VLMOE:
+        case LLM_ARCH_QWEN35MOE:
+        case LLM_ARCH_PHIMOE:
+        case LLM_ARCH_DBRX:
+        case LLM_ARCH_OLMOE:
+        case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
+        case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_GLM4_MOE:
+        case LLM_ARCH_GLM_DSA:
+        case LLM_ARCH_EXAONE_MOE:
+        case LLM_ARCH_BAILINGMOE:
+        case LLM_ARCH_BAILINGMOE2:
+        case LLM_ARCH_DOTS1:
+        case LLM_ARCH_AFMOE:
+        case LLM_ARCH_ERNIE4_5:
+        case LLM_ARCH_ERNIE4_5_MOE:
+        case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_OPENAI_MOE:
+        case LLM_ARCH_LFM2MOE:
+        case LLM_ARCH_SMALLTHINKER:
+        case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_GROVEMOE:
+        case LLM_ARCH_MINIMAX_M2:
+        case LLM_ARCH_RND1:
+        case LLM_ARCH_PADDLEOCR:
+        case LLM_ARCH_MIMO2:
+        case LLM_ARCH_KIMI_LINEAR:
+        case LLM_ARCH_STEP35:
+        case LLM_ARCH_MISTRAL4:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static bool moe_implemented(const llm_arch arch) {
+    if (moe_mandatory(arch)) {
+        return true;
+    }
+    switch (arch) {
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_REFACT:
+        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_LLAMA_EMBED:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static bool arch_supported(const llm_arch arch) {
+    if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
+        return false; // These models don't have usable implementations.
+    }
+    if (arch == LLM_ARCH_CHAMELEON) {
+        return false; // Only half-implemented and to be removed in the future.
+    }
+    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        return false; // FIXME CUDA backend crashes.
+    }
+    if (arch == LLM_ARCH_GEMMA4) {
+        return false; // FIXME @ngxson
+    }
+    if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
+        return false; // FIXME Embedding (?) models produce inconsistent results.
+    }
+    if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
+        return false; // FIXME RWKV models hang indefinitely.
+    }
+    if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
+            arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
+        return false; // TODO vocab
+    }
+    if (arch == LLM_ARCH_PLM) {
+        return false; // TODO tensor shapes
+    }
+    if (arch == LLM_ARCH_DEEPSEEK2OCR) {
+        return false;
+    }
+
+    // FIXME some models are segfaulting with WebGPU:
+#ifdef GGML_USE_WEBGPU
+    if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
+        return false;
+    }
+#endif // GGML_USE_WEBGPU
+
+    return true;
+}
+
+static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
+    struct user_data_t {
+        struct {
+            ggml_log_callback callback;
+            void * user_data;
+        } original_logger;
+        ggml_log_level min_level; // prints below this log level go to debug log
+    };
+    user_data_t ud;
+    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
+    ud.min_level = log_level;
+
+    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
+        const user_data_t * ud = (const user_data_t *) user_data;
+        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
+        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
+    }, &ud);
+
+    for (const llm_arch & arch : llm_arch_all()) {
+        if (arch == LLM_ARCH_UNKNOWN) {
+            continue;
+        }
+        if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
+            continue;
+        }
+        if (arch == LLM_ARCH_GEMMA4) {
+            continue; // FIXME: ISWA KV cache initialization needs more fixture params
+        }
+        for (bool moe : {false, true}) {
+            if (moe && !moe_implemented(arch)) {
+                continue;
+            }
+            if (!moe && moe_mandatory(arch)) {
+                continue;
+            }
+            if (!llama_model_saver_supports_arch(arch)) {
+                LOG_INF("%s: %s model (%s) is unsupported, skipping\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense");
+                continue;
+            }
+            gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
+            auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {});
+            const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf");
+            LOG_INF("%s: Saving %s model (%s) to %s...\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense", path.c_str());
+            llama_model_save_to_file(model_and_ctx.first.get(), path.c_str());
+        }
+    }
+    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+    return 0;
+}
+
+static int test_backends(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level) {
+    struct user_data_t {
+        struct {
+            ggml_log_callback callback;
+            void * user_data;
+        } original_logger;
+        ggml_log_level min_level; // prints below this log level go to debug log
+    };
+    user_data_t ud;
+    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
+    ud.min_level = log_level;
+
+    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
+        const user_data_t * ud = (const user_data_t *) user_data;
+        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
+        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
+    }, &ud);
+
+    const std::vector<llama_token> tokens = get_tokens(128, 128, seed);
+
+    struct device_config {
+        std::vector<ggml_backend_dev_t> devs;
+        std::string                     label;
+        llama_split_mode                split_mode;
+
+        device_config(std::vector<ggml_backend_dev_t> devs, std::string name, llama_split_mode split_mode)
+            : devs(std::move(devs)), label(std::move(name)), split_mode(split_mode) {}
+    };
+
+    std::vector<device_config> dev_configs;
+    {
+        std::vector<ggml_backend_dev_t> devices_meta;
+        {
+            const size_t device_count = ggml_backend_dev_count();
+            for (size_t i = 0; i < device_count; i++) {
+                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+                dev_configs.emplace_back(std::vector<ggml_backend_dev_t>{dev}, ggml_backend_dev_description(dev), LLAMA_SPLIT_MODE_LAYER);
+
+                // cpu-based devices cannot be used in tensor split mode
+                if (ggml_backend_dev_buffer_type(dev) != ggml_backend_cpu_buffer_type()) {
+                    devices_meta.push_back(dev);
+                }
+            }
+        }
+
+        dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR);
+    }
+
+    bool all_ok = true;
+    common_log_flush(common_log_main());
+    printf("|%16s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip");
+    printf("|----------------|------------------------------|------|---------------|---------|\n");
+    for (const llm_arch & arch : llm_arch_all()) {
+        if (arch == LLM_ARCH_UNKNOWN) {
+            continue;
+        }
+        if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
+            continue;
+        }
+        if (arch == LLM_ARCH_GEMMA4) {
+            continue; // FIXME: ISWA KV cache initialization needs more fixture params
+        }
+
+        const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
+        for (bool moe : {false, true}) {
+            if (moe && !moe_implemented(arch)) {
+                continue;
+            }
+            if (!moe && moe_mandatory(arch)) {
+                continue;
+            }
+            const std::string config_name = moe ? "MoE" : "Dense";
+            gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
+            std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_cpu;
+            std::vector<float> logits_cpu;
+            for (device_config & dc : dev_configs) {
+                std::pair<llama_model_ptr, llama_context_ptr> model_and_ctx_dev;
+                std::vector<float> logits_dev;
+                std::string status_nmse      = "\033[1;33mSKIP\033[0m";
+                std::string status_roundtrip = "\033[1;33mSKIP\033[0m";
+                char nmse_str[12] = {0};
+                bool skip = !arch_supported(arch) || (dc.split_mode == LLAMA_SPLIT_MODE_TENSOR && dc.devs.empty());
+#if defined(GGML_USE_WEBGPU)
+                skip = true; // FIXME
+#endif // GGML_USE_WEBGPU
+                if (!skip) {
+                    if (logits_cpu.empty()) {
+                        model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}, LLAMA_SPLIT_MODE_LAYER, encode);
+                        logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
+                    }
+                    if (dc.split_mode != LLAMA_SPLIT_MODE_TENSOR || llm_arch_supports_sm_tensor(arch)) {
+                        model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, dc.devs, dc.split_mode, encode);
+                        logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
+                        const double nmse_val = nmse(logits_cpu, logits_dev);
+                        snprintf(nmse_str, sizeof(nmse_str), "(%.2e)", nmse_val);
+                        status_nmse = "\033[1;32mOK\033[0m";
+                        if (nmse_val > 1e-4) {
+                            all_ok = false;
+                            status_nmse = "\033[1;31mFAIL\033[0m";
+                        }
+                    }
+
+                    FILE * file = tmpfile(); // Can be null on Windows without administrator privileges.
+                    // FIXME: when adding a tensor to a gguf_context a copy is made, this changes the pointer which the meta backend
+                    //     in turn uses to map the tensors to their simple equivalents - this is fundamentally incompatible
+                    if (file != nullptr && llama_model_saver_supports_arch(arch) && dc.split_mode != LLAMA_SPLIT_MODE_TENSOR) {
+                        GGML_ASSERT(model_and_ctx_dev.first && model_and_ctx_dev.second);
+                        llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get());
+                        ms.add_kv_from_model();
+                        ms.add_tensors_from_model();
+                        ms.save(file);
+                        rewind(file);
+
+                        auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, dc.devs, dc.split_mode, encode);
+                        const std::vector<float> logits_roundtrip = get_logits(
+                            model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode);
+                        status_roundtrip = "\033[1;32mOK\033[0m";
+                        GGML_ASSERT(logits_roundtrip.size() == logits_dev.size());
+                        for (size_t i = 0; i < logits_roundtrip.size(); i++) {
+                            if (logits_roundtrip[i] != logits_dev[i]) {
+                                all_ok = false;
+                                status_roundtrip = "\033[1;31mFAIL\033[0m";
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                printf("|%16s|%30s|%6s|%15s %10s|%20s|\n", llm_arch_name(arch), dc.label.c_str(),
+                    config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str());
+            }
+        }
+    }
+    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+    return all_ok ? 0 : 1;
+}
+
+int main(int argc, char ** argv) {
+    // FIXME these tests are disabled in the CI for macOS-latest-cmake-arm64 because they are segfaulting
+    common_init();
+    std::random_device rd;
+
+    llm_arch arch = LLM_ARCH_UNKNOWN;
+    size_t seed = rd();
+    ggml_log_level log_level = GGML_LOG_LEVEL_ERROR;
+    std::string out;
+
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "--arch") == 0) {
+            if (i + 1 < argc) {
+                const std::string arch_name = argv[++i];
+                arch = llm_arch_from_string(arch_name);
+                if (arch == LLM_ARCH_UNKNOWN) {
+                    LOG_ERR("%s: unkown LLM architecture: %s\n", __func__, arch_name.c_str());
+                    return 1;
+                }
+            } else {
+                usage(argv);
+                return 1;
+            }
+        }
+        if (strcmp(argv[i], "-s") == 0 || strcmp(argv[i], "--seed") == 0) {
+            if (i + 1 < argc) {
+                seed = std::stoull(argv[++i]);
+            } else {
+                usage(argv);
+                return 1;
+            }
+        }
+        if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
+            log_level = GGML_LOG_LEVEL_INFO;
+            continue;
+        }
+        if (strcmp(argv[i], "-o") == 0 || strcmp(argv[i], "--out") == 0) {
+            if (i + 1 < argc) {
+                out = argv[++i];
+            } else {
+                usage(argv);
+                return 1;
+            }
+        }
+    }
+    printf("%s: using seed %zu\n", __func__, seed);
+
+    try {
+        if (!out.empty()) {
+            return save_models(arch, seed, log_level, out);
+        }
+        return test_backends(arch, seed, log_level);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "encountered runtime error: %s\n", err.what());
+        return -1;
+    }
+}
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -0,0 +1,406 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include "llama.h"
+
+#include "../src/llama-grammar.h"
+
+#include <cassert>
+#include <stdexcept>
+
+int main()
+{
+    llama_grammar_parser parsed_grammar;
+
+    std::vector<std::pair<std::string, uint32_t>> expected = {
+        {"expr", 2},
+        {"expr_6", 6},
+        {"expr_7", 7},
+        {"ident", 8},
+        {"ident_10", 10},
+        {"num", 9},
+        {"num_11", 11},
+        {"root", 0},
+        {"root_1", 1},
+        {"root_5", 5},
+        {"term", 4},
+        {"ws", 3},
+        {"ws_12", 12},
+    };
+
+    std::vector<std::vector<llama_grammar_element>> expected_rules = {
+        {{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_RULE_REF, 2},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_RULE_REF, 4},
+            {LLAMA_GRETYPE_CHAR, 10},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}},
+        {{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_RULE_REF, 8},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_RULE_REF, 9},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_CHAR, 40},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_RULE_REF, 2},
+            {LLAMA_GRETYPE_CHAR, 41},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_CHAR, 45},
+            {LLAMA_GRETYPE_CHAR_ALT, 43},
+            {LLAMA_GRETYPE_CHAR_ALT, 42},
+            {LLAMA_GRETYPE_CHAR_ALT, 47},
+            {LLAMA_GRETYPE_RULE_REF, 4},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_CHAR, 97},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
+            {LLAMA_GRETYPE_RULE_REF, 10},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}},
+        {
+            {LLAMA_GRETYPE_CHAR, 97},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
+            {LLAMA_GRETYPE_CHAR_ALT, 48},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+            {LLAMA_GRETYPE_CHAR_ALT, 95},
+            {LLAMA_GRETYPE_RULE_REF, 10},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 48},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+            {LLAMA_GRETYPE_RULE_REF, 11},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_CHAR, 48},
+            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+            {LLAMA_GRETYPE_END, 0},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 32},
+            {LLAMA_GRETYPE_CHAR_ALT, 9},
+            {LLAMA_GRETYPE_CHAR_ALT, 10},
+            {LLAMA_GRETYPE_RULE_REF, 12},
+            {LLAMA_GRETYPE_ALT, 0},
+            {LLAMA_GRETYPE_END, 0},
+        },
+    };
+
+    for (auto pair : expected)
+    {
+        parsed_grammar.symbol_ids[pair.first] = pair.second;
+    }
+
+    for (auto rule : expected_rules)
+    {
+        parsed_grammar.rules.emplace_back();
+        for (auto element : rule)
+        {
+            parsed_grammar.rules.back().push_back(element);
+        }
+    }
+
+    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+
+    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    if (grammar == nullptr) {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
+
+    std::vector<std::vector<llama_grammar_element>> expected_stacks = {
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_CHAR, 40},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_CHAR, 97},
+        },
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_CHAR, 40},
+        },
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_CHAR, 97},
+        }};
+
+    auto index = 0;
+    for (const llama_grammar_stack & stack : llama_grammar_get_stacks(grammar))
+    {
+        // compare stack to expected_stack
+        for (uint32_t i = 0; i < stack.size(); i++)
+        {
+            const llama_grammar_element * element = stack[i];
+            const llama_grammar_element & expected_element = expected_stacks[index][i];
+
+            // pretty print error message before asserting
+            if (expected_element.type != element->type || expected_element.value != element->value)
+            {
+                fprintf(stderr, "index: %d\n", index);
+                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
+                fprintf(stderr, "actual_element: %d, %u\n", element->type, element->value);
+                fprintf(stderr, "expected_element != actual_element\n");
+            }
+
+            assert(expected_element.type == element->type && expected_element.value == element->value);
+        }
+        index++;
+    }
+
+    std::vector<llama_grammar_candidate> next_candidates;
+    next_candidates.resize(23);
+
+    for (size_t i = 0; i < 23; ++i)
+    {
+        uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
+        cp[0] = 37 + i;
+        cp[1] = 0;
+        next_candidates[i] = {i, cp, {}, 0};
+    }
+
+    std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {11, 48},
+            {12, 49},
+            {13, 50},
+            {14, 51},
+            {15, 52},
+            {16, 53},
+            {17, 54},
+            {18, 55},
+            {19, 56},
+            {20, 57},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {11, 48},
+            {12, 49},
+            {13, 50},
+            {14, 51},
+            {15, 52},
+            {16, 53},
+            {17, 54},
+            {18, 55},
+            {19, 56},
+            {20, 57},
+            {21, 58},
+            {22, 59},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {11, 48},
+            {12, 49},
+            {13, 50},
+            {14, 51},
+            {15, 52},
+            {16, 53},
+            {17, 54},
+            {18, 55},
+            {19, 56},
+            {20, 57},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {21, 58},
+            {22, 59},
+            {23, 60},
+        },
+        {
+            {0, 37},
+            {1, 38},
+            {2, 39},
+            {3, 40},
+            {4, 41},
+            {5, 42},
+            {6, 43},
+            {7, 44},
+            {8, 45},
+            {9, 46},
+            {10, 47},
+            {11, 48},
+            {12, 49},
+            {13, 50},
+            {14, 51},
+            {15, 52},
+            {16, 53},
+            {17, 54},
+            {18, 55},
+            {19, 56},
+            {20, 57},
+            {21, 58},
+            {22, 59},
+        },
+    };
+
+    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
+
+    std::vector<std::vector<llama_grammar_candidate>> all_rejects;
+
+    for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
+    {
+        rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
+        all_rejects.push_back(rejects);
+    }
+
+    index = 0;
+    for (auto rej : all_rejects)
+    {
+        for (uint32_t i = 0; i < rej.size(); i++)
+        {
+            auto element = rej[i];
+            auto expected_element = expected_reject[index][i];
+            assert(element.index == expected_element.first && *element.code_points == expected_element.second);
+        }
+        index++;
+    }
+
+    for (auto &candidate : next_candidates)
+    {
+        delete[] candidate.code_points;
+        candidate.code_points = nullptr;
+    }
+
+    llama_grammar_free_impl(grammar);
+
+    return 0;
+}
--- a/tests/test-log.cpp
+++ b/tests/test-log.cpp
@@ -0,0 +1,43 @@
+#include "log.h"
+
+#include <cstdlib>
+#include <thread>
+
+int main() {
+    const int n_thread = 8;
+
+    std::thread threads[n_thread];
+    for (int i = 0; i < n_thread; i++) {
+        threads[i] = std::thread([i]() {
+            const int n_msg = 1000;
+
+            for (int j = 0; j < n_msg; j++) {
+                const int log_type = std::rand() % 4;
+
+                switch (log_type) {
+                    case 0: LOG_INF("Thread %d: %d\n", i, j); break;
+                    case 1: LOG_WRN("Thread %d: %d\n", i, j); break;
+                    case 2: LOG_ERR("Thread %d: %d\n", i, j); break;
+                    case 3: LOG_DBG("Thread %d: %d\n", i, j); break;
+                    default:
+                        break;
+                }
+
+                if (rand () % 10 < 5) {
+                    common_log_set_timestamps(common_log_main(), rand() % 2);
+                    common_log_set_prefix    (common_log_main(), rand() % 2);
+                }
+            }
+        });
+    }
+
+    for (int i = 0; i < n_thread; i++) {
+        threads[i].join();
+    }
+
+    common_log_flush(common_log_main());
+    // We explicitly free the logger singleton to avoid hanging on Windows
+    // related to timing issues of thread startup and DLL teardown
+    common_log_free(common_log_main());
+    return 0;
+}
--- a/tests/test-lora-conversion-inference.sh
+++ b/tests/test-lora-conversion-inference.sh
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+set -e
+
+# Array of models to iterate over
+declare -a params=(
+    "Gemma2ForCausalLM 64"
+    "LlamaForCausalLM 64"
+    "Phi3ForCausalLM 64"
+)
+
+MODELS_REPO=lora-tests
+MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO
+COMMIT=c26d5fb85b4070a9e9c4e65d132c783b98086890
+
+# Clone the Hugging Face repository if the directory does not exist
+if [ ! -d "$MODELS_REPO" ]; then
+    echo "Cloning the Hugging Face repository..."
+    git clone $MODELS_REPO_URL --depth 1
+    cd $MODELS_REPO
+    git fetch --depth=1 origin $COMMIT
+    git reset --hard $COMMIT
+    cd -
+else
+    echo "Repository already exists. Skipping clone."
+fi
+
+# Array to store results to print
+results=()
+
+trim_leading_whitespace() {
+    local input_string="$1"
+    echo "${input_string#"${input_string%%[![:space:]]*}"}"
+}
+
+extract_starting_substring() {
+    local reference_string="$1"
+    local target_string="$2"
+
+    local target_length=${#target_string}
+    echo "${reference_string:0:$target_length}"
+}
+
+get_first_word() {
+    local input_string="$1"
+    read -r first_word _ <<< "$input_string"
+    echo "$first_word"
+}
+
+# Load the expected strings
+EXPECTED_BASE_FULL=$(cat $MODELS_REPO/data/pale_blue_dot.txt)
+EXPECTED_LORA_FULL=$(cat $MODELS_REPO/data/bohemian_rhapsody.txt)
+EXPECTED_BASE_FIRST_WORD=$(get_first_word "$EXPECTED_BASE_FULL")
+EXPECTED_LORA_FIRST_WORD=$(get_first_word "$EXPECTED_LORA_FULL")
+
+run_conversion_and_inference_lora() {
+    local model_name=$1
+    local hidden_size=$2
+
+    echo -e "\n\n-------- RUNNING TEST FOR MODEL $model_name --------\n\n"
+
+    # Convert safetensors to gguf
+    echo "Running convert_hf_to_gguf.py for $model_name with hidden_size $hidden_size..."
+    python convert_hf_to_gguf.py $MODELS_REPO/$model_name/hidden_size=$hidden_size/base \
+        --outfile $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+        --outtype f32
+
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running convert_lora_to_gguf.py for $model_name with hidden_size $hidden_size..."
+    python3 convert_lora_to_gguf.py $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora \
+        --base $MODELS_REPO/$model_name/hidden_size=$hidden_size/base \
+        --outtype f32
+
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running llama-export-lora with lora for $model_name with hidden_size $hidden_size..."
+    ./llama-export-lora \
+        -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+        -o $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
+        --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf
+
+    # Run inference
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running llama-completion without lora for $model_name with hidden_size $hidden_size..."
+    OUTPUT_BASE=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+        -p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0)
+
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running llama-completion with hot lora for $model_name with hidden_size $hidden_size..."
+    OUTPUT_LORA_HOT=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
+        --lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf \
+        -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)
+
+    echo -e "\n\n---------------------------\n\n"
+    echo "Running llama-completion with merged lora for $model_name with hidden_size $hidden_size..."
+    OUTPUT_LORA_MERGED=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
+        -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)
+
+    # Remove any initial white space
+    OUTPUT_BASE=$(trim_leading_whitespace "$OUTPUT_BASE")
+    OUTPUT_LORA_HOT=$(trim_leading_whitespace "$OUTPUT_LORA_HOT")
+    OUTPUT_LORA_MERGED=$(trim_leading_whitespace "$OUTPUT_LORA_MERGED")
+    # Extract the corresponding substring from full string
+    EXPECTED_BASE=$(extract_starting_substring "$EXPECTED_BASE_FULL" "$OUTPUT_BASE")
+    EXPECTED_LORA=$(extract_starting_substring "$EXPECTED_LORA_FULL" "$OUTPUT_LORA_HOT")
+
+    # Assert output equals the expected output
+    if [[ "$OUTPUT_BASE" != "$EXPECTED_BASE" ]]; then
+        echo "Error: $model_name OUTPUT_BASE does not start with the expected string."
+        echo -e "Out=$OUTPUT_BASE\n\nExp=$EXPECTED_BASE"
+        exit 1
+    fi
+    if [[ "$OUTPUT_LORA_HOT" != "$EXPECTED_LORA" ]]; then
+        echo "Error: $model_name OUTPUT_LORA_HOT does not start with the expected string."
+        echo -e "Out=$OUTPUT_LORA_HOT\n\nExp=$EXPECTED_LORA"
+        exit 1
+    fi
+    if [[ "$OUTPUT_LORA_MERGED" != "$EXPECTED_LORA" ]]; then
+        echo "Error: $model_name OUTPUT_LORA_MERGED does not start with the expected string."
+        echo -e "Out=$OUTPUT_LORA_MERGED\n\nExp=$EXPECTED_LORA"
+        exit 1
+    fi
+
+    # Store the results
+    results+=("
+    \n\033[1mResults for $model_name with hidden_size $hidden_size:\033[0m
+    \n\033[32m  • Base:\n$OUTPUT_BASE
+    \n\033[34m  • Lora hot:\n$OUTPUT_LORA_HOT
+    \n\033[36m  • Lora merged:\n$OUTPUT_LORA_MERGED
+    \n \033[0m
+    ")
+
+    echo "All tests passed for $model_name with hidden_size $hidden_size!"
+}
+
+# Run test for each model
+for param in "${params[@]}"; do
+    run_conversion_and_inference_lora $param
+done
+
+# Print results
+echo -e "\n\n---------------------------\n\n"
+echo -e "\n\033[1mSummary of All Results:\033[0m"
+for result in "${results[@]}"; do
+    echo -e "$result"
+done
--- a/tests/test-model-load-cancel.cpp
+++ b/tests/test-model-load-cancel.cpp
@@ -0,0 +1,27 @@
+#include "llama.h"
+#include "get-model.h"
+
+#include <cstdlib>
+
+int main(int argc, char *argv[] ) {
+    auto * model_path = get_model_or_exit(argc, argv);
+    auto * file = fopen(model_path, "r");
+    if (file == nullptr) {
+        fprintf(stderr, "no model at '%s' found\n", model_path);
+        return EXIT_FAILURE;
+    }
+
+    fprintf(stderr, "using '%s'\n", model_path);
+    fclose(file);
+
+    llama_backend_init();
+    auto params = llama_model_params{};
+    params.use_mmap = false;
+    params.progress_callback = [](float progress, void * ctx){
+        (void) ctx;
+        return progress > 0.50;
+    };
+    auto * model = llama_model_load_from_file(model_path, params);
+    llama_backend_free();
+    return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
+}
--- a/tests/test-mtmd-c-api.c
+++ b/tests/test-mtmd-c-api.c
@@ -0,0 +1,65 @@
+#include <stdio.h>
+#include <assert.h>
+
+#include "mtmd.h"
+
+int main(void) {
+    printf("\n\nTesting libmtmd C API...\n");
+    printf("--------\n\n");
+
+    struct mtmd_context_params params = mtmd_context_params_default();
+    printf("Default image marker: %s\n", params.image_marker);
+
+    mtmd_input_chunks * chunks = mtmd_test_create_input_chunks();
+
+    if (!chunks) {
+        fprintf(stderr, "Failed to create input chunks\n");
+        return 1;
+    }
+
+    size_t n_chunks = mtmd_input_chunks_size(chunks);
+    printf("Number of chunks: %zu\n", n_chunks);
+    assert(n_chunks > 0);
+
+    for (size_t i = 0; i < n_chunks; i++) {
+        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
+        assert(chunk != NULL);
+        enum mtmd_input_chunk_type type = mtmd_input_chunk_get_type(chunk);
+        printf("Chunk %zu type: %d\n", i, type);
+
+        if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            size_t n_tokens;
+            const llama_token * tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+            printf("    Text chunk with %zu tokens\n", n_tokens);
+            assert(tokens != NULL);
+            assert(n_tokens > 0);
+            for (size_t j = 0; j < n_tokens; j++) {
+                assert(tokens[j] >= 0);
+                printf("    > Token %zu: %d\n", j, tokens[j]);
+            }
+
+        } else if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
+            // get position of the last token, which should be (nx - 1, ny - 1)
+            struct mtmd_decoder_pos pos = mtmd_image_tokens_get_decoder_pos(image_tokens, 0, n_tokens - 1);
+            size_t nx = pos.x + 1;
+            size_t ny = pos.y + 1;
+            const char * id = mtmd_image_tokens_get_id(image_tokens);
+            assert(n_tokens > 0);
+            assert(nx > 0);
+            assert(ny > 0);
+            assert(id != NULL);
+            printf("    Image chunk with %zu tokens\n", n_tokens);
+            printf("    Image size: %zu x %zu\n", nx, ny);
+            printf("    Image ID: %s\n", id);
+        }
+    }
+
+    // Free the chunks
+    mtmd_input_chunks_free(chunks);
+
+    printf("\n\nDONE: test libmtmd C API...\n");
+
+    return 0;
+}
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
--- a/tests/test-peg-parser.cpp
+++ b/tests/test-peg-parser.cpp
@@ -0,0 +1,26 @@
+#include <cstdlib>
+#include <string>
+#include <iostream>
+
+#include "peg-parser/tests.h"
+
+int main(int argc, char *argv[]) {
+    testing t(std::cout);
+    if (argc >= 2) {
+        t.set_filter(argv[1]);
+    }
+
+    const char * verbose = getenv("LLAMA_TEST_VERBOSE");
+    if (verbose) {
+        t.verbose = std::string(verbose) == "1";
+    }
+
+    t.test("basic", test_basic);
+    t.test("unicode", test_unicode);
+    t.test("json", test_json_parser);
+    t.test("gbnf", test_gbnf_generation);
+    t.test("serialization", test_json_serialization);
+    t.test("python-dict", test_python_dict_parser);
+
+    return t.summary();
+}
--- a/tests/test-quant-type-selection.cpp
+++ b/tests/test-quant-type-selection.cpp
@@ -0,0 +1,520 @@
+#include "../src/llama-ext.h"
+#include "ggml-cpp.h"
+#include "gguf-model-data.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+// ---------------------------------------------------------------------------
+// ftype name <-> enum mapping
+// ---------------------------------------------------------------------------
+
+struct ftype_name_entry {
+    const char * name;
+    llama_ftype  ftype;
+};
+
+static const ftype_name_entry ftype_name_table[] = {
+    { "F32",       LLAMA_FTYPE_ALL_F32          },
+    { "F16",       LLAMA_FTYPE_MOSTLY_F16       },
+    { "BF16",      LLAMA_FTYPE_MOSTLY_BF16      },
+    { "Q4_0",      LLAMA_FTYPE_MOSTLY_Q4_0      },
+    { "Q4_1",      LLAMA_FTYPE_MOSTLY_Q4_1      },
+    { "Q5_0",      LLAMA_FTYPE_MOSTLY_Q5_0      },
+    { "Q5_1",      LLAMA_FTYPE_MOSTLY_Q5_1      },
+    { "Q8_0",      LLAMA_FTYPE_MOSTLY_Q8_0      },
+    { "Q2_K",      LLAMA_FTYPE_MOSTLY_Q2_K      },
+    { "Q2_K_S",    LLAMA_FTYPE_MOSTLY_Q2_K_S    },
+    { "Q3_K_S",    LLAMA_FTYPE_MOSTLY_Q3_K_S    },
+    { "Q3_K_M",    LLAMA_FTYPE_MOSTLY_Q3_K_M    },
+    { "Q3_K_L",    LLAMA_FTYPE_MOSTLY_Q3_K_L    },
+    { "Q4_K_S",    LLAMA_FTYPE_MOSTLY_Q4_K_S    },
+    { "Q4_K_M",    LLAMA_FTYPE_MOSTLY_Q4_K_M    },
+    { "Q5_K_S",    LLAMA_FTYPE_MOSTLY_Q5_K_S    },
+    { "Q5_K_M",    LLAMA_FTYPE_MOSTLY_Q5_K_M    },
+    { "Q6_K",      LLAMA_FTYPE_MOSTLY_Q6_K      },
+    { "IQ1_S",     LLAMA_FTYPE_MOSTLY_IQ1_S     },
+    { "IQ1_M",     LLAMA_FTYPE_MOSTLY_IQ1_M     },
+    { "IQ2_XXS",   LLAMA_FTYPE_MOSTLY_IQ2_XXS   },
+    { "IQ2_XS",    LLAMA_FTYPE_MOSTLY_IQ2_XS    },
+    { "IQ2_S",     LLAMA_FTYPE_MOSTLY_IQ2_S     },
+    { "IQ2_M",     LLAMA_FTYPE_MOSTLY_IQ2_M     },
+    { "IQ3_XXS",   LLAMA_FTYPE_MOSTLY_IQ3_XXS   },
+    { "IQ3_XS",    LLAMA_FTYPE_MOSTLY_IQ3_XS    },
+    { "IQ3_S",     LLAMA_FTYPE_MOSTLY_IQ3_S     },
+    { "IQ3_M",     LLAMA_FTYPE_MOSTLY_IQ3_M     },
+    { "IQ4_NL",    LLAMA_FTYPE_MOSTLY_IQ4_NL    },
+    { "IQ4_XS",    LLAMA_FTYPE_MOSTLY_IQ4_XS    },
+    { "TQ1_0",     LLAMA_FTYPE_MOSTLY_TQ1_0     },
+    { "TQ2_0",     LLAMA_FTYPE_MOSTLY_TQ2_0     },
+    { "MXFP4_MOE", LLAMA_FTYPE_MOSTLY_MXFP4_MOE },
+    { "NVFP4",     LLAMA_FTYPE_MOSTLY_NVFP4     },
+};
+
+static llama_ftype llama_ftype_from_name(const char * name) {
+    for (const auto & e : ftype_name_table) {
+        if (strcmp(name, e.name) == 0) {
+            return e.ftype;
+        }
+    }
+    return (llama_ftype) -1;
+}
+
+static const char * llama_ftype_to_name(llama_ftype ftype) {
+    for (const auto & e : ftype_name_table) {
+        if (e.ftype == ftype) {
+            return e.name;
+        }
+    }
+    return nullptr;
+}
+
+// ---------------------------------------------------------------------------
+// ggml_type name lookup
+// ---------------------------------------------------------------------------
+
+static ggml_type ggml_type_from_name(const std::string & name) {
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        const char * tname = ggml_type_name((ggml_type) i);
+        if (tname && name == tname) {
+            return (ggml_type) i;
+        }
+    }
+    return GGML_TYPE_COUNT;
+}
+
+// ---------------------------------------------------------------------------
+// File parser for snapshot files (quant type schemas)
+// ---------------------------------------------------------------------------
+
+struct snapshot_section {
+    llama_ftype                                    ftype;
+    ggml_type                                      default_type;
+    std::vector<std::pair<std::string, ggml_type>> overrides;
+};
+
+// This function is pretty ugly, but it's a trade-off of readable snapshot files
+// versus readable parsing code
+static bool parse_snapshot_file(const std::string & path, std::vector<snapshot_section> & sections) {
+    std::ifstream f(path);
+    if (!f.good()) {
+        return false;
+    }
+
+    snapshot_section * cur = nullptr;
+    std::string        line;
+
+    while (std::getline(f, line)) {
+        if (line.empty() || line[0] == '#') {
+            continue;
+        }
+
+        // section header: [FTYPE_NAME] default_type
+        if (line[0] == '[') {
+            auto close = line.find(']');
+            if (close == std::string::npos) {
+                fprintf(stderr, "parse error: missing ] in '%s'\n", line.c_str());
+                return false;
+            }
+            std::string ftype_str = line.substr(1, close - 1);
+            std::string default_str;
+            size_t      pos = close + 1;
+            while (pos < line.size() && line[pos] == ' ') {
+                pos++;
+            }
+            default_str = line.substr(pos);
+
+            llama_ftype ftype = llama_ftype_from_name(ftype_str.c_str());
+            if ((int) ftype < 0) {
+                fprintf(stderr, "parse error: unknown ftype '%s'\n", ftype_str.c_str());
+                return false;
+            }
+
+            ggml_type dtype = ggml_type_from_name(default_str);
+            if (dtype == GGML_TYPE_COUNT) {
+                fprintf(stderr, "parse error: unknown default type '%s'\n", default_str.c_str());
+                return false;
+            }
+
+            sections.push_back({ ftype, dtype, {} });
+            cur = &sections.back();
+            continue;
+        }
+
+        if (!cur) {
+            fprintf(stderr, "parse error: tensor line before any section: '%s'\n", line.c_str());
+            return false;
+        }
+
+        auto sp = line.rfind(' ');
+        if (sp == std::string::npos) {
+            fprintf(stderr, "parse error: no space in tensor line: '%s'\n", line.c_str());
+            return false;
+        }
+
+        std::string tname = line.substr(0, sp);
+        std::string ttype = line.substr(sp + 1);
+
+        ggml_type gt = ggml_type_from_name(ttype);
+        if (gt == GGML_TYPE_COUNT) {
+            fprintf(stderr, "parse error: unknown type '%s' for tensor '%s'\n", ttype.c_str(), tname.c_str());
+            return false;
+        }
+
+        cur->overrides.push_back({ tname, gt });
+    }
+
+    return true;
+}
+
+// ---------------------------------------------------------------------------
+// Remote model support using gguf-model-data.cpp
+// ---------------------------------------------------------------------------
+
+struct remote_model_spec {
+    const char * repo;
+    const char * quant;
+};
+
+// Get model name from repo: strip org prefix, strip -GGUF suffix,
+// and strip anything up to and including first '_' (e.g. "deepseek-ai_DeepSeek-V3.1").
+static std::string model_name_from_repo(const char * repo) {
+    std::string s(repo);
+
+    auto slash = s.find('/');
+    if (slash != std::string::npos) {
+        s = s.substr(slash + 1);
+    }
+
+    const std::string suffix = "-GGUF";
+    if (s.size() >= suffix.size() && s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0) {
+        s = s.substr(0, s.size() - suffix.size());
+    }
+
+    auto underscore = s.find('_');
+    if (underscore != std::string::npos) {
+        s = s.substr(underscore + 1);
+    }
+
+    return s;
+}
+
+static std::string snapshot_file_from_name(const std::string & name) {
+    std::string lower = name;
+    for (auto & c : lower) {
+        c = std::tolower(c);
+    }
+    return lower;
+}
+
+static const remote_model_spec model_specs[] = {
+    { "ggml-org/Qwen3-0.6B-GGUF",                   "Q8_0"   },
+    { "ggml-org/GLM-4.6V-GGUF",                     "Q8_0"   },
+    { "ggml-org/Step-3.5-Flash-GGUF",               "Q4_K"   },
+    { "ggml-org/Qwen3-Coder-Next-GGUF",             "Q8_0"   },
+    { "ggml-org/Qwen3-14B-GGUF",                    "Q8_0"   },
+    { "ggml-org/Nemotron-Nano-3-30B-A3B-GGUF",      "Q8_0"   },
+    { "ggml-org/gpt-oss-120b-GGUF",                 "mxfp4"  },
+    { "ggml-org/gemma-3-4b-it-GGUF",                "Q8_0"   },
+    { "bartowski/Meta-Llama-3.1-70B-Instruct-GGUF", "Q4_K_M" },
+    { "bartowski/deepseek-ai_DeepSeek-V3.1-GGUF",   "IQ1_M"  },
+    { "bartowski/Qwen_Qwen3.5-397B-A17B-GGUF",      "IQ1_S"  }, // TODO: swap with ggml-org if/when it's released
+    { "bartowski/Qwen_Qwen3.5-27B-GGUF",            "Q8_0"   }, // TODO: swap with ggml-org if/when it's released
+};
+
+static const int n_model_specs = (int) (sizeof(model_specs) / sizeof(model_specs[0]));
+
+static llama_model * build_mock_model_from_remote(const gguf_remote_model & remote) {
+    llama_quant_model_desc desc = {};
+    desc.architecture           = remote.architecture.c_str();
+    desc.n_embd                 = remote.n_embd;
+    desc.n_ff                   = remote.n_ff;
+    desc.n_layer                = remote.n_layer;
+    desc.n_head                 = remote.n_head;
+    desc.n_head_kv              = remote.n_head_kv;
+    desc.n_expert               = remote.n_expert;
+    desc.n_embd_head_k          = remote.n_embd_head_k;
+    desc.n_embd_head_v          = remote.n_embd_head_v;
+    return llama_quant_model_from_metadata(&desc);
+}
+
+// Single ggml context holding all quantizable tensors for a model.
+struct mock_tensors {
+    ggml_context_ptr           ctx;
+    std::vector<ggml_tensor *> tensors;
+};
+
+static mock_tensors build_mock_tensors(const quantize_state_impl * qs, const gguf_remote_model & remote) {
+    const size_t            ctx_size = remote.tensors.size() * ggml_tensor_overhead();
+    struct ggml_init_params params   = { ctx_size, nullptr, true };
+    ggml_context_ptr        ctx(ggml_init(params));
+
+    std::vector<ggml_tensor *> result;
+
+    for (const auto & t : remote.tensors) {
+        ggml_tensor * gt = ggml_new_tensor_4d(ctx.get(), GGML_TYPE_F32, t.ne[0], t.ne[1], t.ne[2], t.ne[3]);
+        ggml_set_name(gt, t.name.c_str());
+        if (llama_quant_tensor_allows_quantization(qs, gt)) {
+            result.push_back(gt);
+        }
+    }
+
+    // sort by layer index then name, matching llama_model_loader::weight_name_comparer
+    std::sort(result.begin(), result.end(), [](const ggml_tensor * a, const ggml_tensor * b) {
+        int a_layer = -1, b_layer = -1;
+        sscanf(a->name, "blk.%d.", &a_layer);
+        sscanf(b->name, "blk.%d.", &b_layer);
+        if (a_layer != b_layer) {
+            return a_layer < b_layer;
+        }
+        return strcmp(a->name, b->name) < 0;
+    });
+
+    return { std::move(ctx), std::move(result) };
+}
+
+// ---------------------------------------------------------------------------
+// Generate mode: regenerate all snapshot files
+// Use this when either adding new models or modifying quants
+// ---------------------------------------------------------------------------
+
+static std::string generate_snapshot(const std::string &       name,
+                                     const gguf_remote_model & remote,
+                                     quantize_state_impl *     qs,
+                                     mock_tensors &            mt) {
+    std::ostringstream out;
+
+    out << "# Model: " << name << "\n";
+    out << "# n_embd=" << remote.n_embd << ", n_ff=" << remote.n_ff << ", n_vocab=" << remote.n_vocab
+        << ", n_layer=" << remote.n_layer << ", n_head=" << remote.n_head << ", n_head_kv=" << remote.n_head_kv;
+    if (remote.n_expert > 0) {
+        out << ", n_expert=" << remote.n_expert;
+    }
+    out << "\n";
+
+    for (int i = 0; i < LLAMA_FTYPE_GUESSED; i++) {
+        llama_ftype ft           = (llama_ftype) i;
+        ggml_type   default_type = llama_ftype_get_default_type(ft);
+        if (default_type == GGML_TYPE_COUNT) {
+            continue;
+        }
+        const char * fname = llama_ftype_to_name(ft);
+        if (!fname) {
+            continue;
+        }
+
+        std::vector<ggml_type> result_types(mt.tensors.size());
+        llama_quant_compute_types(qs, ft, mt.tensors.data(), result_types.data(), mt.tensors.size());
+
+        out << "\n[" << fname << "] " << ggml_type_name(default_type) << "\n";
+        for (size_t j = 0; j < mt.tensors.size(); j++) {
+            if (result_types[j] != default_type) {
+                out << ggml_get_name(mt.tensors[j]) << " " << ggml_type_name(result_types[j]) << "\n";
+            }
+        }
+    }
+
+    return out.str();
+}
+
+static int run_generate(const std::string & snapshot_dir) {
+    fprintf(stderr, "This will overwrite all snapshot files in:\n  %s\n", snapshot_dir.c_str());
+    fprintf(stderr, "Continue? [y/N] ");
+    int ch = fgetc(stdin);
+    if (ch != 'y' && ch != 'Y') {
+        fprintf(stderr, "Aborted.\n");
+        return 1;
+    }
+
+    fprintf(stderr, "\n");
+
+    int n_written = 0;
+
+    for (int m = 0; m < n_model_specs; m++) {
+        const auto & spec = model_specs[m];
+        std::string  name = model_name_from_repo(spec.repo);
+
+        fprintf(stderr, "Fetching model metadata for %s from %s...\n", name.c_str(), spec.repo);
+        auto result = gguf_fetch_model_meta(spec.repo, spec.quant);
+        if (!result.has_value()) {
+            fprintf(stderr, "ERROR: could not fetch model metadata for %s\n", name.c_str());
+            return 1;
+        }
+
+        const auto &                remote  = result.value();
+        llama_model *               model   = build_mock_model_from_remote(remote);
+        llama_model_quantize_params qparams = llama_model_quantize_default_params();
+        quantize_state_impl *       qs      = llama_quant_init(model, &qparams);
+        auto                        mt      = build_mock_tensors(qs, remote);
+
+        std::string content = generate_snapshot(name, remote, qs, mt);
+        std::string path    = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
+
+        std::ofstream f(path);
+        if (!f.good()) {
+            fprintf(stderr, "ERROR: could not write %s\n", path.c_str());
+            llama_quant_free(qs);
+            llama_model_free(model);
+            return 1;
+        }
+        f << content;
+        n_written++;
+        fprintf(stderr, "  wrote %s\n", path.c_str());
+        llama_quant_free(qs);
+        llama_model_free(model);
+    }
+
+    fprintf(stderr, "%d files written\n", n_written);
+    return 0;
+}
+
+// ---------------------------------------------------------------------------
+// Test mode: compare against snapshot files
+// ---------------------------------------------------------------------------
+
+static bool run_test_section(quantize_state_impl * qs, mock_tensors & mt, const snapshot_section & section) {
+    // verify default_type matches what llama_ftype_get_default_type returns
+    ggml_type computed_default = llama_ftype_get_default_type(section.ftype);
+    if (computed_default != section.default_type) {
+        printf("  FAIL  [%s] default type mismatch: file says %s, code says %s\n", llama_ftype_to_name(section.ftype),
+               ggml_type_name(section.default_type), ggml_type_name(computed_default));
+        return false;
+    }
+
+    std::vector<ggml_type> result_types(mt.tensors.size());
+    llama_quant_compute_types(qs, section.ftype, mt.tensors.data(), result_types.data(), mt.tensors.size());
+
+    std::map<std::string, ggml_type> override_map(section.overrides.begin(), section.overrides.end());
+
+    bool all_pass         = true;
+    int  n_override_found = 0;
+
+    for (size_t i = 0; i < mt.tensors.size(); i++) {
+        const char * name = ggml_get_name(mt.tensors[i]);
+        ggml_type    got  = result_types[i];
+
+        ggml_type expected = section.default_type;
+        auto      it       = override_map.find(name);
+        if (it != override_map.end()) {
+            expected = it->second;
+            n_override_found++;
+        }
+
+        if (got != expected) {
+            printf("  FAIL  %-50s %-10s expected %s, got %s\n", name, llama_ftype_to_name(section.ftype),
+                   ggml_type_name(expected), ggml_type_name(got));
+            all_pass = false;
+        }
+    }
+
+    if (n_override_found != (int) section.overrides.size()) {
+        printf("  FAIL  [%s] override count mismatch: listed %d, matched %d\n", llama_ftype_to_name(section.ftype),
+               (int) section.overrides.size(), n_override_found);
+        all_pass = false;
+    }
+
+    return all_pass;
+}
+
+static int run_remote_tests(const std::string & snapshot_dir, const char * argv0) {
+    int total_pass = 0;
+    int total_fail = 0;
+    int total_skip = 0;
+
+    for (int m = 0; m < n_model_specs; m++) {
+        const auto & spec = model_specs[m];
+        std::string  name = model_name_from_repo(spec.repo);
+        printf("=== %s ===\n", name.c_str());
+
+        auto result = gguf_fetch_model_meta(spec.repo, spec.quant, "", false);
+        if (!result.has_value()) {
+            printf("  SKIP  (could not fetch model metadata)\n\n");
+            total_skip++;
+            continue;
+        }
+
+        const auto &                remote  = result.value();
+        llama_model *               model   = build_mock_model_from_remote(remote);
+        llama_model_quantize_params qparams = llama_model_quantize_default_params();
+        quantize_state_impl *       qs      = llama_quant_init(model, &qparams);
+        auto                        mt      = build_mock_tensors(qs, remote);
+
+        std::string                   snapshot_path = snapshot_dir + "/" + snapshot_file_from_name(name) + ".schema";
+        std::vector<snapshot_section> sections;
+        if (!parse_snapshot_file(snapshot_path, sections)) {
+            printf("  SKIP  (could not read snapshot file: %s)\n\n", snapshot_path.c_str());
+            llama_quant_free(qs);
+            llama_model_free(model);
+            total_skip++;
+            continue;
+        }
+
+        int model_pass = 0;
+        int model_fail = 0;
+
+        for (const auto & section : sections) {
+            bool pass = run_test_section(qs, mt, section);
+            if (pass) {
+                model_pass++;
+            } else {
+                model_fail++;
+            }
+        }
+
+        printf("  %s  %s: %d/%d ftype sections passed (%d tensors)\n", model_fail == 0 ? "PASS" : "FAIL", name.c_str(),
+               model_pass, model_pass + model_fail, (int) mt.tensors.size());
+        printf("\n");
+
+        if (model_fail == 0) {
+            total_pass++;
+        } else {
+            total_fail++;
+        }
+
+        llama_quant_free(qs);
+        llama_model_free(model);
+    }
+
+    printf("%d/%d models passed", total_pass, total_pass + total_fail);
+    if (total_skip > 0) {
+        printf(", %d skipped", total_skip);
+    }
+    printf("\n");
+
+    if (total_fail > 0) {
+        printf("\nIf these changes are intentional, regenerate snapshot files with:\n");
+        printf("  %s --generate\n", argv0);
+    }
+
+    return total_fail > 0 ? 1 : 0;
+}
+
+int main(int argc, char ** argv) {
+    std::string snapshot_dir = SNAPSHOT_DIR;
+    bool        generate     = false;
+
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "--generate") == 0) {
+            generate = true;
+        } else if (strcmp(argv[i], "--snapshot-dir") == 0 && i + 1 < argc) {
+            snapshot_dir = argv[++i];
+        }
+    }
+
+    if (generate) {
+        return run_generate(snapshot_dir);
+    }
+
+    // suppress llama log warnings during test (e.g. tensor type fallback messages)
+    llama_log_set([](enum ggml_log_level, const char *, void *) {}, nullptr);
+
+    return run_remote_tests(snapshot_dir, argv[0]);
+}
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -0,0 +1,196 @@
+// Unit tests for quantization specific functions - quantize, dequantize and dot product
+
+#include "ggml.h"
+#include "ggml-cpu.h"
+
+#undef NDEBUG
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_BINARY = 0.025f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
+constexpr float MAX_QUANTIZATION_TOTAL_ERROR_FP4 = 0.0030f;
+constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
+constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
+constexpr float MAX_DOT_PRODUCT_ERROR_FP4 = 0.03f;
+constexpr float MAX_DOT_PRODUCT_ERROR_BINARY = 0.40f;
+constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f;
+
+static const char* RESULT_STR[] = {"ok", "FAILED"};
+
+
+// Generate synthetic data
+static void generate_data(float offset, size_t n, float * dst) {
+    for (size_t i = 0; i < n; i++) {
+        dst[i] = 0.1 + 2*cosf(i + offset);
+    }
+}
+
+// Calculate RMSE between two float arrays
+static float array_rmse(const float * a1, const float * a2, size_t n) {
+    double sum = 0;
+    for (size_t i = 0; i < n; i++) {
+        double diff = a1[i] - a2[i];
+        sum += diff * diff;
+    }
+    return sqrtf(sum) / n;
+}
+
+// Total quantization error on test data
+static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<float> tmp_out(test_size);
+
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
+    return array_rmse(test_data, tmp_out.data(), test_size);
+}
+
+// Total quantization error on test data
+static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<float> tmp_out(test_size);
+    std::vector<float> tmp_out_ref(test_size);
+
+    // FIXME: why is done twice?
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
+
+    qfns->from_float_ref(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+
+    return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
+}
+
+static float dot_product(const float * a1, const float * a2, size_t test_size) {
+    double sum = 0;
+    for (size_t i = 0; i < test_size; i++) {
+        sum += a1[i] * a2[i];
+    }
+    return sum;
+}
+
+// Total dot product error
+static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
+    GGML_UNUSED(qfns);
+
+    std::vector<uint8_t> tmp_q1(2*test_size);
+    std::vector<uint8_t> tmp_q2(2*test_size);
+
+    const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
+
+    qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
+    vdot->from_float(test_data2, tmp_q2.data(), test_size);
+
+    float result = INFINITY;
+    qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+
+    const float dot_ref = dot_product(test_data1, test_data2, test_size);
+
+    return fabsf(result - dot_ref) / test_size;
+}
+
+int main(int argc, char * argv[]) {
+    bool verbose = false;
+    const size_t test_size = 32 * 128;
+
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-v") {
+            verbose = true;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+
+    std::vector<float> test_data(test_size);
+    std::vector<float> test_data2(test_size);
+
+    generate_data(0.0, test_data.size(), test_data.data());
+    generate_data(1.0, test_data2.size(), test_data2.data());
+
+    ggml_cpu_init();
+
+    int num_failed = 0;
+    bool failed = false;
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+
+        // deprecated - skip
+        if (qfns->blck_size == 0) {
+            continue;
+        }
+
+        const ggml_type ei = (ggml_type)i;
+
+        printf("Testing %s\n", ggml_type_name((ggml_type) i));
+        ggml_quantize_init(ei);
+
+        if (qfns_cpu->from_float && qfns->to_float) {
+            const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
+            const float max_quantization_error =
+                type == GGML_TYPE_Q1_0    ? MAX_QUANTIZATION_TOTAL_ERROR_BINARY :
+                type == GGML_TYPE_TQ1_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
+                type == GGML_TYPE_TQ2_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
+                type == GGML_TYPE_Q2_K    ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
+                type == GGML_TYPE_IQ2_S   ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
+                type == GGML_TYPE_Q3_K    ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
+                type == GGML_TYPE_IQ3_S   ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
+                type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
+                type == GGML_TYPE_NVFP4   ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 : MAX_QUANTIZATION_TOTAL_ERROR;
+            failed = !(total_error < max_quantization_error);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
+            }
+
+            const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
+            failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
+            }
+
+            const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
+            const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
+                                            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
+                                          ? MAX_DOT_PRODUCT_ERROR_LOWBIT
+                                          : type == GGML_TYPE_Q1_0
+                                          ? MAX_DOT_PRODUCT_ERROR_BINARY
+                                          : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
+                                          ? MAX_DOT_PRODUCT_ERROR_TERNARY
+                                          : type == GGML_TYPE_NVFP4
+                                          ? MAX_DOT_PRODUCT_ERROR_FP4
+                                          : MAX_DOT_PRODUCT_ERROR;
+            failed = !(vec_dot_error < max_allowed_error);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s dot product error:              %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
+            }
+        }
+    }
+
+    if (num_failed || verbose) {
+        printf("%d tests failed\n", num_failed);
+    }
+
+    return num_failed > 0;
+}
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -0,0 +1,356 @@
+// Benchmark quantization specific functions on synthetic data
+
+#include "ggml.h"
+#include "ggml-cpu.h"
+
+#undef NDEBUG
+#include <algorithm>
+#include <assert.h>
+#include <functional>
+#include <math.h>
+#include <memory>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define MAX_ALIGNMENT 64
+#define QK 32
+#define WARMUP 5
+#define ITERATIONS 10
+#define MAX_ITERATIONS 100000000
+
+#define L1_SIZE      32*128
+#define L2_SIZE     32*2048
+#define L3_SIZE    32*20480
+#define MEM_SIZE 32*2048000
+
+struct quantize_perf_params {
+    std::vector<std::string> include_types;
+    std::vector<size_t> test_sizes;
+    size_t alignment_offset = 0;
+    bool op_quantize_row_q_reference = false;
+    bool op_quantize_row_q = false;
+    bool op_dequantize_row_q = false;
+    bool op_quantize_row_q_dot = false;
+    bool op_vec_dot_q = false;
+    int64_t iterations = ITERATIONS;
+};
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#include <x86intrin.h>
+inline int64_t cpu_cycles() {
+// Rough way to detect new-ish CPUs
+#ifdef __POPCNT__
+    unsigned int dummy;
+    return __rdtscp(&dummy);
+#else
+    return __rdtsc();
+#endif
+}
+
+#else
+
+#define cpu_cycles() 0
+
+#endif
+
+
+// Generate synthetic data
+static void generate_data(float offset, size_t n, float * dst) {
+    for (size_t i = 0; i < n; i++) {
+        dst[i] = 0.1 + 2*cosf(i + offset);
+    }
+}
+
+static float gigabytes_per_second(size_t bytes, int64_t usecs) {
+    return bytes / (float) usecs * 1000000 / (1024*1024*1024);
+}
+
+static void * align_with_offset(void * ptr, int offset) {
+    size_t dummy_size = MAX_ALIGNMENT * 4;
+    return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
+}
+
+static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
+    int64_t min_time_us = INT64_MAX;
+    int64_t total_time_us = 0;
+    int64_t min_time_cycles = INT64_MAX;
+    int64_t total_time_cycles = 0;
+
+    for (int i = 0; i < WARMUP; i++) {
+        func();
+    }
+
+    for (int i = 0; i < iterations; i++) {
+        const int64_t start_time = ggml_time_us();
+        const int64_t start_cycles = cpu_cycles();
+
+        func();
+
+        const int64_t end_cycles = cpu_cycles();
+        const int64_t end_time = ggml_time_us();
+
+        total_time_cycles += end_cycles - start_cycles;
+        min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles);
+        total_time_us += end_time - start_time;
+        min_time_us = std::min(min_time_us, end_time - start_time);
+    }
+
+    printf("      min cycles/%d vals   : %9.2f\n",  QK, QK * min_time_cycles / (float) size);
+    printf("      avg cycles/%d vals   : %9.2f\n",  QK, QK * total_time_cycles / (float) (size * iterations));
+    printf("      float32 throughput   : %9.2f GB/s\n",  gigabytes_per_second(4 * size * iterations, total_time_us));
+    printf("      quantized throughput : %9.2f GB/s\n",  gigabytes_per_second(q_size * iterations, total_time_us));
+}
+
+static void usage(char * argv[]) {
+    printf("Benchmark quantization specific functions on synthetic data\n");
+    printf("\n");
+    printf("usage: %s [options]\n", argv[0]);
+    printf("\n");
+    printf("options: (default)\n");
+    printf("  -h, --help            show this help message and exit\n");
+    printf("  --size SIZE           set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
+    printf("  -3                    use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
+    printf("  -4                    use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
+    printf("  --op OP               set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
+    printf("                        quantize_row_q_dot, vec_dot_q (all)\n");
+    printf("  --type TYPE           set test type as");
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (ggml_type_name(type) != NULL) {
+            if (qfns_cpu->from_float && qfns->to_float) {
+                printf(" %s", ggml_type_name(type));
+            }
+        }
+    }
+    printf(" (all)\n");
+    printf("  --alignment-offset OFFSET\n");
+    printf("                        set alignment offset as OFFSET (0)\n");
+    printf("  -i NUM, --iterations NUM\n");
+    printf("                        set test iteration number (%d)\n", ITERATIONS);
+}
+
+int main(int argc, char * argv[]) {
+    quantize_perf_params params {};
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "--size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            size_t size = std::stoi(argv[i]);
+            if (size % 32 != 0) {
+                fprintf(stderr, "error: size %zu not divisible by 32\n", size);
+                invalid_param = true;
+                break;
+            }
+            params.test_sizes.push_back(size);
+        } else if (arg == "-3") {
+            // quick select sizes that probably fit in CPU caches
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+        } else if (arg == "-4") {
+            // quick select cache sizes + memory
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+            params.test_sizes.push_back(MEM_SIZE);
+        } else if (arg == "--op") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string op {argv[i]};
+            if (op == "quantize_row_q_reference") {
+                params.op_quantize_row_q_reference = true;
+            } else if (op == "quantize_row_q") {
+                params.op_quantize_row_q = true;
+            } else if (op == "dequantize_row_q") {
+                params.op_dequantize_row_q = true;
+            } else if (op == "quantize_row_q_dot") {
+                params.op_quantize_row_q_dot = true;
+            } else if (op == "vec_dot_q") {
+                params.op_vec_dot_q = true;
+            } else {
+                invalid_param = true;
+                break;
+            }
+        } else if (arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_types.push_back(argv[i]);
+        } else if (arg == "--alignment-offset") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int alignment = std::stoi(argv[i]);
+            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
+            fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
+                invalid_param = true;
+                break;
+            }
+            params.alignment_offset = alignment;
+        } else if ((arg == "-i") || (arg == "--iterations")) {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int number = std::stoi(argv[i]);
+            if (number < 0 || number > MAX_ITERATIONS) {
+            fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS);
+                invalid_param = true;
+                break;
+            }
+            params.iterations = number;
+        } else if ((arg == "-h") || (arg == "--help")) {
+            usage(argv);
+            return 1;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        return 1;
+    }
+
+    if (params.test_sizes.empty()) {
+        params.test_sizes.push_back(L1_SIZE);
+    }
+    if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
+        params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
+    }
+
+    std::sort(params.test_sizes.begin(), params.test_sizes.end());
+    size_t largest = params.test_sizes.back();
+
+    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q1_v   (largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q2_v   (largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_out_v  (largest*4 + MAX_ALIGNMENT*2);
+
+    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
+    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
+    float * test_q1    = (float *) align_with_offset(test_q1_v.data(),    params.alignment_offset);
+    float * test_q2    = (float *) align_with_offset(test_q2_v.data(),    params.alignment_offset);
+    float * test_out   = (float *) align_with_offset(test_out_v.data(),   params.alignment_offset);
+
+    generate_data(0, largest, test_data1);
+    generate_data(1, largest, test_data2);
+
+    int64_t iterations = params.iterations;
+
+    ggml_cpu_init();
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
+            continue;
+        }
+
+        if (qfns_cpu->from_float && qfns->to_float) {
+            printf("%s\n", ggml_type_name(type));
+
+            ggml_quantize_init(type);
+
+            if (params.op_quantize_row_q_reference) {
+                printf("  quantize_row_q_reference\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void) -> float {
+                        qfns->from_float_ref(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = ggml_row_size(type, size);
+                    benchmark_function(size, quantized_size, iterations, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_quantize_row_q) {
+                printf("  quantize_row_q\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void) -> float {
+                        qfns_cpu->from_float(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = ggml_row_size(type, size);
+                    benchmark_function(size, quantized_size, iterations, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_dequantize_row_q) {
+                printf("  dequantize_row_q\n");
+                qfns_cpu->from_float(test_data1, test_q1, largest);
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void) -> float {
+                        qfns->to_float(test_q1, test_out, size);
+                        return test_out[0];
+                    };
+                    size_t quantized_size = ggml_row_size(type, size);
+                    benchmark_function(size, quantized_size, iterations, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_quantize_row_q_dot) {
+                printf("  quantize_row_q_dot\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void) -> float {
+                        const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
+                        vdot->from_float(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = ggml_row_size(type, size);
+                    benchmark_function(size, quantized_size, iterations, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_vec_dot_q) {
+                printf("  vec_dot_q\n");
+                qfns_cpu->from_float(test_data1, test_q1, largest);
+                qfns_cpu->from_float(test_data2, test_q2, largest);
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void) -> float {
+                        float result;
+                        qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
+                        return result;
+                    };
+                    size_t quantized_size = ggml_row_size(type, size);
+                    benchmark_function(size, quantized_size, iterations, quantize_fn);
+                }
+                printf("\n");
+            }
+        }
+    }
+
+    return 0;
+}
--- a/tests/test-quantize-stats.cpp
+++ b/tests/test-quantize-stats.cpp
@@ -0,0 +1,427 @@
+#include "llama.h"
+
+#include "build-info.h"
+#include "common.h"
+
+#include "../src/llama-model.h"
+
+#include "ggml.h"
+#include "ggml-cpu.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <numeric>
+#include <regex>
+#include <string>
+#include <vector>
+#include <thread>
+#include <mutex>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+struct quantize_stats_params {
+    std::string model = "models/7B/ggml-model-f16.gguf";
+    bool verbose = false;
+    bool per_layer_stats = false;
+    bool print_histogram = false;
+    bool reference = false;
+    std::vector<std::string> include_layers;
+    std::vector<std::string> exclude_layers;
+    std::vector<enum ggml_type> include_types;
+};
+
+constexpr size_t HISTOGRAM_BUCKETS = 150;
+constexpr double HISTOGRAM_RANGE = 0.03;
+
+struct error_stats {
+    size_t num_samples;
+    double total_error;
+    double max_error;
+    uint64_t error_histogram[HISTOGRAM_BUCKETS];
+};
+
+static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+    quantize_stats_params params;
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -r, --reference\n");
+    fprintf(stderr, "                        use reference implementation (default: false)\n");
+    fprintf(stderr, "  -v, --verbose\n");
+    fprintf(stderr, "                        verbose output (default: false)\n");
+    fprintf(stderr, "  -p, --per-layer-stats\n");
+    fprintf(stderr, "                        print stats per layer (default: false)\n");
+    fprintf(stderr, "  --histogram\n");
+    fprintf(stderr, "                        print error histogram (default: false)\n");
+    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
+    fprintf(stderr, "                        only test layers matching pattern\n");
+    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
+    fprintf(stderr, "                        exclude layers matching pattern\n");
+    fprintf(stderr, "  -t TYPE, --type TYPE\n");
+    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
+    fprintf(stderr, "\n");
+}
+
+// Check if a layer is included/excluded by command line
+static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
+    for (const auto& excluded : params.exclude_layers) {
+        if (std::regex_search(layer, std::regex(excluded))) {
+            return false;
+        }
+    }
+    for (const auto& included : params.include_layers) {
+        if (std::regex_search(layer, std::regex(included))) {
+            return true;
+        }
+    }
+    return params.include_layers.empty();
+}
+
+// Update error statistics given vectors with the before/after result of quantization
+static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+    for (int64_t i = 0; i < nelements; i++) {
+        double diff = input[i] - output[i];
+        stats.total_error += diff * diff;
+        stats.max_error = fmax(fabs(diff), stats.max_error);
+        stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
+    }
+    stats.num_samples += nelements;
+}
+
+static void combine_error_stats(error_stats & into, const error_stats & from) {
+    into.num_samples += from.num_samples;
+    into.total_error += from.total_error;
+    if (from.max_error > into.max_error) into.max_error = from.max_error;
+    for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+
+static double find_quantile(const error_stats & stats, double quantile) {
+    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
+
+    double accum = 0;
+    for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+        accum += stats.error_histogram[i];
+        if (accum >= sum*quantile) {
+            return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+        }
+    }
+    return INFINITY;
+}
+
+static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
+    double median = find_quantile(stats, .5);
+    double pct95 = find_quantile(stats, .95);
+    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+    if (print_histogram) {
+        printf("Error distribution:\n");
+        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
+            printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
+        }
+    }
+}
+
+// copied from ggml.h - verify that we can access this as a flat array
+static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+static void test_roundtrip_on_chunk(
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
+    float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
+) {
+    if (layer->type == GGML_TYPE_F16) {
+        for (int i = 0; i < chunk_size; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+        }
+    } else {
+        input_scratch = ggml_get_data_f32(layer) + offset;
+    }
+
+    if (use_reference) {
+        qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
+    } else {
+        qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
+    }
+    qfns.to_float(quantized_scratch, output_scratch, chunk_size);
+
+    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
+
+// Run quantization function for a single layer and update error stats
+static void test_roundtrip_on_layer(
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
+    const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
+    std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
+) {
+    assert(tensor_is_contiguous(layer));
+    error_stats layer_error {};
+    uint64_t nelements = ggml_nelements(layer);
+
+    float* input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        input_scratch_ptr = input_scratch.data();
+    }
+    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
+    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+    int chunk_size = 32*512;
+    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
+    if (num_chunks < 2 || max_thread < 2) {
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
+                output_scratch.data(), print_layer_stats ? layer_error : total_error);
+    } else {
+        auto & stats = print_layer_stats ? layer_error : total_error;
+        std::mutex mutex;
+        uint64_t counter = 0;
+        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
+             &quantized_scratch, &output_scratch, chunk_size] () {
+            error_stats local_stats {};
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                uint64_t offset = counter; counter += chunk_size;
+                if (offset >= nelements) {
+                    combine_error_stats(stats, local_stats);
+                    break;
+                }
+                lock.unlock();
+                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
+                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
+            }
+        };
+        int nthread = std::min(num_chunks, max_thread);
+        std::vector<std::thread> workers(nthread-1);
+        for (auto& w : workers) w = std::thread(compute);
+        compute();
+        for (auto& w : workers) w.join();
+    }
+
+    if (print_layer_stats) {
+        print_error_stats(name, layer_error, false);
+        combine_error_stats(total_error, layer_error);
+    }
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    quantize_stats_params params;
+
+    // read command line
+
+    int max_thread = 0;
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            quantize_stats_print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-r" || arg == "--reference") {
+            params.reference = true;
+        } else if (arg == "-v") {
+            params.verbose = true;
+        } else if (arg == "-p" || arg == "--per-layer-stats") {
+            params.per_layer_stats = true;
+        } else if (arg == "--histogram") {
+            params.print_histogram = true;
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-l" || arg == "--include-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_layers.emplace_back(argv[i]);
+        } else if (arg == "-L" || arg == "--exclude-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.exclude_layers.emplace_back(argv[i]);
+        } else if (arg == "-t" || arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_TYPE_COUNT; ++j) {
+               const auto * name = ggml_type_name((ggml_type) j);
+               if (name && strcmp(argv[i], name) == 0) break;
+            }
+            if (j < GGML_TYPE_COUNT) {
+                params.include_types.push_back((ggml_type) j);
+            } else {
+                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
+                invalid_param = true;
+            }
+        } else if (arg == "-n" || arg == "--num-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            max_thread = atoi(argv[i]);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            quantize_stats_print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        quantize_stats_print_usage(argc, argv);
+        return 1;
+    }
+
+    llama_print_build_info();
+
+    // load the model
+    fprintf(stderr, "Loading model\n");
+
+    const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
+    llama_context * ctx;
+
+    {
+        auto mparams = llama_model_default_params();
+        mparams.use_mlock  = false;
+
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+
+        auto cparams = llama_context_default_params();
+        cparams.n_ctx = 256;
+
+        ctx = llama_init_from_model(model, cparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_model_free(model);
+            return 1;
+        }
+    }
+
+    const auto & tensors = llama_internal_get_tensor_map(model);
+
+    // check layer tensors
+    int included_layers = 0;
+    int64_t max_nelements = 0;
+    bool is_f16 = false;
+    for (const auto & kv_tensor : tensors) {
+        if (!layer_included(params, kv_tensor.first)) {
+            continue;
+        }
+        if (params.verbose) {
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
+        }
+        if (kv_tensor.second->type == GGML_TYPE_F16) {
+            is_f16 = true;
+        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+            llama_free(ctx);
+            llama_model_free(model);
+            return 1;
+        }
+        included_layers++;
+        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+    }
+
+    if (is_f16) {
+        printf("note: source model is f16\n");
+    }
+    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
+    // allocate scratch space
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;
+
+    // loop throught quantization types
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        const ggml_type type = (ggml_type) i;
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+            continue;
+        }
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
+            if (params.verbose) {
+                printf("testing %s ...\n",  ggml_type_name(type));
+            }
+
+            ggml_quantize_init(type);
+
+            error_stats global_stats {};
+
+            for (const auto & kv_tensor : tensors) {
+                if (!layer_included(params, kv_tensor.first)) {
+                    continue;
+                }
+                if (params.verbose) {
+                    printf("  %s ...\n",  kv_tensor.first.c_str());
+                }
+                std::string layer_name { ggml_type_name(type) };
+                layer_name += "::" + kv_tensor.first;
+                test_roundtrip_on_layer(
+                        layer_name,
+                        params.per_layer_stats,
+                        *qfns, *qfns_cpu,
+                        params.reference,
+                        kv_tensor.second,
+                        input_scratch,
+                        quantized_scratch,
+                        output_scratch,
+                        global_stats,
+                        max_thread
+                );
+            }
+
+            print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
+        }
+    }
+
+
+    llama_free(ctx);
+    llama_model_free(model);
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    return 0;
+}
--- a/tests/test-reasoning-budget.cpp
+++ b/tests/test-reasoning-budget.cpp
@@ -0,0 +1,260 @@
+#include "reasoning-budget.h"
+#include "unicode.h"
+
+#include "llama.h"
+#include "ggml.h"
+
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include <cmath>
+#include <cstddef>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// Reasoning budget sampler test helper
+// These tests use nullptr vocab which safely falls back to treating all tokens as complete
+// (The UTF-8 boundary detection logic is tested separately in test_utf8_boundary_detection)
+static void test_reasoning_budget(
+    const char * test_name,
+    const std::vector<llama_token> & sequence,
+    const std::vector<llama_token> & start_tokens,
+    const std::vector<llama_token> & end_tokens,
+    const std::vector<llama_token> & forced_tokens,
+    int32_t budget,
+    common_reasoning_budget_state initial_state,
+    size_t expected_force_start,   // token index where forcing should start (SIZE_MAX = never)
+    size_t expected_force_end      // token index where forcing should end (after this, no more forcing)
+) {
+    // Find the maximum token ID to ensure our vocab covers all tokens
+    llama_token max_token = 0;
+    for (auto t : sequence) max_token = std::max(max_token, t);
+    for (auto t : start_tokens) max_token = std::max(max_token, t);
+    for (auto t : end_tokens) max_token = std::max(max_token, t);
+    for (auto t : forced_tokens) max_token = std::max(max_token, t);
+
+    // Create a minimal sampler with mock vocabulary
+    // For this test, we use nullptr as vocab since we're testing state transitions
+    // The UTF-8 boundary check will treat all tokens as complete (safe fallback)
+    auto * sampler = common_reasoning_budget_init(
+        nullptr,  // vocab - not used for basic state machine tests
+        start_tokens,
+        end_tokens,
+        forced_tokens,
+        budget,
+        initial_state
+    );
+
+    // Create a test token data array for checking forcing behavior
+    // Vocab size must be large enough to include all tokens (start, end, forced, sequence)
+    std::vector<llama_token_data> cur;
+    const size_t n_vocab = (size_t)max_token + 1;
+    for (size_t i = 0; i < n_vocab; i++) {
+        cur.emplace_back(llama_token_data{(llama_token)i, logf((float)(i+1)), 0.0f});
+    }
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+
+    size_t actual_force_start = SIZE_MAX;
+    size_t actual_force_end = SIZE_MAX;
+
+    // Feed the sequence and track when forcing occurs
+    for (size_t i = 0; i < sequence.size(); i++) {
+        // Check if we're in forcing state by applying and seeing if logits are modified
+        cur_p.selected = -1;
+        for (size_t j = 0; j < cur.size(); j++) {
+            cur[j].logit = logf((float)(j+1));  // reset logits
+        }
+
+        llama_sampler_apply(sampler, &cur_p);
+
+        // Check if forcing is active (all logits except one should be -INFINITY)
+        size_t finite_count = 0;
+        llama_token finite_token = -1;
+        for (size_t j = 0; j < cur.size(); j++) {
+            if (std::isfinite(cur[j].logit)) {
+                finite_count++;
+                finite_token = cur[j].id;
+            }
+        }
+
+        llama_sampler_accept(sampler, sequence[i]);
+
+        fprintf(stderr, "    i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);
+
+        if (finite_count == 1) {
+            if (actual_force_start == SIZE_MAX) {
+                actual_force_start = i;
+            }
+            actual_force_end = i;
+        } else if (actual_force_start != SIZE_MAX && actual_force_end != SIZE_MAX) {
+            // Forcing stopped
+            break;
+        }
+    }
+
+    llama_sampler_free(sampler);
+
+    // Verify forcing occurred at expected positions
+    if (expected_force_start == SIZE_MAX) {
+        if (actual_force_start != SIZE_MAX) {
+            fprintf(stderr, "Test '%s' FAILED: Expected no forcing, but forcing occurred at %zu\n", test_name, actual_force_start);
+            GGML_ASSERT(false && "Expected no forcing, but forcing occurred");
+        }
+    } else {
+        if (actual_force_start == SIZE_MAX) {
+            fprintf(stderr, "Test '%s' FAILED: Expected forcing but none occurred\n", test_name);
+            GGML_ASSERT(false && "Expected forcing but none occurred");
+        }
+        if (actual_force_start != expected_force_start) {
+            fprintf(stderr, "Test '%s' FAILED: Forcing started at %zu, expected %zu\n", test_name, actual_force_start, expected_force_start);
+            GGML_ASSERT(false && "Forcing started at wrong position");
+        }
+    }
+
+    if (expected_force_end != SIZE_MAX) {
+        if (actual_force_end < expected_force_end) {
+            fprintf(stderr, "Test '%s' FAILED: Forcing ended at %zu, expected >= %zu\n", test_name, actual_force_end, expected_force_end);
+            GGML_ASSERT(false && "Forcing ended too early");
+        }
+    }
+
+    fprintf(stderr, "  Test '%s' passed (force_start=%zu, force_end=%zu)\n", test_name, actual_force_start, actual_force_end);
+    (void)sequence;
+}
+
+// UTF-8 boundary detection unit test
+// Tests common_utf8_is_complete() from reasoning-budget.h
+static void test_utf8_boundary_detection() {
+    // Complete sequences
+    GGML_ASSERT(common_utf8_is_complete("hello"));
+    GGML_ASSERT(common_utf8_is_complete(""));
+    GGML_ASSERT(common_utf8_is_complete("\xC2\xA0"));            // complete 2-byte UTF-8 (U+00A0)
+    GGML_ASSERT(common_utf8_is_complete("\xE2\x80\x9C"));        // complete 3-byte UTF-8 (left double quote)
+    GGML_ASSERT(common_utf8_is_complete("\xF0\x9F\x98\x80"));    // complete 4-byte UTF-8 (emoji)
+    GGML_ASSERT(common_utf8_is_complete("abc\xC3\xA9"));         // ASCII + complete 2-byte
+
+    // Incomplete sequences
+    GGML_ASSERT(!common_utf8_is_complete(std::string("\xC2", 1)));            // 2-byte start, missing continuation
+    GGML_ASSERT(!common_utf8_is_complete(std::string("\xE2\x80", 2)));        // 3-byte start + 1 cont, missing 1
+    GGML_ASSERT(!common_utf8_is_complete(std::string("\xE2", 1)));            // 3-byte start, missing 2
+    GGML_ASSERT(!common_utf8_is_complete(std::string("\xF0\x9F\x98", 3)));    // 4-byte start + 2 cont, missing 1
+    GGML_ASSERT(!common_utf8_is_complete(std::string("\xF0\x9F", 2)));        // 4-byte start + 1 cont, missing 2
+    GGML_ASSERT(!common_utf8_is_complete(std::string("\xF0", 1)));            // 4-byte start, missing 3
+    GGML_ASSERT(!common_utf8_is_complete(std::string("\x80", 1)));            // orphan continuation byte
+
+    // Mixed: ASCII followed by start of multi-byte
+    GGML_ASSERT(!common_utf8_is_complete(std::string("hello\xC3", 6)));       // ASCII + incomplete 2-byte
+    GGML_ASSERT(common_utf8_is_complete(std::string("hello\xC3\xA9", 7)));    // ASCII + complete 2-byte
+}
+
+int main(void) {
+    // Reasoning budget sampler tests
+    printf("Testing reasoning budget sampler... ");
+
+    // Test 1: Basic budget with start/end tokens - no forcing (natural end before budget exhausted)
+    {
+        const std::vector<llama_token> start = {100};  // start token
+        const std::vector<llama_token> end = {101};    // end token
+        const std::vector<llama_token> forced = {102}; // forced token (not used in this test)
+        const std::vector<llama_token> sequence = {100, 50, 51, 101, 52}; // start, two tokens, end, one more
+
+        test_reasoning_budget("natural end before budget exhausted", sequence, start, end, forced,
+            5,      // budget of 5 tokens
+            REASONING_BUDGET_IDLE,
+            SIZE_MAX, SIZE_MAX); // no forcing expected (natural end)
+    }
+
+    // Test 2: Budget exhausted, forcing should occur
+    // Flow: i=0 apply()->passthrough, accept(100)->COUNTING; i=1 accept(50)->remaining=1
+    // i=2 accept(51)->remaining=0->FORCING; i=3 apply() forces token[0]; i=4 apply() forces token[1]
+    // At i=4, accept() advances force_pos to 2 which equals forced_tokens.size(), so state becomes DONE
+    {
+        const std::vector<llama_token> start = {100};
+        const std::vector<llama_token> end = {101};
+        const std::vector<llama_token> forced = {102, 101}; // forced message + end
+        const std::vector<llama_token> sequence = {100, 50, 51, 52, 53}; // start + 4 tokens (budget=2)
+
+        test_reasoning_budget("budget exhausted forcing", sequence, start, end, forced,
+            2,      // budget of 2 tokens
+            REASONING_BUDGET_IDLE,
+            3,      // forcing starts at i=3 (accept at i=2 depletes budget, apply at i=3 forces)
+            4);     // forcing continues through i=4 (accept at i=4 transitions to DONE)
+    }
+
+    // Test 3: Activate immediately with budget=0, forcing should start right away
+    // Flow: init promotes COUNTING+budget=0 to FORCING, so apply() sees FORCING at i=0
+    {
+        const std::vector<llama_token> start = {100};
+        const std::vector<llama_token> end = {101};
+        const std::vector<llama_token> forced = {102, 101};
+        const std::vector<llama_token> sequence = {100, 50, 51, 52}; // start token first, then 3 tokens
+
+        test_reasoning_budget("activate immediately budget=0", sequence, start, end, forced,
+            0,      // budget of 0 tokens
+            REASONING_BUDGET_COUNTING, // starts counting, promoted to FORCING since budget=0
+            0,      // forcing starts at i=0 (initialized in FORCING, apply forces immediately)
+            1);     // forcing continues through i=1 (accept at i=1 transitions to DONE)
+    }
+
+    // Test 4: No start/end tokens configured - passthrough (no forcing)
+    {
+        const std::vector<llama_token> start = {};
+        const std::vector<llama_token> end = {};
+        const std::vector<llama_token> forced = {102};
+        const std::vector<llama_token> sequence = {50, 51, 52, 53};
+
+        test_reasoning_budget("no start/end configured", sequence, start, end, forced,
+            2,      // budget
+            REASONING_BUDGET_IDLE,
+            SIZE_MAX, SIZE_MAX); // no forcing (no start/end configured)
+    }
+
+    // Test 5: Activate immediately with budget > 0, count down then force
+    // Flow: i=0 accept(50)->remaining=1, i=1 accept(51)->remaining=0->FORCING
+    // Forcing starts at i=2 (apply sees FORCING after accept at i=1 transitioned)
+    {
+        const std::vector<llama_token> start = {100};
+        const std::vector<llama_token> end = {101};
+        const std::vector<llama_token> forced = {102, 101};
+        const std::vector<llama_token> sequence = {50, 51, 52, 53};
+
+        test_reasoning_budget("activate immediately with budget", sequence, start, end, forced,
+            2,      // budget of 2 tokens
+            REASONING_BUDGET_COUNTING,
+            2,      // forcing starts at i=2 (after 2 accepts deplete budget, apply at i=2 forces)
+            3);     // forcing continues through i=3
+    }
+
+    // Test 6: Multi-block thinking. First block ends naturally at i=2, second
+    // start tag at i=3 re-arms the budget, which then exhausts at i=5.
+    // Regression: before this fix, DONE absorbed all subsequent tokens and a
+    // second <think> block ran unbudgeted.
+    // Flow: i=0 accept(100)->COUNTING rem=2; i=1 accept(50)->rem=1;
+    //       i=2 accept(101)->end_matcher matches, DONE;
+    //       i=3 accept(100)->re-arm, COUNTING rem=2;
+    //       i=4 accept(60)->rem=1; i=5 accept(61)->rem=0->FORCING;
+    //       i=6 apply()->forces token[0]=102, accept(62)->force_pos=1, stay FORCING;
+    //       i=7 apply()->forces token[1]=101, accept(63)->force_pos=2->DONE.
+    {
+        const std::vector<llama_token> start = {100};
+        const std::vector<llama_token> end = {101};
+        const std::vector<llama_token> forced = {102, 101};
+        const std::vector<llama_token> sequence = {100, 50, 101, 100, 60, 61, 62, 63};
+
+        test_reasoning_budget("multi-block re-arms budget after DONE", sequence, start, end, forced,
+            2,      // budget of 2 tokens (per block)
+            REASONING_BUDGET_IDLE,
+            6,      // forcing starts at i=6 (after second block exhausts at i=5)
+            7);     // forcing continues through i=7
+    }
+
+    printf("OK (6 tests passed)\n");
+
+    printf("Testing UTF-8 boundary detection... ");
+    test_utf8_boundary_detection();
+    printf("OK\n");
+
+    return 0;
+}
--- a/tests/test-regex-partial.cpp
+++ b/tests/test-regex-partial.cpp
@@ -0,0 +1,288 @@
+//  Tests common_regex (esp. its partial final matches support).
+
+#include "common.h"
+#include "regex-partial.h"
+
+#include <sstream>
+#include <iostream>
+#include <optional>
+
+template <class T> static void assert_equals(const T & expected, const T & actual) {
+    if (expected != actual) {
+        std::cerr << "Expected: " << expected << std::endl;
+        std::cerr << "  Actual: " << actual << std::endl;
+        std::cerr << std::flush;
+        throw std::runtime_error("Test failed");
+    }
+}
+
+struct test_case {
+    std::string pattern;
+    struct input_output {
+        std::string input;
+        common_regex_match output;
+    };
+    std::vector<input_output> inputs_outputs;
+};
+
+static std::string common_regex_match_type_name(common_regex_match_type type) {
+    switch (type) {
+        case COMMON_REGEX_MATCH_TYPE_NONE:
+            return "COMMON_REGEX_MATCH_TYPE_NONE";
+        case COMMON_REGEX_MATCH_TYPE_PARTIAL:
+            return "COMMON_REGEX_MATCH_TYPE_PARTIAL";
+        case COMMON_REGEX_MATCH_TYPE_FULL:
+            return "COMMON_REGEX_MATCH_TYPE_FULL";
+    }
+    return "?";
+}
+
+static void test_regex() {
+    printf("[%s]\n", __func__);
+    auto test = [](const test_case & test_case) {
+        common_regex cr(test_case.pattern);
+        std::cout << "Testing pattern: /" << test_case.pattern << "/\n";
+        // std::cout << "    partial rev: " << cr.reversed_partial_pattern.str() << '\n';
+        for (const auto & input_output : test_case.inputs_outputs) {
+            std::cout << "  Input: " << input_output.input << '\n';
+            auto m = cr.search(input_output.input, 0);
+            if (m != input_output.output) {
+                auto match_to_str = [&](const std::optional<common_regex_match> & m) {
+                    std::ostringstream ss;
+                    if (m->type == COMMON_REGEX_MATCH_TYPE_NONE) {
+                        ss << "<no match>";
+                    } else {
+                        GGML_ASSERT(!input_output.output.groups.empty());
+                        std::vector<std::string> parts;
+                        for (const auto & g : m->groups) {
+                            parts.push_back("{" + std::to_string(g.begin) + ", " + std::to_string(g.end) + "}");
+                        }
+                        ss << "{" << common_regex_match_type_name(m->type) << ", {" << string_join(parts, ", ") << "}}";
+                    }
+                    return ss.str();
+                };
+                std::cout << "    Expected: " << match_to_str(input_output.output) << '\n';
+                std::cout << "         Got: " << match_to_str(m) << '\n';
+                std::cout << " Inverted pattern: /" << regex_to_reversed_partial_regex(test_case.pattern) << "/\n";
+
+                throw std::runtime_error("Test failed");
+            }
+        }
+    };
+    test({
+        "a",
+        {
+            {"a", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
+            {"b", {COMMON_REGEX_MATCH_TYPE_NONE, {}}},
+            {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
+            {"ba", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 2}}}},
+        }
+    });
+    test({
+        "abcd",
+        {
+            {"abcd", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
+            {"abcde", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
+            {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
+            {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
+            {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
+            {"d", {}},
+            {"bcd", {}},
+            {"cde", {}},
+            {"cd", {}},
+            {"yeah ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{5, 7}}}},
+            {"abbie", {}},
+            {"", {}},
+        }
+    });
+    test({
+        ".*?ab",
+        {
+            {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
+            {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
+            {"dab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
+            {"dabc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
+            {"da", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
+            {"d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
+        }
+    });
+    test({
+        "a.*?b",
+        {
+            {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
+            {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
+            {"a b", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
+            {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
+            {"argh", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
+            {"d", {}},
+            {"b", {}},
+        }
+    });
+    test({
+        "ab(?:cd){2,4}ef",
+        {
+            // {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, 0, {}}},
+            {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
+            {"abcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
+            {"abcde", {}},
+            {"abcdef", {}},
+            {"abcdcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
+            {"abcdcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 7}}}},
+            {"abcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
+            {"abcdcdcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 12}}}},
+            {"abcdcdcdcdcdef", {}},
+            {"abcde", {}},
+            {"yea", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{2, 3}}}},
+        }
+    });
+    test({
+        "a(?:rte| pure )fact",
+        {
+            {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
+            {"art", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
+            {"artefa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
+            {"fact", {}},
+            {"an arte", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{3, 7}}}},
+            {"artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
+            {"an artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{3, 11}}}},
+            {"a pure", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
+            {"a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 11}}}},
+            {"it's a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{5, 16}}}},
+            {"" , {}},
+            {"pure", {}},
+            {"pure fact", {}},
+        }
+    });
+    test({
+        "abc",
+        {
+            {" abcc", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 4}}}},
+            {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
+            {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
+            {" ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{1, 3}}}},
+            {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
+            {"b", {}},
+            {"c", {}},
+            {"", {}},
+        }
+    });
+
+    test({
+        "(?:abc)?\\s*def",
+        {
+            {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
+            {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
+            {"abc ", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
+            {"abc d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
+            {"abc de", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
+            {"abc def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
+            {"abc defg", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
+            {"abc defgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
+            {"abcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
+            {"abcdefgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 6}}}},
+            {" d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
+            {"def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
+        }
+    });
+
+    test({
+        "a+b",
+        {
+            {"aaab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
+            {"aaa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
+            {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
+        }
+    });
+
+    test({
+        "(?:"
+            "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
+            "("                          // match 2 (open_tag)
+                "<tool_call>"
+                "|<function_call>"
+                "|<tool>"
+                "|<tools>"
+                "|<response>"
+                "|<json>"
+                "|<xml>"
+                "|<JSON>"
+            ")?"
+            "(\\s*\\{\\s*\"name\"\\s*:)" // match 3 (named tool call)
+        ")"
+        "|<function=([^>]+)>"            // match 4 (function name)
+        "|<function name=\"([^\"]+)\">", // match 5 (function name again)
+        {
+            {"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}, {54, 54}, {54, 54}, {0, 8}, {54, 54}, {54, 54}}}},
+            {"<tool_call> {\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 18}}}},
+            {"<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 17}}}},
+            {"Let's call something\n<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{21, 38}}}},
+            {"Ok then<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 24}}}},
+            {"{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
+            {"Ok then{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 13}}}},
+            {"<tool_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 20}, {66, 66}, {0, 11}, {11, 20}, {66, 66}, {66, 66}}}},
+            {"<function_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 24}, {70, 70}, {0, 15}, {15, 24}, {70, 70}, {70, 70}}}},
+            {"<function name=\"special_function\"> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 34}, {89, 89}, {89, 89}, {89, 89}, {89, 89}, {16, 32}}}},
+            {"<function=all>", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 14}, {14, 14}, {14, 14}, {14, 14}, {10, 13}, {14, 14}}}},
+
+        }
+    });
+}
+
+static void test_regex_to_reversed_partial_regex() {
+    printf("[%s]\n", __func__);
+
+    assert_equals<std::string>(
+        "^((?:(?:c)?b)?a)",
+        regex_to_reversed_partial_regex("abc"));
+
+    assert_equals<std::string>(
+        "^(a+)",
+        regex_to_reversed_partial_regex("a+"));
+
+    assert_equals<std::string>(
+        "^(a*)",
+        regex_to_reversed_partial_regex("a*"));
+
+    assert_equals<std::string>(
+        "^(a?)",
+        regex_to_reversed_partial_regex("a?"));
+
+    assert_equals<std::string>(
+        "^([a-z])",
+        regex_to_reversed_partial_regex("[a-z]"));
+
+    assert_equals<std::string>(
+        "^((?:\\w+)?[a-z])",
+        regex_to_reversed_partial_regex("[a-z]\\w+"));
+
+    assert_equals<std::string>(
+        "^((?:a|b))",
+        regex_to_reversed_partial_regex("(?:a|b)"));
+    assert_equals<std::string>(
+        "^((?:(?:(?:d)?c)?b)?a)",
+        regex_to_reversed_partial_regex("abcd"));
+    assert_equals<std::string>(
+        "^((?:b)?a*)", // TODO: ((?:b)?a*+).* ??
+        regex_to_reversed_partial_regex("a*b"));
+    assert_equals<std::string>(
+        "^((?:(?:b)?a)?.*)",
+        regex_to_reversed_partial_regex(".*?ab"));
+    assert_equals<std::string>(
+        "^((?:(?:b)?.*)?a)",
+        regex_to_reversed_partial_regex("a.*?b"));
+    assert_equals<std::string>(
+        "^((?:(?:d)?(?:(?:c)?b))?a)",
+        regex_to_reversed_partial_regex("a(bc)d"));
+    assert_equals<std::string>(
+        "^((?:(?:(?:c)?b|(?:e)?d))?a)",
+        regex_to_reversed_partial_regex("a(bc|de)"));
+    assert_equals<std::string>(
+        "^((?:(?:(?:(?:(?:c)?b?)?b?)?b)?b)?a)",
+        regex_to_reversed_partial_regex("ab{2,4}c"));
+}
+
+int main() {
+    test_regex_to_reversed_partial_regex();
+    test_regex();
+    std::cout << "All tests passed.\n";
+}
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -0,0 +1,263 @@
+#include "ggml.h"
+#include "ggml-cpu.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#define MAX_NARGS 3
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define GGML_SILU_FP16
+
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+static float frand(void) {
+    return (float)rand()/(float)RAND_MAX;
+}
+
+static int irand(int n) {
+    if (n == 0) return 0;
+    return rand()%n;
+}
+
+static void get_random_dims(int64_t * dims, int ndims) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = 1 + irand(4);
+    }
+}
+
+static struct ggml_tensor * get_random_tensor_f32(
+        struct ggml_context * ctx0,
+        int ndims,
+        const int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+int main(int /*argc*/, const char ** /*argv*/) {
+    struct ggml_init_params params = {
+        /* .mem_size   = */ 128*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+
+    std::vector<uint8_t> work_buffer;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    struct ggml_tensor * x;
+
+    // rope f32
+    for (int m = 0; m < 5; ++m) {
+        const int ndims = 4;
+
+        const int64_t n_rot = 128;
+        const int64_t ne[4] = { 2*n_rot, 32, 73, 1 };
+
+        const int n_past_0 = 100;
+        const int n_past_2 = 33;
+
+        struct ggml_tensor * r0;
+        struct ggml_tensor * r1;
+        struct ggml_tensor * r2;
+        x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+        int mode = -1;
+
+        if (m < 2) {
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+
+            for (int i = 0; i < ne[2]; ++i) {
+                ((int32_t *) p0->data)[i] = n_past_0 + i;
+                ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
+                ((int32_t *) p2->data)[i] = n_past_2 + i;
+            }
+            // test mode 0, 2  (standard, GPT-NeoX)
+            mode = m == 0 ? GGML_ROPE_TYPE_NORMAL : GGML_ROPE_TYPE_NEOX;
+
+            // 100, 101, 102, ..., 172
+            r0 = ggml_rope(ctx0, x,  p0, n_rot, mode);
+            // -67, -67, -67, ..., -67
+            r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+
+            //  33,  34,  35, ..., 105
+            r2 = ggml_rope(ctx0, x,  p2, n_rot, mode);
+        } else {
+            // testing multi-dimension rope position embedding mode
+            struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+            struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+
+            int sections[4] = {16, 24, 24, 0};
+
+            mode = (m == 2) ? GGML_ROPE_TYPE_MROPE : (m == 3) ? GGML_ROPE_TYPE_VISION : GGML_ROPE_TYPE_IMROPE;
+
+            for (int i = 0; i < ne[2]; ++i) {
+                for (int j = 0; j < 4; ++j) {
+                    ((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
+                    ((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
+                    ((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
+                }
+            }
+
+            // [[100, 101, 102, ..., 172],
+            // [101, 102, 103, ..., 173],
+            // [102, 103, 104, ..., 174]]
+            r0 = ggml_rope_multi(
+                ctx0, x, p0, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+            // [[-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]
+            // [-67, -67, -67, ..., -67]]
+            r1 = ggml_rope_multi(
+                ctx0, r0, p1, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+
+            //  [[33,  34,  35, ..., 105]
+            //  [34,  35,  36, ..., 106]
+            //  [35,  36,  37, ..., 107]]
+            r2 = ggml_rope_multi(
+                ctx0, x, p2, nullptr,
+                n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+        }
+
+        ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        ggml_build_forward_expand(gf, r0);
+        ggml_build_forward_expand(gf, r1);
+        ggml_build_forward_expand(gf, r2);
+
+        ggml_graph_compute_helper(work_buffer, gf, 4);
+
+        // check that r1 and r2 are the same
+        {
+            double sum0 = 0.0f;
+            double sum1 = 0.0f;
+            double diff = 0.0f;
+
+            const float * r1_data = (float *) r1->data;
+            const float * r2_data = (float *) r2->data;
+
+            const int n_elements = ggml_nelements(r1);
+
+            for (int i = 0; i < n_elements; ++i) {
+                sum0 += fabs(r1_data[i]);
+                sum1 += fabs(r2_data[i]);
+                diff += fabs(r1_data[i] - r2_data[i]);
+                //if (fabs(r1_data[i] - r2_data[i]) > 0.0001f) {
+                //    printf("%d: %f %f\n", i, r1_data[i], r2_data[i]);
+                //    printf("diff: %f\n", fabs(r1_data[i] - r2_data[i]));
+                //}
+            }
+
+            //for (int i = 4096; i < 4096 + 128; ++i) {
+            //    printf("%f %f\n", r1_data[i], r2_data[i]);
+            //}
+
+            printf("mode: %d\n", mode);
+            printf("sum0: %f\n", sum0);
+            printf("sum1: %f\n", sum1);
+            printf("diff: %f\n", diff);
+            printf("rel err: %f\n", diff / sum0);
+            printf("rel err: %f\n", diff / sum1);
+
+            GGML_ASSERT(diff / sum0 < 0.0001f);
+            GGML_ASSERT(diff / sum1 < 0.0001f);
+        }
+    }
+
+    ggml_free(ctx0);
+
+    return 0;
+}
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -0,0 +1,400 @@
+#include "ggml.h"
+#include "llama.h"
+
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+
+extern struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers);
+
+static void dump(const llama_token_data_array * cur_p) {
+    for (size_t i = 0; i < cur_p->size; i++) {
+        printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+    }
+}
+
+#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
+
+struct sampler_tester {
+    sampler_tester(size_t n_vocab) {
+        cur.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+            const float logit = logf(token_id);
+            cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        }
+
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+    }
+
+    sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
+        cur.reserve(probs.size());
+        for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) {
+            const float logit = logf(probs[token_id]);
+            cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
+        }
+
+        cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
+    }
+
+    void apply(llama_sampler * sampler) {
+        llama_sampler_apply(sampler, &cur_p);
+        llama_sampler_free(sampler);
+    }
+
+    void check() {
+        GGML_ASSERT(cur_p.size == probs_expected.size());
+        for (size_t i = 0; i < cur_p.size; i++) {
+            GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < 1e-5);
+        }
+    }
+
+    llama_token_data_array cur_p;
+
+private:
+    const std::vector<float> probs_expected;
+
+    std::vector<llama_token_data> cur;
+};
+
+static void test_temp(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp(temp));
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_top_k(const std::vector<float> & probs, const std::vector<float> & probs_expected, int k) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_k(k));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_top_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_p(p, 0));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_min_p(p, 0));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_xtc(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p, float t) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_xtc(p, t, 0, 0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_typical(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_typical(p, 0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_penalties(
+    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
+    const std::vector<float> & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence
+) {
+    GGML_ASSERT(probs.size() == probs_expected.size());
+
+    sampler_tester tester(probs, probs_expected);
+
+    auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
+
+    for (size_t i = 0; i < last_tokens.size(); i++) {
+        llama_sampler_accept(sampler, last_tokens[i]);
+    }
+
+    DUMP(&tester.cur_p);
+    tester.apply(sampler);
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_dry(
+    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
+    const std::vector<float> & expected_probs, float dry_multiplier, float dry_base,
+    int dry_allowed_length, int dry_penalty_last_n,
+    const std::vector<std::vector<llama_token>> & seq_breakers
+) {
+    GGML_ASSERT(probs.size() == expected_probs.size());
+
+    sampler_tester tester(probs, expected_probs);
+
+    auto * sampler = llama_sampler_init_dry_testing(1024, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers);
+
+    for (size_t i = 0; i < last_tokens.size(); i++) {
+        llama_sampler_accept(sampler, last_tokens[i]);
+    }
+
+    DUMP(&tester.cur_p);
+    tester.apply(sampler);
+    tester.apply(llama_sampler_init_dist(0));
+    DUMP(&tester.cur_p);
+    tester.check();
+}
+
+static void test_top_n_sigma(const std::vector<float> & probs, const std::vector<float> & probs_expected, int n) {
+    sampler_tester tester(probs, probs_expected);
+
+    DUMP(&tester.cur_p);
+    tester.apply(llama_sampler_init_top_n_sigma(n));
+    tester.apply(llama_sampler_init_dist (0));
+    DUMP(&tester.cur_p);
+
+    tester.check();
+}
+
+static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
+) {
+    sampler_tester tester(n_vocab);
+
+          llama_token min_token_id = 0;
+    const llama_token max_token_id = n_vocab - 1;
+
+    for (auto s : samplers_sequence) {
+        switch (s) {
+            case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
+            case 'y': GGML_ABORT("typical test not implemented");
+            case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
+            case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break;
+            case 't': GGML_ABORT("temperature test not implemented");
+            default : GGML_ABORT("Unknown sampler");
+        }
+
+        tester.apply(llama_sampler_init_dist(0));
+
+        auto & cur_p = tester.cur_p;
+
+        const int size = cur_p.size;
+
+        if (s == 'k') {
+            const int expected_size = std::min(size, top_k);
+            min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
+
+            GGML_ASSERT(size == expected_size);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
+        } else if (s == 'p') {
+            const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
+            const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
+
+                min_token_id  = n_vocab;
+            int expected_size = 0;
+            int cumsum        = 0;
+            do { // do-while because always at least one token is sampled
+                min_token_id--;
+                expected_size++;
+
+                cumsum += min_token_id;
+            } while (cumsum < softmax_numerator_target);
+
+            // token 0 has p == 0, need special consideration for cumsum because top_p immediately returns
+            if (min_token_id == 1) {
+                min_token_id--;
+                expected_size += 1;
+            }
+
+            GGML_ASSERT(size == expected_size);
+            GGML_ASSERT(!cur_p.sorted || cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(!cur_p.sorted || cur_p.data[expected_size-1].id == min_token_id);
+        } else if (s == 'm') {
+            int expected_size = ceilf((1.0f - min_p) * n_vocab);
+            expected_size = std::max(expected_size, 1);
+            expected_size = std::min(expected_size, size);
+
+            min_token_id = floorf(min_p * n_vocab);
+            min_token_id = std::max(min_token_id, 1);
+            min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size));
+            min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
+
+            GGML_ASSERT(size == expected_size);
+            GGML_ASSERT(!cur_p.sorted || cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(!cur_p.sorted || cur_p.data[expected_size-1].id == min_token_id);
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    }
+
+    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%5d top_p=%f min_p=%f\n",
+           samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
+}
+
+static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vector<llama_token_data> & data, int n_iter) {
+    std::vector<llama_token_data> cur(data.size());
+    std::copy(data.begin(), data.end(), cur.begin());
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    llama_sampler_apply(cnstr, &cur_p);
+    llama_sampler_reset(cnstr);
+    const int64_t t_start = ggml_time_us();
+    for (int i = 0; i < n_iter; i++) {
+        std::copy(data.begin(), data.end(), cur.begin());
+        llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+        llama_sampler_apply(cnstr, &cur_p);
+        llama_sampler_reset(cnstr);
+    }
+    const int64_t t_end = ggml_time_us();
+    llama_sampler_free(cnstr);
+    printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+}
+
+#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
+
+static void test_perf() {
+    const int n_vocab = 1 << 17;
+
+    std::vector<llama_token_data> data;
+
+    data.reserve(n_vocab);
+    for (int i = 0; i < n_vocab; i++) {
+        const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5);
+        data.emplace_back(llama_token_data{i, logit, 0.0f});
+    }
+
+    BENCH(llama_sampler_init_top_k  (40),                     data, 32);
+    BENCH(llama_sampler_init_top_p  (0.8f, 1),                data, 32);
+    BENCH(llama_sampler_init_min_p  (0.2f, 1),                data, 32);
+    BENCH(llama_sampler_init_typical(0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_xtc    (1.0f, 0.1f, 1, 1),       data, 32);
+}
+
+int main(void) {
+    ggml_time_init();
+
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f);
+    test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.0f, 0.0f, 0.0f, 1.0f}, 0.0f);
+
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f, 0.0f, 1.0f);
+    test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.0f, 0.0f, 0.0f, 1.0f}, 0.0f, 0.0f, 1.0f);
+
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
+    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 0);
+
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f);
+    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 1.0f);
+
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f/1.0f, 0.2f/1.0f, 0.3f/1.0f, 0.4f/1.0f}, 0.00f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f/1.0f, 0.2f/1.0f, 0.3f/1.0f, 0.4f/1.0f}, 0.24f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.2f/0.9f, 0.3f/0.9f, 0.4f/0.9f},            0.26f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.2f/0.9f, 0.3f/0.9f, 0.4f/0.9f},            0.49f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.3f/0.7f, 0.4f/0.7f},                       0.51f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.3f/0.7f, 0.4f/0.7f},                       0.74f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  0.76f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  1.00f);
+    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  1.05f);
+
+    printf("XTC should:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.1f},                                0.99f, 0.09f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.2f, 0.1f},                          0.99f, 0.19f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.3f, 0.2f, 0.1f},                    0.99f, 0.29f);
+
+    printf("XTC should not:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.4f, 0.3f, 0.2f, 0.1f},              0.99f, 0.39f);
+
+    test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f},            0.5f);
+    test_typical({0.4f, 0.2f, 0.2f, 0.2f},     {0.2f, 0.2f, 0.2f}, 0.5f);
+
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0, 0.25f, 0.25f, 0.25f, 0.25f},   50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0, 0, 0, 0.5f, 0.5f},       50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0, 0, 0, 0.5f, 0.5f}, 50.0f, 0.0f, 0.0f);
+
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.000011f, 0.249997f, 0.249997f, 0.249997f, 0.249997f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.000023f, 0.000023f, 0.000023f, 0.499966f, 0.499966f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.000000f, 0.000023f, 0.000023f, 0.499977f, 0.499977f}, 1.0f, 5.0f, 5.0f);
+
+
+    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1}, {0.25f, 0.25f, 0.25f, 0.25f}, 1.0f, 1.1f, 2, 4, {});
+    test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.109232f, 0.296923f}, 1.0f, 1.1f, 2, 5, {});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 2, 6, {{3}});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.032727f, 0.241818f, 0.241818f}, 2.0f, 1.1f, 2, 5, {});
+    test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {});
+
+    test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f, 0.0f, 0.0f}, 1.00f);
+    test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 0.00f); // top_n_sigma == 0 now represents a no-op rather than greedy decoding as of PR#13345
+    test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 3.00f);
+
+    test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
+    test_sampler_queue(10000, "k",     1, 1.0f, 1.0f);
+    test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);
+    test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f);
+    test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f);
+    test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12);
+
+    test_sampler_queue(10000, "k",   100, 1.0000f, 1.0f);
+    test_sampler_queue(10000, "p", 10000, 0.0003f, 1.0f);
+    test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f);
+    test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f);
+    test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f);
+
+    test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "km", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f);
+    test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f);
+
+    test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f);
+    test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f);
+
+    printf("OK\n");
+
+    test_perf();
+
+    return 0;
+}
--- a/tests/test-state-restore-fragmented.cpp
+++ b/tests/test-state-restore-fragmented.cpp
@@ -0,0 +1,125 @@
+// Test for state restore with fragmented KV cache
+// This tests the fix for: https://github.com/ggml-org/llama.cpp/issues/17527
+// The issue was that state restore required contiguous KV cache slots,
+// which fails when the cache is fragmented.
+//
+// The fix changes find_slot(ubatch, true) to find_slot(ubatch, false)
+// in state_read_meta(), allowing non-contiguous slot allocation.
+
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+
+#include <vector>
+#include <cstdio>
+#include <cstring>
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.sampling.seed = 1234;
+    params.kv_unified = true;
+    params.n_parallel = 3;
+    params.n_ctx = 256;
+
+    common_init();
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    // init
+
+    ggml_backend_load_all();
+
+    common_init_result_ptr llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init->model();
+    llama_context * ctx = llama_init->context();
+
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    GGML_UNUSED(model);
+
+    // tokenize prompt
+    std::vector<llama_token> tokens(70, 1);
+
+    // interleave the 3 sequences:
+    // 01201230123...
+    llama_batch batch = llama_batch_init(params.n_parallel*tokens.size(), 0, 1);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        for (int s = 0; s < params.n_parallel; ++s) {
+            common_batch_add(batch, tokens[i], i, {s}, false);
+        }
+    }
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch)) {
+        fprintf(stderr, "%s : failed to decode seq 0\n", __func__);
+        return 1;
+    }
+
+    fprintf(stderr, "%s : processed prompt on seq 0, 1, 2 (%zu tokens each)\n", __func__, tokens.size());
+
+    // Save state of seq 1
+    std::vector<uint8_t> seq_state(llama_state_seq_get_size(ctx, 1));
+    const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 1);
+    if (ncopy != seq_state.size()) {
+        fprintf(stderr, "%s : failed to save seq 1 state\n", __func__);
+        return 1;
+    }
+    fprintf(stderr, "%s : saved seq 1 state, %zu bytes\n", __func__, ncopy);
+
+    // clear seq 1 to create a "hole" in the KV cache (fragmentation)
+    // 0.20.20.20.2....
+    llama_memory_t mem = llama_get_memory(ctx);
+    llama_memory_seq_rm(mem, 1, -1, -1);
+    fprintf(stderr, "%s : cleared seq 1 to create fragmentation\n", __func__);
+
+    // Now the cache has holes where seq 1 was
+    // This creates fragmentation - there's no contiguous block large enough
+    // for the seq 1 state if we only look for contiguous slots
+
+    // Restore seq 1 state into seq 1 (should work with non-contiguous allocation)
+    // We use seq 1 since it's a valid sequence ID (0 to n_parallel-1)
+    // Before the fix, this would fail with "failed to find available cells in kv cache"
+    const size_t nset = llama_state_seq_set_data(ctx, seq_state.data(), seq_state.size(), 1);
+    if (nset != seq_state.size()) {
+        fprintf(stderr, "%s : FAILED to restore seq state into fragmented cache (got %zu, expected %zu)\n",
+                __func__, nset, seq_state.size());
+        fprintf(stderr, "%s : This is the bug - state restore fails with fragmented KV cache\n", __func__);
+        llama_batch_free(batch);
+        return 1;
+    }
+    fprintf(stderr, "%s : restored state into seq 1, %zu bytes\n", __func__, nset);
+
+    // Verify we can decode with the restored state
+    // Generate one token to verify the restored state is usable
+    auto sparams = llama_sampler_chain_default_params();
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
+
+    auto next_token = llama_sampler_sample(smpl, ctx, -1);
+    auto next_token_str = common_token_to_piece(ctx, next_token);
+
+    common_batch_clear(batch);
+    common_batch_add(batch, next_token, (int)tokens.size(), {1}, true);
+
+    if (llama_decode(ctx, batch)) {
+        fprintf(stderr, "%s : failed to decode with restored state\n", __func__);
+        llama_sampler_free(smpl);
+        llama_batch_free(batch);
+        return 1;
+    }
+
+    fprintf(stderr, "%s : successfully decoded with restored state, generated: '%s'\n", __func__, next_token_str.c_str());
+    fprintf(stderr, "%s : SUCCESS - state restore works with fragmented KV cache\n", __func__);
+
+    llama_sampler_free(smpl);
+    llama_batch_free(batch);
+
+    return 0;
+}
--- a/tests/test-thread-safety.cpp
+++ b/tests/test-thread-safety.cpp
@@ -0,0 +1,164 @@
+// thread safety test
+// - Loads a copy of the same model on each GPU, plus a copy on the CPU
+// - Creates n_parallel (--parallel) contexts per model
+// - Runs inference in parallel on each context
+
+#include <array>
+#include <thread>
+#include <vector>
+#include <atomic>
+#include "llama.h"
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "sampling.h"
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    common_init();
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+
+    //llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+    //    if (level == GGML_LOG_LEVEL_ERROR) {
+    //        common_log_add(common_log_main(), level, "%s", text);
+    //    }
+    //}, NULL);
+
+    auto cparams = common_context_params_to_llama(params);
+
+    // each context has a single sequence
+    cparams.n_seq_max = 1;
+
+    int dev_count = ggml_backend_dev_count();
+    std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
+    for (int i = 0; i < dev_count; ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpus.push_back({dev, nullptr});
+        }
+    }
+    const int gpu_dev_count = (int)gpus.size();
+    const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
+    //const int num_models = std::max(1, gpu_dev_count);
+    const int num_contexts = std::max(1, params.n_parallel);
+
+    std::vector<llama_model_ptr> models;
+    std::vector<std::thread> threads;
+    std::atomic<bool> failed = false;
+
+    for (int m = 0; m < num_models; ++m) {
+        auto mparams = common_model_params_to_llama(params);
+
+        if (m < gpu_dev_count) {
+            mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
+            mparams.devices = gpus[m].data();
+        } else if (m == gpu_dev_count) {
+            mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
+            mparams.main_gpu = -1; // CPU model
+        } else {
+            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
+        }
+
+        llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+        if (model == NULL) {
+            LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+            return 1;
+        }
+
+        models.emplace_back(model);
+    }
+
+    for  (int m = 0; m < num_models; ++m) {
+        auto * model = models[m].get();
+        for (int c = 0; c < num_contexts; ++c) {
+            threads.emplace_back([&, m, c, model]() {
+                LOG_INF("Creating context %d/%d for model %d/%d\n", c + 1, num_contexts, m + 1, num_models);
+
+                llama_context_ptr ctx { llama_init_from_model(model, cparams) };
+                if (ctx == NULL) {
+                    LOG_ERR("failed to create context\n");
+                    failed.store(true);
+                    return;
+                }
+
+                std::unique_ptr<common_sampler, decltype(&common_sampler_free)> sampler { common_sampler_init(model, params.sampling), common_sampler_free };
+                if (sampler == NULL) {
+                    LOG_ERR("failed to create sampler\n");
+                    failed.store(true);
+                    return;
+                }
+
+                llama_batch batch = {};
+                {
+                    auto prompt = common_tokenize(ctx.get(), params.prompt, true);
+                    if (prompt.empty()) {
+                        LOG_ERR("failed to tokenize prompt\n");
+                        failed.store(true);
+                        return;
+                    }
+                    batch = llama_batch_get_one(prompt.data(), prompt.size());
+                    if (llama_decode(ctx.get(), batch)) {
+                        LOG_ERR("failed to decode prompt\n");
+                        failed.store(true);
+                        return;
+                    }
+                }
+
+                const auto * vocab = llama_model_get_vocab(model);
+                std::string result = params.prompt;
+
+                for (int i = 0; i < params.n_predict; i++) {
+                    llama_token token;
+                    if (batch.n_tokens > 0) {
+                        token = common_sampler_sample(sampler.get(), ctx.get(), batch.n_tokens - 1);
+                    } else {
+                        token = llama_vocab_bos(vocab);
+                    }
+
+                    result += common_token_to_piece(ctx.get(), token);
+
+                    if (llama_vocab_is_eog(vocab, token)) {
+                        break;
+                    }
+
+                    batch = llama_batch_get_one(&token, 1);
+
+                    int ret = llama_decode(ctx.get(), batch);
+                    if (ret == 1 && i > 0) {
+                        LOG_INF("Context full, stopping generation.\n");
+                        break;
+                    }
+
+                    if (ret != 0) {
+                        LOG_ERR("Model %d/%d, Context %d/%d: failed to decode\n", m + 1, num_models, c + 1, num_contexts);
+                        failed.store(true);
+                        return;
+                    }
+                }
+
+                LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str());
+            });
+        }
+    }
+
+    for (auto & thread : threads) {
+        thread.join();
+    }
+
+    if (failed) {
+        LOG_ERR("One or more threads failed.\n");
+        return 1;
+    }
+
+    LOG_INF("All threads finished without errors.\n");
+    return 0;
+}
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -0,0 +1,312 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include <cstdio>
+#include <string>
+#include <map>
+#include <vector>
+#include <fstream>
+#include <thread>
+
+//static const std::map<std::string, std::vector<llama_token>> & k_tests() {
+//    static std::map<std::string, std::vector<llama_token>> _k_tests = {
+//        { ""                      , {  }, },
+//        { " "                     , {     220, }, },
+//        { "  "                    , {     256, }, },
+//        { "   "                   , {     262, }, },
+//        { "\t"                    , {     197, }, },
+//        { "\n"                    , {     198, }, },
+//        { "\n\n"                  , {     271, }, },
+//        { "\n\n\n"                , {    1432, }, },
+//        { "\t\n"                  , {    1602, }, },
+//        { "Hello world"           , {    9906,   1917, }, },
+//        { " Hello world"          , {   22691,   1917, }, },
+//        { "Hello World"           , {    9906,   4435, }, },
+//        { " Hello World"          , {   22691,   4435, }, },
+//        { " Hello World!"         , {   22691,   4435,      0, }, },
+//        { "Hello, world!"         , {    9906,     11,   1917,      0, }, },
+//        { " Hello, world!"        , {   22691,     11,   1917,      0, }, },
+//        { " this is 🦙.cpp"        , {     420,    374,  11410,     99,    247,     13,  11055, }, },
+//        { "w048 7tuijk dsdfhu"    , {      86,  23904,    220,     22,     83,   2005,  42908,  11729,   3013,  17156, }, },
+//        { "нещо на Български"     , {   79862, 102118,  13373,  64571,  34694,   3114, 112203,  80112, }, },
+//        { "កាន់តែពិសេសអាចខលចេញ"   , {   21549,    222,  98629,    241,  45358,    233,  21549,    237,  45358,    224,  21549,    244,  21549,    115,  21549,    253,  45358,    223,  21549,    253,  21549,     95,  98629,    227,  21549,    223,  21549,    249,  21549,    227,  45358,    223,  21549,    231, }, },
+//        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {    9468,    248,    222,    320,   8416,      8,  27623,    114, 102470,   9468,    234,    104,  31643,    320,  36773, 100166,  98634,      8,  26602,    227,    320,   3323,  43465,    430,    706,   1202,   1866,   4037,      8, }, },
+//        { "Hello"                 , {    9906, }, },
+//        { " Hello"                , {   22691, }, },
+//        { "  Hello"               , {     220,  22691, }, },
+//        { "   Hello"              , {     256,  22691, }, },
+//        { "    Hello"             , {     262,  22691, }, },
+//        { "    Hello\n    Hello"  , {     262,  22691,    198,    262,  22691, }, },
+//        { " ("                    , {     320, }, },
+//        { "\n ="                  , {     198,    284, }, },
+//        { "' era"                 , {       6,  11639, }, },
+//        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～", {    9906,     11,    379,  65948,      0,   2650,    527,    499,  27623,    223,    949,  37046, 101067,  19000,  23182, 102301,   9263,  18136,     16,  36827,  21909, }, },
+//        { "3"                     , {      18, }, },
+//        { "33"                    , {    1644, }, },
+//        { "333"                   , {    8765, }, },
+//        { "3333"                  , {    8765,     18, }, },
+//        { "33333"                 , {    8765,   1644, }, },
+//        { "333333"                , {    8765,   8765, }, },
+//        { "3333333"               , {    8765,   8765,     18, }, },
+//        { "33333333"              , {    8765,   8765,   1644, }, },
+//        { "333333333"             , {    8765,   8765,   8765, }, },
+//    };
+//
+//    return _k_tests;
+//}
+
+using llama_tests = std::map<std::string, std::vector<llama_token>>;
+
+static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
+    llama_tests tests;
+
+    std::ifstream ifs_inp(fname_inp);
+    if (!ifs_inp) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
+        return tests;
+    }
+
+    std::string sraw((std::istreambuf_iterator<char>(ifs_inp)), std::istreambuf_iterator<char>());
+
+    std::ifstream ifs_out(fname_out);
+    if (!ifs_out) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+        return tests;
+    }
+
+    std::vector<std::string> sout;
+    for (std::string line; std::getline(ifs_out, line);) {
+        sout.push_back(line);
+    }
+
+    const std::string sep = "\n__ggml_vocab_test__\n";
+
+    std::vector<std::string> sinp;
+
+    size_t pos = 0;
+    while (pos < sraw.size()) {
+        const size_t next = sraw.find(sep, pos);
+        if (next == std::string::npos) {
+            sinp.push_back(sraw.substr(pos));
+            break;
+        }
+        sinp.push_back(sraw.substr(pos, next - pos));
+        pos = next + sep.size();
+    }
+
+    if (sinp.size() != sout.size()) {
+        fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
+        return tests;
+    }
+
+    for (size_t i = 0; i < sinp.size(); ++i) {
+        const std::string & s = sinp[i];
+        const std::string & o = string_strip(sout[i]);
+
+        std::vector<llama_token> toks;
+
+        size_t pos = 0;
+        while (pos < o.size()) {
+            size_t next = o.find(' ', pos);
+            if (next == std::string::npos) {
+                next = o.size();
+            }
+            const std::string stok = o.substr(pos, next - pos);
+            toks.push_back(std::stoi(stok));
+            pos = next + 1;
+        }
+
+        tests[s] = toks;
+    }
+
+    return tests;
+}
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    const std::string fname_inp = fname + ".inp";
+    const std::string fname_out = fname + ".out";
+
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init();
+
+    // load the vocab
+    {
+        auto mparams = llama_model_default_params();
+
+        mparams.vocab_only = true;
+
+        model = llama_model_load_from_file(fname.c_str(), mparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_init_from_model(model, cparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_model_free(model);
+            return 1;
+        }
+    }
+
+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
+    bool success = true;
+
+    const auto k_tests = [&]() -> llama_tests {
+        if (!fname_text.empty()) {
+            return {};
+        }
+
+        const auto res = read_tests(fname_inp, fname_out);
+
+        if (res.empty()) {
+            fprintf(stderr, "%s : error: no tests found\n", __func__);
+            exit(1);
+        }
+
+        return res;
+    }();
+
+    const bool add_special = false;
+
+    // multi-threaded tokenization
+    const int nthread = std::thread::hardware_concurrency();
+    std::vector<std::thread> threads(nthread);
+
+    for (int i = 0; i < nthread; i++) {
+        threads[i] = std::thread([&, i]() {
+            for (const auto & test_kv : k_tests) {
+                const std::vector<llama_token> res = common_tokenize(ctx, test_kv.first, add_special, false);
+
+                // here only print the result of the first thread
+                // because the other threads are running the same tests
+                if (i != 0) {
+                    continue;
+                }
+
+                printf("\n");
+                printf("src: '%s'\n", test_kv.first.c_str());
+                printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
+                printf("tok: ");
+                for (const auto & tok : res) {
+                    printf("%d ", tok);
+                }
+                printf("\n");
+
+                bool correct = res.size() == test_kv.second.size();
+                for (int i = 0; i < (int) res.size() && correct; ++i) {
+                    if (test_kv.second[i] != res[i]) {
+                        correct = false;
+                    }
+                }
+
+                if (!correct) {
+                    fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
+                    fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                        common_detokenize(ctx, res).c_str(),
+                        common_detokenize(ctx, test_kv.second).c_str());
+                    fprintf(stderr, "%s : expected tokens: ", __func__);
+                    for (const auto & t : test_kv.second) {
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+                    fprintf(stderr, "%s : got tokens:      ", __func__);
+                    for (const auto & t : res) {
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
+                    }
+                    fprintf(stderr, "\n");
+
+                    success = false;
+                }
+            }
+        });
+    }
+
+    for (int i = 0; i < nthread; i++) {
+        threads[i].join();
+    }
+
+    // single threaded tokenization
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        std::vector<llama_token> res;
+
+        {
+            const auto t_start = ggml_time_us();
+
+            res = common_tokenize(ctx, text, add_special, false);
+
+            const auto t_end = ggml_time_us();
+
+            fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
+        }
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                //ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector<int>{tok})) << "'" << std::endl;
+                ofs << tok << "\n";
+            }
+        }
+
+        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+
+    llama_backend_free();
+
+    printf("\n");
+    printf("Tests %s\n", success ? "passed" : "failed");
+
+    return success ? 0 : 3;
+}
--- a/tests/test-tokenizer-0.py
+++ b/tests/test-tokenizer-0.py
@@ -0,0 +1,46 @@
+import time
+import argparse
+
+from transformers import AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok",   help="path to a text file to tokenize", required=True)
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+fname_tok = args.fname_tok
+
+tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+
+print('tokenizing file: ', fname_tok) # noqa: NP100
+fname_out = fname_tok + '.tok'
+with open(fname_tok, 'r', encoding='utf-8') as f:
+    lines = f.readlines()
+    s = ''.join(lines)
+    t_start = time.time()
+    res = tokenizer.encode(s, add_special_tokens=False)  # ty: ignore[unresolved-attribute]
+    t_end = time.time()
+    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
+    with open(fname_out, 'w', encoding='utf-8') as f:
+        for x in res:
+            # LLaMA v3 for some reason strips the space for these tokens (and others)
+            # if x == 662:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 1174:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 2564:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 758:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 949:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 5354:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # else:
+            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
+            # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
+            f.write(str(x) + '\n')
+    print('len(res): ', len(res)) # noqa: NP100
+    print('len(lines): ', len(lines)) # noqa: NP100
+print('results written to: ', fname_out) # noqa: NP100
--- a/tests/test-tokenizer-0.sh
+++ b/tests/test-tokenizer-0.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+#
+# Usage:
+#
+#   test-tokenizer-0.sh <name> <input>
+#
+
+if [ $# -ne 2 ]; then
+    printf "Usage: $0 <name> <input>\n"
+    exit 1
+fi
+
+name=$1
+input=$2
+
+# Build using CMake if binary doesn't exist
+if [ ! -f ./build/bin/test-tokenizer-0 ]; then
+    printf "Building test-tokenizer-0 with CMake...\n"
+    cmake -B build -DLLAMA_BUILD_TESTS=ON
+    cmake --build build --target test-tokenizer-0 -j
+fi
+
+printf "Testing %s on %s ...\n" $name $input
+
+set -e
+
+printf "Tokenizing using (py)  Python AutoTokenizer ...\n"
+python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
+
+printf "Tokenizing using (cpp) llama.cpp ...\n"
+./build/bin/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
+
+cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
+cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
+
+set +e
+
+diff $input.tok $input.tokcpp > /dev/null 2>&1
+
+if [ $? -eq 0 ]; then
+    printf "Tokenization is correct!\n"
+else
+    diff $input.tok $input.tokcpp | head -n 32
+
+    printf "Tokenization differs!\n"
+fi
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -0,0 +1,155 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include "../src/unicode.h"
+
+#include <cassert>
+#include <codecvt>
+#include <cstdio>
+#include <cstring>
+#include <locale>
+#include <string>
+#include <thread>
+#include <vector>
+#include <atomic>
+
+int main(int argc, char **argv) {
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init();
+
+    // load the vocab
+    {
+        auto mparams = llama_model_default_params();
+
+        mparams.vocab_only = true;
+
+        model = llama_model_load_from_file(fname.c_str(), mparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_init_from_model(model, cparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_model_free(model);
+            return 1;
+        }
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    //GGML_ASSERT(llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_BPE);
+    if (llama_vocab_type(vocab) != LLAMA_VOCAB_TYPE_BPE) {
+        return 99;
+    }
+
+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i));
+        try {
+            auto cps = unicode_cpts_from_utf8(str);
+            std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+            if (ignore_merges && tokens.size() > 1) {
+                fprintf(stderr,
+                        "%s : error: token %d detokenizes to '%s'(%zu) but "
+                        "tokenization of this to multiple tokens: [",
+                        __func__, i, str.c_str(), str.length());
+                fprintf(stderr, "%d", tokens[0]);
+                for (size_t i = 1; i < tokens.size(); i++) {
+                    fprintf(stderr, ", %d", tokens[i]);
+                }
+                fprintf(stderr, "]\n");
+                return 2;
+            }
+            std::string check = common_detokenize(ctx, tokens);
+            if (check != str) {
+                fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
+                    __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
+                return 2;
+            }
+        }
+        catch (const std::invalid_argument &) {
+            //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
+        }
+    }
+
+    // unicode
+    {
+        const int nthread = std::thread::hardware_concurrency();
+
+        std::vector<std::thread> threads(nthread);
+
+        std::atomic_int errcode = {};
+
+        for (int i = 0; i < nthread; ++i) {
+            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) ||  // surrogates \p{Cs}
+                        (0x00040000 <= cp && cp <= 0x000E0000)) {  // undefined  \p{Cn}
+                        continue;
+                    }
+
+                    std::string str = unicode_cpt_to_utf8(cp);
+                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
+                    std::string check = common_detokenize(ctx, tokens);
+                    if (cp != 9601 && str != check) {
+                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                                cp, check.c_str(), check.length(), str.c_str(), str.length());
+                        errcode = 3;
+                    }
+                }
+            });
+        }
+
+        for (auto & t : threads) {
+            t.join();
+        }
+
+        if (errcode) {
+            return errcode;
+        }
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/tests/test-tokenizer-1-spm.cpp
+++ b/tests/test-tokenizer-1-spm.cpp
@@ -0,0 +1,125 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include "../src/unicode.h"
+
+#include <cassert>
+#include <codecvt>
+#include <cstdio>
+#include <cstring>
+#include <locale>
+#include <string>
+#include <thread>
+#include <vector>
+#include <atomic>
+
+int main(int argc, char ** argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init();
+
+    // load the vocab
+    {
+        auto mparams = llama_model_default_params();
+
+        mparams.vocab_only = true;
+
+        model = llama_model_load_from_file(fname.c_str(), mparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_init_from_model(model, cparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_model_free(model);
+            return 1;
+        }
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+    if (llama_vocab_type(vocab) != LLAMA_VOCAB_TYPE_SPM) {
+        return 99;
+    }
+
+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
+        std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+        std::string check = common_detokenize(ctx, tokens);
+        if (check != str) {
+            fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
+                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
+            return 2;
+        }
+    }
+
+    // unicode
+    {
+        const int nthread = std::thread::hardware_concurrency();
+
+        std::vector<std::thread> threads(nthread);
+
+        std::atomic_int errcode = {};
+
+        for (int i = 0; i < nthread; ++i) {
+            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) ||  // surrogates \p{Cs}
+                        (0x00040000 <= cp && cp <= 0x000E0000)) {  // undefined \p{Cn}
+                        continue;
+                    }
+
+                    std::string str = unicode_cpt_to_utf8(cp);
+                    std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
+                    std::string check = common_detokenize(ctx, tokens);
+                    if (cp != 9601 && str != check) {
+                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                                cp, check.c_str(), check.length(), str.c_str(), str.length());
+                        errcode = 3;
+                    }
+                }
+            });
+        }
+
+        for (auto & t : threads) {
+            t.join();
+        }
+
+        if(errcode) {
+            return errcode;
+        }
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -0,0 +1,565 @@
+# Test libllama tokenizer == AutoTokenizer.
+# Brute force random words/text generation.
+#
+# Sample usage:
+#
+#   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
+#
+
+from __future__ import annotations
+
+import time
+import logging
+import argparse
+import subprocess
+import random
+import unicodedata
+
+from pathlib import Path
+from typing import Any, Iterator
+
+import cffi
+from transformers import AutoTokenizer, PreTrainedTokenizer
+
+
+logger = logging.getLogger("test-tokenizer-random")
+
+
+class LibLlama:
+
+    DEFAULT_PATH_LLAMA_H = "./include/llama.h"
+    DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"]
+    DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
+
+    def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None):
+        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
+        path_includes = path_includes or self.DEFAULT_PATH_INCLUDES
+        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
+        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_includes, path_libllama)
+        self.lib.llama_backend_init()
+
+    def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]:
+        cmd = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
+        cmd += ["-I" + path for path in path_includes] + [path_llama_h]
+        res = subprocess.run(cmd, stdout=subprocess.PIPE)
+        assert (res.returncode == 0)
+        source = res.stdout.decode()
+        ffi = cffi.FFI()
+        if True:  # workarounds for pycparser
+            source = "typedef struct { } __builtin_va_list;" + "\n" + source
+            source = source.replace("sizeof (int)",    str(ffi.sizeof("int")))
+            source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
+            source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
+            source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
+        ffi.cdef(source, override=True)
+        lib = ffi.dlopen(path_libllama)
+        return (ffi, lib)
+
+    def model_default_params(self, **kwargs):
+        mparams = self.lib.llama_model_default_params()
+        for k, v in kwargs.items():
+            setattr(mparams, k, v)
+        return mparams
+
+    def context_default_params(self, **kwargs):
+        cparams = self.lib.llama_context_default_params()
+        for k, v in kwargs.items():
+            setattr(cparams, k, v)
+        return cparams
+
+
+class LibLlamaModel:
+
+    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
+        self.lib: Any = libllama.lib
+        self.ffi = libllama.ffi
+        if isinstance(mparams, dict):
+            mparams = libllama.model_default_params(**mparams)
+        self.model = self.lib.llama_model_load_from_file(path_model.encode(), mparams)
+        if not self.model:
+            raise RuntimeError("error: failed to load model '%s'" % path_model)
+        if isinstance(cparams, dict):
+            cparams = libllama.context_default_params(**cparams)
+        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
+        if not self.ctx:
+            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
+        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
+        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
+        self.text_buff = self.ffi.new("uint8_t[]", 1024)
+
+    def free(self):
+        if self.ctx:
+            self.lib.llama_free(self.ctx)
+        if self.model:
+            self.lib.llama_model_free(self.model)
+        self.ctx = None
+        self.model = None
+        self.lib = None
+
+    def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
+        encoded_text: bytes = text.encode("utf-8")
+        num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
+        while num < 0 and len(self.token_ids) < (16 << 20):
+            self.token_ids = self.ffi.new("llama_token[]", -2 * num)
+            num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
+        return list(self.token_ids[0:num])
+
+    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
+        if len(self.token_ids) < len(ids):
+            self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
+        for i, id in enumerate(ids):
+            self.token_ids[i] = id
+        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
+        while num < 0 and len(self.text_buff) < (16 << 20):
+            self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
+            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
+        return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD' # pyright: ignore[reportArgumentType]
+
+
+class Tokenizer:
+
+    def encode(self, text: str) -> list[int]:
+        raise NotImplementedError
+
+    def decode(self, ids: list[int]) -> str:
+        raise NotImplementedError
+
+
+class TokenizerGroundtruth (Tokenizer):
+
+    def __init__(self, dir_tokenizer: str):
+        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)  # ty: ignore[invalid-assignment]
+        # guess BOS and EOS
+        ids = self.encode("a")
+        assert 1 <= len(ids) <= 3
+        add_bos_token = len(ids) > 1 and self.model.bos_token_id == ids[0]
+        add_eos_token = len(ids) > 1 and self.model.eos_token_id == ids[-1]
+        self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token)
+        self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token)
+        # build vocab
+        tokens = list(self.model.get_vocab().values())
+        self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True)
+        self.vocab = list(sorted(self.vocab))
+        # tokens and lists
+        self.special_tokens = list(self.model.all_special_tokens)
+        self.added_tokens   = self.model.batch_decode(list(self.model.added_tokens_encoder.values()), skip_special_tokens=False)
+        self.bos_token = self.model.bos_token
+        self.eos_token = self.model.eos_token
+
+    def encode(self, text: str) -> list[int]:
+        return self.model.encode(text, add_special_tokens=True)
+
+    def decode(self, ids: list[int]) -> str:
+        return self.model.decode(ids, skip_special_tokens=False)  # ty: ignore[invalid-return-type]
+
+
+class TokenizerLlamaCpp (Tokenizer):
+
+    libllama: LibLlama | None = None
+
+    def __init__(self, vocab_file: str):
+        if not self.libllama:
+            self.libllama = LibLlama()
+        self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
+
+    def encode(self, text: str) -> list[int]:
+        return self.model.tokenize(text, add_special=True, parse_special=True)
+
+    def decode(self, ids: list[int]) -> str:
+        return self.model.detokenize(ids, remove_special=False, unparse_special=True)
+
+
+def generator_custom_text() -> Iterator[str]:
+    """General tests"""
+    yield from [
+        "",
+        " ",
+        "  ",
+        "   ",
+        "\t",
+        "\n",
+        "\n\n",
+        "\n\n\n",
+        "\t\n",
+        "Hello world",
+        " Hello world",
+        "Hello World",
+        " Hello World",
+        " Hello World!",
+        "Hello, world!",
+        " Hello, world!",
+        " this is 🦙.cpp",
+        "w048 7tuijk dsdfhu",
+        "нещо на Български",
+        "កាន់តែពិសេសអាចខលចេញ",
+        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+        "Hello",
+        " Hello",
+        "  Hello",
+        "   Hello",
+        "    Hello",
+        "    Hello\n    Hello",
+        " (",
+        "\n =",
+        "' era",
+        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
+        "3",
+        "33",
+        "333",
+        "3333",
+        "33333",
+        "333333",
+        "3333333",
+        "33333333",
+        "333333333",
+    ]
+
+
+def generator_custom_text_edge_cases() -> Iterator[str]:
+    """Edge cases found while debugging"""
+    yield from [
+        '\x1f-a',     # unicode_ranges_control, {0x00001C, 0x00001F}
+        '¼-a',        # unicode_ranges_digit, 0x00BC
+        '½-a',        # unicode_ranges_digit, 0x00BD
+        '¾-a',        # unicode_ranges_digit, 0x00BE
+        'a 〇b',      # unicode_ranges_digit, 0x3007
+        'Ⅵ-a',       # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
+        '\uFEFF//',   # unicode_ranges_control, 0xFEFF (BOM)
+        'Cửa Việt',   # llama-3, ignore_merges = true
+        '<s>a',       # Phi-3 fail
+        '<unk><|endoftext|><s>',  # Phi-3 fail
+        'a\na',            # bert fail
+        '"`',              # falcon
+        ' \u2e4e',         # falcon
+        '\n\x0b  ',        # falcon
+        'a\xa0\xa0\x00b',  # jina-v2-es
+        'one <mask>',      # jina-v2-es  <mask> lstrip=true
+        'a </s> b',        # rstrip phi-3
+        'a <mask> b',      # lstrip jina-v2
+        '\xa0aC',          # deepseek
+        '\u2029 \uA3E4',   # deepseek-llm
+        "a ?",
+        'å',               # mpt
+        '\U000ac517',      # utf-8 encode error, falcon
+        '\U000522f4',      # utf-8 encode error, starcoder
+        "<s><s><unk><s>a<s>b<s>c<unk>d<unk></s>",
+        "<s> <s> <unk><s>a<s>b<s>c<unk>d<unk></s>",
+    ]
+
+
+def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
+    """Brute force check all vocab words"""
+    yield from tokenizer.vocab
+
+
+def generator_ascii_lr_strip() -> Iterator[str]:
+    WHITESPACES = ["", " ", "  "]
+    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
+    for char1 in CHARACTERS:
+        for char2 in CHARACTERS:
+            for lstrip in WHITESPACES:
+                for rstrip in WHITESPACES:
+                    yield lstrip + char1 + char2 + rstrip
+                    yield lstrip + char1 + rstrip + char2
+                    yield char1 + lstrip + char2 + rstrip
+
+
+def generator_apostrophe() -> Iterator[str]:
+    WHITESPACES = ["", " ", "  "]
+    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
+    for char1 in CHARACTERS:
+        for char2 in CHARACTERS:
+            for lstrip in WHITESPACES:
+                for rstrip in WHITESPACES:
+                    yield char1 + lstrip + "'" + rstrip + char2
+                    yield char1 + char2 + lstrip + "'" + rstrip + "z"
+                    yield "a" + lstrip + "'" + rstrip + char1 + char2
+
+
+def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
+    WHITESPACES = ["", " ", "  ", "\n", "\r\n", "\n\n", "\t", "\t\t"]
+    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens)))
+    for token in all_tokens:
+        for lstrip in WHITESPACES:
+            for rstrip in WHITESPACES:
+                yield lstrip + token + rstrip
+                yield "a" + lstrip + token + rstrip
+                yield lstrip + token + rstrip + "z"
+                yield "a" + lstrip + token + rstrip + "z"
+
+
+def generator_random_added_tokens(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
+    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
+    all_tokens  = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations)))
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        words = rand.choices(all_tokens, k=500)
+        if words and words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
+            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
+                words.pop(0)
+            if tokenizer.add_bos_token:  # drop all starting BOS
+                words.pop(0)
+        if words and words[-1] == tokenizer.eos_token:  # skip spam warning of double EOS
+            while len(words) > 1 and words[-2] == tokenizer.eos_token:  # leave one trailing EOS
+                words.pop(-1)
+            if tokenizer.add_bos_token:  # drop all trailing EOS
+                words.pop(-1)
+        yield "".join(words)
+
+
+def generator_random_chars(iterations=100) -> Iterator[str]:
+    """Brute force random text with simple characters"""
+
+    NUM_WORDS = 400
+    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
+    CHARS = list(sorted(set("""
+        ABCDEFGHIJKLMNOPQRSTUVWXYZ
+        abcdefghijklmnopqrstuvwxyz
+        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
+        áéíóúàèìòùâêîôûäëïöü
+        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
+    """)))
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        for _ in range(NUM_WORDS):
+            k = rand.randint(1, 7)
+            word = rand.choices(CHARS, k=k)
+            word.append(rand.choice(WHITESPACES))
+            text.append("".join(word))
+        yield "".join(text)
+
+
+def generator_unicodes() -> Iterator[str]:
+    """Iterate unicode characters"""
+
+    MAX_CODEPOINTS = 0x30000  # 0x110000
+
+    def _valid(cpt):
+        if cpt >= 0x30000:  # unassigned and supplementary
+            return False
+        # if cpt == 0x2029:  # deepseek-llm
+        #    return False
+        if unicodedata.category(chr(cpt)) in ("Cn", "Cs", "Co"):  # undefined, surrogates, private
+            return False
+        return True
+
+    characters = [chr(cpt) for cpt in range(0, MAX_CODEPOINTS) if _valid(cpt)]
+
+    yield from characters
+
+
+def generator_random_unicodes(iterations=100) -> Iterator[str]:
+    """Brute force random text with unicode characters"""
+
+    NUM_WORDS = 200
+    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
+
+    characters = list(generator_unicodes())
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        for _ in range(NUM_WORDS):
+            k = rand.randint(1, 7)
+            word = rand.choices(characters, k=k)
+            word.append(rand.choice(WHITESPACES))
+            text.append("".join(word))
+        yield "".join(text)
+
+
+def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
+    """Brute force random text with vocab characters"""
+
+    vocab_chars = set()
+    for word in tokenizer.vocab:
+        vocab_chars.update(word)
+    vocab_chars = list(sorted(vocab_chars))
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = rand.choices(vocab_chars, k=1024)
+        yield "".join(text)
+
+
+def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
+    """Brute force random text from vocab words"""
+
+    vocab = [w.strip() for w in tokenizer.vocab]
+    yield from vocab
+
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        text = []
+        num_words = rand.randint(300, 400)
+        for i in range(num_words):
+            k = rand.randint(1, 3)
+            words = rand.choices(vocab, k=k)
+            sep = rand.choice("     \n\r\t")
+            text.append("".join(words) + sep)
+        yield "".join(text)
+
+
+def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):
+
+    def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
+        for i, (a, b) in enumerate(zip(ids1, ids2)):
+            if a != b:
+                return i
+        if len(ids1) == len(ids2):
+            return -1
+        return min(len(ids1), len(ids2))
+
+    def check_detokenizer(text: str, text1: str, text2: str) -> bool:
+        if text1 == text2:  # equal to TokenizerGroundtruth?
+            return True
+        # equal to source text?
+        if tokenizer1.add_bos_token and tokenizer1.bos_token and isinstance(tokenizer1.bos_token, str):  # remove BOS
+            if text2.startswith(tokenizer1.bos_token):
+                text2 = text2[len(tokenizer1.bos_token):]
+        if tokenizer1.add_eos_token and tokenizer1.eos_token and isinstance(tokenizer1.eos_token, str):  # remove EOS
+            if text2.endswith(tokenizer1.eos_token):
+                text2 = text2[:-len(tokenizer1.eos_token)]
+        return text == text2
+
+    t_encode1 = 0
+    t_encode2 = 0
+    t_decode1 = 0
+    t_decode2 = 0
+    t_start = time.perf_counter()
+    encode_errors = 0
+    decode_errors = 0
+    MAX_ERRORS = 10
+
+    logger.info("%s: %s" % (getattr(generator, "__qualname__", ""), "ini"))
+    for text in generator:
+        # print(repr(text), text.encode())
+        # print(repr(text), hex(ord(text[0])), text.encode())
+        t0 = time.perf_counter()
+        ids1 = tokenizer1.encode(text)
+        t1 = time.perf_counter()
+        ids2 = tokenizer2.encode(text)
+        t2 = time.perf_counter()
+        text1 = tokenizer1.decode(ids1)
+        t3 = time.perf_counter()
+        text2 = tokenizer2.decode(ids1)
+        t4 = time.perf_counter()
+        t_encode1 += t1 - t0
+        t_encode2 += t2 - t1
+        t_decode1 += t3 - t2
+        t_decode2 += t4 - t3
+        if encode_errors < MAX_ERRORS and ids1 != ids2:
+            i = find_first_mismatch(ids1, ids2)
+            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
+            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
+            logger.error(" Expected: " + str(ids1))
+            logger.error("   Result: " + str(ids2))
+            encode_errors += 1
+            logger.error(f" {encode_errors=}")
+        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
+            i = find_first_mismatch(text1, text2)
+            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
+            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
+            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
+            logger.error("   Result: " + " ".join(hex(ord(x)) for x in text2))
+            decode_errors += 1
+            logger.error(f" {decode_errors=}")
+        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
+            logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
+            # raise Exception()
+            break
+
+    t_total = time.perf_counter() - t_start
+    logger.info(f"{getattr(generator, '__qualname__', '')}: end,  {t_encode1=:.3f} {t_encode2=:.3f}  {t_decode1=:.3f} {t_decode2=:.3f}  {t_total=:.3f}")
+
+
+def main(argv: list[str] | None = None):
+    parser = argparse.ArgumentParser()
+    parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file")
+    parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file")
+    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+    args = parser.parse_args(argv)
+
+    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
+    logger.info(f"VOCABFILE: '{args.vocab_file}'")
+
+    tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
+    tokenizer2 = TokenizerLlamaCpp(args.vocab_file)
+
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
+    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
+
+    tokenizer2.model.free()
+
+
+if __name__ == "__main__":
+    # main()
+
+    if True:
+        logging.basicConfig(
+            level    = logging.DEBUG,
+            format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
+            datefmt  = "%Y-%m-%d %H:%M:%S",
+            filename = logger.name + ".log",
+            filemode = "a"
+        )
+    logging.basicConfig(
+        level    = logging.DEBUG,
+        format   = "%(levelname)s %(message)s",
+    )
+
+    path_tokenizers   = Path("./models/tokenizers/")
+    path_vocab_format = "./models/ggml-vocab-%s.gguf"
+
+    tokenizers = [
+        "llama-spm",      # SPM
+        "phi-3",          # SPM
+        "gemma",          # SPM
+        "gemma-2",        # SPM
+        "baichuan",       # SPM
+        "bert-bge",       # WPM
+        "jina-v2-en",     # WPM
+        "llama-bpe",      # BPE
+        "phi-2",          # BPE
+        "deepseek-llm",   # BPE
+        "deepseek-coder", # BPE
+        "falcon",         # BPE
+        "mpt",            # BPE
+        "starcoder",      # BPE
+        "gpt-2",          # BPE
+        "stablelm2",      # BPE
+        "refact",         # BPE
+        "qwen2",          # BPE
+        "olmo",           # BPE
+        "jina-v2-es",     # BPE
+        "jina-v2-de",     # BPE
+        "smaug-bpe",      # BPE
+        "poro-chat",      # BPE
+        "jina-v2-code",   # BPE
+        "viking",         # BPE
+        "jais",           # BPE
+    ]
+
+    logger.info("=" * 50)
+    for tokenizer in tokenizers:
+        logger.info("-" * 50)
+        logger.info(f"TOKENIZER: '{tokenizer}'")
+        vocab_file = Path(path_vocab_format % tokenizer)
+        dir_tokenizer = path_tokenizers / tokenizer
+        main([str(vocab_file), str(dir_tokenizer), "--verbose"])
--- a/tests/test-tokenizers-repo.sh
+++ b/tests/test-tokenizers-repo.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+if [ $# -lt 2 ]; then
+    printf "Usage: $0 <git-repo> <target-folder> [<test-exe>]\n"
+    exit 1
+fi
+
+if [ $# -eq 3 ]; then
+    toktest=$3
+else
+    toktest="./test-tokenizer-0"
+fi
+
+if [ ! -x $toktest ]; then
+    printf "Test executable \"$toktest\" not found!\n"
+    exit 1
+fi
+
+repo=$1
+folder=$2
+
+if [ -d $folder ] && [ -d $folder/.git ]; then
+    (cd $folder; git pull)
+else
+    git clone $repo $folder
+
+    # byteswap models if on big endian
+    if [ "$(uname -m)" = s390x ]; then
+        for f in $folder/*/*.gguf; do
+            echo YES | python3 "$(dirname $0)/../gguf-py/gguf/scripts/gguf_convert_endian.py" $f big
+        done
+    fi
+fi
+
+shopt -s globstar
+for gguf in $folder/**/*.gguf; do
+    if [ -f $gguf.inp ] && [ -f $gguf.out ]; then
+        $toktest $gguf
+    else
+        printf "Found \"$gguf\" without matching inp/out files, ignoring...\n"
+    fi
+done
+
--- a/tests/testing.h
+++ b/tests/testing.h
@@ -0,0 +1,243 @@
+#pragma once
+
+#include "common.h"
+
+#include <chrono>
+#include <exception>
+#include <iostream>
+#include <string>
+#include <regex>
+#include <vector>
+
+struct testing {
+    std::ostream &out;
+    std::vector<std::string> stack;
+    std::regex filter;
+    bool filter_tests = false;
+    bool throw_exception = false;
+    bool verbose = false;
+    int tests = 0;
+    int assertions = 0;
+    int failures = 0;
+    int unnamed = 0;
+    int exceptions = 0;
+
+    static constexpr std::size_t status_column = 80;
+
+    explicit testing(std::ostream &os = std::cout) : out(os) {}
+
+    std::string indent() const {
+        if (stack.empty()) {
+            return "";
+        }
+        return std::string((stack.size() - 1) * 2, ' ');
+    }
+
+    std::string full_name() const {
+        return string_join(stack, ".");
+    }
+
+    void log(const std::string & msg) {
+        if (verbose) {
+            out << indent() << "  " << msg << "\n";
+        }
+    }
+
+    void set_filter(const std::string & re) {
+        filter = std::regex(re);
+        filter_tests = true;
+    }
+
+    bool should_run() const {
+        if (filter_tests) {
+            if (!std::regex_match(full_name(), filter)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    template <typename F>
+    void run_with_exceptions(F &&f, const char *ctx) {
+        try {
+            f();
+        } catch (const std::exception &e) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): " << e.what() << "\n";
+            if (throw_exception) {
+                throw;
+            }
+        } catch (...) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): unknown\n";
+            if (throw_exception) {
+                throw;
+            }
+        }
+    }
+
+    void print_result(const std::string &label, int new_failures, int new_assertions, const std::string &extra = "") const {
+        std::string line = indent() + label;
+
+        std::string details;
+        if (new_assertions > 0) {
+            if (new_failures == 0) {
+                details = std::to_string(new_assertions) + " assertion(s)";
+            } else {
+                details = std::to_string(new_failures) + " of " +
+                          std::to_string(new_assertions) + " assertion(s) failed";
+            }
+        }
+        if (!extra.empty()) {
+            if (!details.empty()) {
+                details += ", ";
+            }
+            details += extra;
+        }
+
+        if (!details.empty()) {
+            line += " (" + details + ")";
+        }
+
+        std::string status = (new_failures == 0) ? "[PASS]" : "[FAIL]";
+
+        if (line.size() + 1 < status_column) {
+            line.append(status_column - line.size(), ' ');
+        } else {
+            line.push_back(' ');
+        }
+
+        out << line << status << "\n";
+    }
+
+    template <typename F>
+    void test(const std::string &name, F f) {
+        stack.push_back(name);
+        if (!should_run()) {
+            stack.pop_back();
+            return;
+        }
+
+        ++tests;
+        out << indent() << name << "\n";
+
+        int before_failures   = failures;
+        int before_assertions = assertions;
+
+        run_with_exceptions([&] { f(*this); }, "test");
+
+        int new_failures   = failures   - before_failures;
+        int new_assertions = assertions - before_assertions;
+
+        print_result(name, new_failures, new_assertions);
+
+        stack.pop_back();
+    }
+
+    template <typename F>
+    void test(F f) {
+        test("test #" + std::to_string(++unnamed), f);
+    }
+
+    template <typename F>
+    void bench(const std::string &name, F f, int iterations = 100) {
+        stack.push_back(name);
+        if (!should_run()) {
+            stack.pop_back();
+            return;
+        }
+
+        ++tests;
+        out << indent() << "[bench] " << name << "\n";
+
+        int before_failures   = failures;
+        int before_assertions = assertions;
+
+        using clock = std::chrono::high_resolution_clock;
+
+        std::chrono::microseconds duration(0);
+
+        run_with_exceptions([&] {
+            for (auto i = 0; i < iterations; i++) {
+                auto start = clock::now();
+                f();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start);
+            }
+        }, "bench");
+
+        auto avg_elapsed   = duration.count() / iterations;
+        auto avg_elapsed_s = std::chrono::duration_cast<std::chrono::duration<double>>(duration).count() / iterations;
+        auto rate = (avg_elapsed_s > 0.0) ? (1.0 / avg_elapsed_s) : 0.0;
+
+        int new_failures   = failures   - before_failures;
+        int new_assertions = assertions - before_assertions;
+
+        std::string extra =
+            "n=" + std::to_string(iterations) +
+            " avg=" + std::to_string(avg_elapsed) + "us" +
+            " rate=" + std::to_string(int(rate)) + "/s";
+
+        print_result("[bench] " + name, new_failures, new_assertions, extra);
+
+        stack.pop_back();
+    }
+
+    template <typename F>
+    void bench(F f, int iterations = 100) {
+        bench("bench #" + std::to_string(++unnamed), f, iterations);
+    }
+
+    // Assertions
+    bool assert_true(bool cond) {
+        return assert_true("", cond);
+    }
+
+    bool assert_true(const std::string &msg, bool cond) {
+        ++assertions;
+        if (!cond) {
+            ++failures;
+            out << indent() << "ASSERTION FAILED";
+            if (!msg.empty()) {
+                out << " : " << msg;
+            }
+            out << "\n";
+            return false;
+        }
+        return true;
+    }
+
+    template <typename A, typename B>
+    bool assert_equal(const A &expected, const B &actual) {
+        return assert_equal("", expected, actual);
+    }
+
+    template <typename A, typename B>
+    bool assert_equal(const std::string &msg, const A &expected, const B &actual) {
+        ++assertions;
+        if (!(actual == expected)) {
+            ++failures;
+            out << indent() << "ASSERT EQUAL FAILED";
+            if (!msg.empty()) {
+                out << " : " << msg;
+            }
+            out << "\n";
+
+            out << indent() << "  expected: " << expected << "\n";
+            out << indent() << "  actual  : " << actual << "\n";
+            return false;
+        }
+        return true;
+    }
+
+    // Print summary and return an exit code
+    int summary() const {
+        out << "\n";
+        out << "tests      : " << tests << "\n";
+        out << "assertions : " << assertions << "\n";
+        out << "failures   : " << failures << "\n";
+        out << "exceptions : " << exceptions << "\n";
+        return failures == 0 ? 0 : 1;
+    }
+};