llama.cpp verification source 2026-05-22
Some checks are pending
Copilot Setup Steps / copilot-setup-steps (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
Python Type-Check / python type-check (push) Waiting to run
Update Operations Documentation / update-ops-docs (push) Waiting to run
Some checks are pending
Copilot Setup Steps / copilot-setup-steps (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
Python Type-Check / python type-check (push) Waiting to run
Update Operations Documentation / update-ops-docs (push) Waiting to run
This commit is contained in:
90
docs/backend/snapdragon/CMakeUserPresets.json
Normal file
90
docs/backend/snapdragon/CMakeUserPresets.json
Normal file
@@ -0,0 +1,90 @@
|
||||
{
|
||||
"version": 5,
|
||||
"configurePresets": [
|
||||
{
|
||||
"name": "arm64-android-snapdragon",
|
||||
"hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"ANDROID_ABI": "arm64-v8a",
|
||||
"ANDROID_PLATFORM": "android-31",
|
||||
"CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
|
||||
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
|
||||
"HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
|
||||
"PREBUILT_LIB_DIR": "android_aarch64",
|
||||
"GGML_OPENMP": "OFF",
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
"GGML_OPENCL": "ON",
|
||||
"GGML_HEXAGON": "ON",
|
||||
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
|
||||
"LLAMA_OPENSSL": "OFF"
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-snapdragon",
|
||||
"inherits": [ "base", "arm64-windows-llvm" ],
|
||||
"cacheVariables": {
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
|
||||
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
|
||||
"HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
|
||||
"PREBUILT_LIB_DIR": "windows_aarch64",
|
||||
"GGML_OPENMP": "OFF",
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
"GGML_OPENCL": "ON",
|
||||
"GGML_HEXAGON": "ON",
|
||||
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
|
||||
"LLAMA_OPENSSL": "OFF"
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-linux-snapdragon",
|
||||
"hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake",
|
||||
"CMAKE_C_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
"CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
|
||||
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
|
||||
"HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
|
||||
"PREBUILT_LIB_DIR": "linux_aarch64",
|
||||
"GGML_OPENMP": "OFF",
|
||||
"GGML_LLAMAFILE": "OFF",
|
||||
"GGML_OPENCL": "OFF",
|
||||
"GGML_HEXAGON": "ON",
|
||||
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
|
||||
"LLAMA_OPENSSL": "OFF"
|
||||
}
|
||||
},
|
||||
|
||||
{ "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
|
||||
{ "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },
|
||||
|
||||
{ "name": "arm64-windows-snapdragon-debug" , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
|
||||
{ "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] },
|
||||
|
||||
{ "name": "arm64-linux-snapdragon-debug" , "inherits": [ "base", "arm64-linux-snapdragon", "debug" ] },
|
||||
{ "name": "arm64-linux-snapdragon-release", "inherits": [ "base", "arm64-linux-snapdragon", "release" ] }
|
||||
]
|
||||
}
|
||||
280
docs/backend/snapdragon/README.md
Normal file
280
docs/backend/snapdragon/README.md
Normal file
@@ -0,0 +1,280 @@
|
||||
# Snapdragon-based devices
|
||||
|
||||
## Setup
|
||||
|
||||
### Android
|
||||
|
||||
The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain).
|
||||
This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
|
||||
|
||||
This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
|
||||
[d]/> cd /workspace
|
||||
```
|
||||
|
||||
Note: The rest of the **Android** build process assumes that you're running inside the toolchain container.
|
||||
|
||||
### Windows On Snapdragon
|
||||
|
||||
Native Windows 11 arm64 builds has the following tools dependencies:
|
||||
- MS Visual Studio 2026 (Community Edition or Pro)
|
||||
- MSVC arm64 standard and runtime libraries
|
||||
- UCRT and Driver Kit
|
||||
- LLVM core libraries and Clang compiler (winget)
|
||||
- CMake, Git, Python (winget)
|
||||
- Hexagon SDK Community Edition 6.4 or later (see windows.md)
|
||||
- OpenCL SDK 2.3 or later (see windows.md)
|
||||
|
||||
Note: The rest of the **Windows** build process assumes that you're running natively in Powershell.
|
||||
Adapt below build commands accordingly.
|
||||
|
||||
## How to Build
|
||||
|
||||
Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
|
||||
|
||||
```
|
||||
[d]/workspace> cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
|
||||
[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
|
||||
Preset CMake variables:
|
||||
ANDROID_ABI="arm64-v8a"
|
||||
...
|
||||
CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake"
|
||||
GGML_HEXAGON="ON"
|
||||
GGML_OPENCL="ON"
|
||||
GGML_OPENMP="OFF"
|
||||
HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
|
||||
...
|
||||
-- Including OpenCL backend
|
||||
-- Including Hexagon backend
|
||||
...
|
||||
-- Build files have been written to: /workspace/build-snapdragon
|
||||
|
||||
[d]/workspace> cmake --build build-snapdragon
|
||||
...
|
||||
[144/356] Performing build step for 'htp-v73'
|
||||
[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h
|
||||
[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj
|
||||
[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj
|
||||
[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj
|
||||
...
|
||||
-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so
|
||||
-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so
|
||||
...
|
||||
```
|
||||
|
||||
To generate an installable "package" simply use cmake --install:
|
||||
|
||||
```
|
||||
[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
|
||||
-- Install configuration: "Release"
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-cpu.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-opencl.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-hexagon.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v73.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v75.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v79.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v81.so
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml.so
|
||||
...
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-bench
|
||||
-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-cli
|
||||
...
|
||||
```
|
||||
|
||||
## How to Install
|
||||
|
||||
### Android
|
||||
|
||||
For this step, your device needs to be configured for on-device development.
|
||||
Please see https://developer.android.com/studio/debug/dev-options for details.
|
||||
|
||||
Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device.
|
||||
**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ adb push pkg-snapdragon/llama.cpp /data/local/tmp/
|
||||
pkg-snapdragon/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
|
||||
pkg-snapdragon/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
|
||||
pkg-snapdragon/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
|
||||
102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
|
||||
```
|
||||
|
||||
At this point, you should also install some models:
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf
|
||||
...
|
||||
2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920]
|
||||
|
||||
~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf
|
||||
Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
|
||||
```
|
||||
|
||||
### Windows
|
||||
|
||||
All artifacts are already installed in the `pkg-snapdragon` folder.
|
||||
To run, adapt below instructions to use Powershell scripts in `scripts/snapdragon/windows`.
|
||||
|
||||
## How to Run
|
||||
|
||||
The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables.
|
||||
|
||||
llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (GPUOpenCL), and Hexagon NPU (HTP0-4).
|
||||
You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option.
|
||||
|
||||
Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options.
|
||||
|
||||
Here are some examples of running various llama.cpp tools via ADB.
|
||||
|
||||
Simple question for Llama-3.2-1B
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
|
||||
...
|
||||
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
|
||||
ggml-hex: Hexagon Arch version v79
|
||||
ggml-hex: allocating new session: HTP0
|
||||
ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50
|
||||
...
|
||||
load_tensors: offloading output layer to GPU
|
||||
load_tensors: offloaded 17/17 layers to GPU
|
||||
load_tensors: CPU model buffer size = 225.49 MiB
|
||||
load_tensors: HTP0 model buffer size = 0.26 MiB
|
||||
load_tensors: HTP0-REPACK model buffer size = 504.00 MiB
|
||||
...
|
||||
I hope this helps you understand the world's most popular cookies! [end of text]
|
||||
...
|
||||
llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second)
|
||||
llama_perf_context_print: load time = 617.94 ms
|
||||
llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second)
|
||||
llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second)
|
||||
llama_perf_context_print: total time = 9454.92 ms / 486 tokens
|
||||
llama_perf_context_print: graphs reused = 473
|
||||
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
||||
llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
|
||||
llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 |
|
||||
llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 |
|
||||
```
|
||||
|
||||
Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt
|
||||
...
|
||||
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
|
||||
ggml-hex: Hexagon Arch version v81
|
||||
ggml-hex: allocating new session: HTP0
|
||||
ggml-hex: allocating new session: HTP1
|
||||
...
|
||||
load_tensors: offloading output layer to GPU
|
||||
load_tensors: offloaded 17/17 layers to GPU
|
||||
load_tensors: CPU model buffer size = 143.86 MiB
|
||||
load_tensors: HTP1 model buffer size = 0.23 MiB
|
||||
load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB
|
||||
load_tensors: HTP0 model buffer size = 0.28 MiB
|
||||
load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB
|
||||
...
|
||||
llama_context: CPU output buffer size = 0.19 MiB
|
||||
llama_kv_cache: HTP1 KV buffer size = 238.00 MiB
|
||||
llama_kv_cache: HTP0 KV buffer size = 306.00 MiB
|
||||
llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB
|
||||
llama_context: HTP0 compute buffer size = 15.00 MiB
|
||||
llama_context: HTP1 compute buffer size = 15.00 MiB
|
||||
llama_context: CPU compute buffer size = 24.56 MiB
|
||||
...
|
||||
llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second)
|
||||
llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second)
|
||||
llama_perf_context_print: total time = 7377.33 ms / 469 tokens
|
||||
llama_perf_context_print: graphs reused = 255
|
||||
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
||||
llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
|
||||
llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
|
||||
llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 |
|
||||
llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 |
|
||||
llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 |
|
||||
```
|
||||
|
||||
Op test for MUL_MAT
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT
|
||||
...
|
||||
Backend 2/3: HTP0
|
||||
Device description: Hexagon
|
||||
Device memory: 2048 MB (2048 MB free)
|
||||
MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
|
||||
MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
|
||||
MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
|
||||
|
||||
~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64
|
||||
...
|
||||
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
|
||||
ggml-hex: Hexagon Arch version v79
|
||||
ggml-hex: allocating new session: HTP0
|
||||
ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090
|
||||
| model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s |
|
||||
| ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: |
|
||||
| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | pp128 | 169.42 ± 1.75 |
|
||||
| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | tg64 | 51.54 ± 1.13 |
|
||||
|
||||
build: 6a8cf8914 (6733)
|
||||
```
|
||||
|
||||
## Environment variables
|
||||
|
||||
- `GGML_HEXAGON_NDEV=1`
|
||||
Controls the number of devices/sessions to allocate. The default is 1.
|
||||
Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four.
|
||||
|
||||
- `GGML_HEXAGON_NHVX=0`
|
||||
Controls the number of HVX hardware threads to use. The default is all (actual number varies depending on the hardware version).
|
||||
|
||||
- `GGML_HEXAGON_HOSTBUF=1`
|
||||
Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
|
||||
This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID).
|
||||
|
||||
- `GGML_HEXAGON_VERBOSE=1`
|
||||
Enables verbose logging of Ops from the backend. Example output:
|
||||
|
||||
```
|
||||
ggml-hex: HTP0 graph-compute n_nodes 2
|
||||
ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1
|
||||
ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3
|
||||
ggml-hex: HTP0 graph-compute n_nodes 1
|
||||
ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0
|
||||
ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024
|
||||
```
|
||||
|
||||
- `GGML_HEXAGON_PROFILE=1`
|
||||
Enables Op profiling:
|
||||
|
||||
- `1` Basic profile with per-op `usecs` and `cycles` counters
|
||||
- `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
|
||||
- `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
|
||||
|
||||
The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report.
|
||||
Examples:
|
||||
|
||||
`GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
|
||||
|
||||
- `GGML_HEXAGON_OPSTAGE=0x0`
|
||||
Allows enabling specific stages of the Op processing pipeline:
|
||||
|
||||
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
|
||||
- `0x2` Enable Op Compute (MUL_MAT, etc.)
|
||||
|
||||
Examples:
|
||||
|
||||
`GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
|
||||
`GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
|
||||
|
||||
- `GGML_HEXAGON_OPFILTER=regex`
|
||||
Allows filtering (disabling) Ops that match the regex pattern:
|
||||
|
||||
Examples:
|
||||
|
||||
`GGML_HEXAGON_OPFILTER="FLASH_ATTN_EXT" llama-completion ...` - Disable Flash Attention on Hexagon (falls back to CPU or GPU)
|
||||
`GGML_HEXAGON_OPFILTER="ADD\|SUB" llama-completion ...` - Disable ADD and SUB on Hexagon (fall back to CPU or GPU)
|
||||
109
docs/backend/snapdragon/developer.md
Normal file
109
docs/backend/snapdragon/developer.md
Normal file
@@ -0,0 +1,109 @@
|
||||
# Hexagon backend developer details
|
||||
|
||||
## Backend libraries
|
||||
|
||||
The Hexagon backend consist of two parts:
|
||||
|
||||
- `libggml-hexagon`
|
||||
This is the regular CPU-side GGML backend library, either shared or statically linked
|
||||
|
||||
- `libggml-htp-vNN`
|
||||
This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels.
|
||||
The correct library is selected automatically at runtime based on the HW version.
|
||||
|
||||
Here is an example of the build artifacts
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml*
|
||||
pkg-adb/llama.cpp/lib/libggml-base.so
|
||||
pkg-adb/llama.cpp/lib/libggml-cpu.so
|
||||
pkg-adb/llama.cpp/lib/libggml-hexagon.so <<< CPU library
|
||||
pkg-adb/llama.cpp/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73
|
||||
pkg-adb/llama.cpp/lib/libggml-htp-v75.so
|
||||
pkg-adb/llama.cpp/lib/libggml-htp-v79.so
|
||||
pkg-adb/llama.cpp/lib/libggml-htp-v81.so
|
||||
```
|
||||
|
||||
## Memory buffers
|
||||
|
||||
Hexagon NPU backend takes advantage of the Snapdragon's unified memory model where all buffers are fully accessible by the CPU and GPU.
|
||||
The NPU does have a dedicated tightly-coupled memory called VTCM but that memory is used only for intermediate data (e.g. dynamically
|
||||
quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA).
|
||||
|
||||
Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those
|
||||
to the NPU at this point.
|
||||
|
||||
The backend does allocates non-host buffers for the tensors with datatypes that require repacking: Q4_0, Q8_0, MXFP4.
|
||||
From the MMU perspective these buffers are still regular buffers (normal access by the CPU) they are marked as non-host simply to force
|
||||
the repacking.
|
||||
|
||||
## Large model handling
|
||||
|
||||
Hexagon NPU session (aka Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB.
|
||||
In llama.cpp/GGML the Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc).
|
||||
|
||||
In order to map models larger than 3.5GB we need to allocate multiple devices and split the model.
|
||||
For this we're taking advantage of the llama.cpp/GGML multi-GPU layer-splitting support.
|
||||
Each Hexagon device behaves like a GPU from the offload and model splitting perspective.
|
||||
|
||||
Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
|
||||
|
||||
```
|
||||
M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32
|
||||
...
|
||||
LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
|
||||
ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
|
||||
GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
|
||||
-t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
|
||||
...
|
||||
llama_model_loader: - type f32: 289 tensors
|
||||
llama_model_loader: - type q4_0: 96 tensors
|
||||
llama_model_loader: - type q8_0: 2 tensors
|
||||
llama_model_loader: - type mxfp4: 72 tensors
|
||||
...
|
||||
load_tensors: offloaded 25/25 layers to GPU
|
||||
load_tensors: CPU model buffer size = 1182.09 MiB
|
||||
load_tensors: HTP1 model buffer size = 6.64 MiB
|
||||
load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB
|
||||
load_tensors: HTP3 model buffer size = 5.55 MiB
|
||||
load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB
|
||||
load_tensors: HTP0 model buffer size = 7.75 MiB
|
||||
load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB
|
||||
load_tensors: HTP2 model buffer size = 6.64 MiB
|
||||
load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB
|
||||
...
|
||||
llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
|
||||
llama_context: CPU output buffer size = 0.77 MiB
|
||||
llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells
|
||||
llama_kv_cache: HTP1 KV buffer size = 25.50 MiB
|
||||
llama_kv_cache: HTP3 KV buffer size = 25.50 MiB
|
||||
llama_kv_cache: HTP0 KV buffer size = 25.50 MiB
|
||||
llama_kv_cache: HTP2 KV buffer size = 25.50 MiB
|
||||
llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB
|
||||
llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells
|
||||
llama_kv_cache: HTP1 KV buffer size = 0.80 MiB
|
||||
llama_kv_cache: HTP3 KV buffer size = 0.53 MiB
|
||||
llama_kv_cache: HTP0 KV buffer size = 1.06 MiB
|
||||
llama_kv_cache: HTP2 KV buffer size = 0.80 MiB
|
||||
llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB
|
||||
llama_context: HTP0 compute buffer size = 16.06 MiB
|
||||
llama_context: HTP1 compute buffer size = 16.06 MiB
|
||||
llama_context: HTP2 compute buffer size = 16.06 MiB
|
||||
llama_context: HTP3 compute buffer size = 16.06 MiB
|
||||
llama_context: CPU compute buffer size = 98.19 MiB
|
||||
...
|
||||
llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second)
|
||||
llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second)
|
||||
llama_perf_context_print: total time = 6266.30 ms / 228 tokens
|
||||
llama_perf_context_print: graphs reused = 30
|
||||
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
|
||||
llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
|
||||
llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
|
||||
llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
|
||||
llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
|
||||
llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 |
|
||||
llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 |
|
||||
llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 |
|
||||
llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 |
|
||||
llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 |
|
||||
```
|
||||
58
docs/backend/snapdragon/linux.md
Normal file
58
docs/backend/snapdragon/linux.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# Snapdragon-based Linux devices
|
||||
|
||||
## Docker Setup
|
||||
|
||||
The easiest way to build llama.cpp for a Snapdragon-based Linux device is using the toolchain Docker image (see [github.com/snapdragon-toolchain](https://github.com/snapdragon-toolchain)).
|
||||
This image includes OpenCL SDK, Hexagon SDK, CMake, and the ARM64 Linux cross-compilation toolchain.
|
||||
|
||||
Cross-compilation is supported on **Linux X86** hosts. The resulting binaries are deployed to and run on the target **Qualcomm Snapdragon ARM64 Linux** device.
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-linux:v0.1
|
||||
[d]/> cd /workspace
|
||||
```
|
||||
|
||||
Note: The rest of the **Linux** build process assumes that you're running inside the toolchain container.
|
||||
|
||||
|
||||
## How to Build
|
||||
|
||||
Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
|
||||
|
||||
```
|
||||
[d]/workspace> cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
|
||||
[d]/workspace> cmake --preset arm64-linux-snapdragon-release -B build-snapdragon
|
||||
|
||||
[d]/workspace> cmake --build build-snapdragon -j $(nproc)
|
||||
```
|
||||
|
||||
To generate an installable "package" simply use cmake --install, then zip it:
|
||||
|
||||
```
|
||||
[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon
|
||||
[d]/workspace> zip -r pkg-snapdragon.zip pkg-snapdragon
|
||||
```
|
||||
|
||||
## How to Install
|
||||
|
||||
For this step, you will deploy the built binaries and libraries to the target Linux device. Transfer `pkg-snapdragon.zip` to the target device, then unzip it and set up the environment variables:
|
||||
|
||||
```
|
||||
$ unzip pkg-snapdragon.zip
|
||||
$ cd pkg-snapdragon
|
||||
$ export LD_LIBRARY_PATH=./lib
|
||||
$ export ADSP_LIBRARY_PATH=./lib
|
||||
```
|
||||
|
||||
At this point, you should also download some models onto the device:
|
||||
|
||||
```
|
||||
$ wget https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_0.gguf
|
||||
```
|
||||
|
||||
## How to Run
|
||||
Next, since we have setup the environment variables, we can run the llama-cli with the Hexagon backends:
|
||||
```
|
||||
$ ./bin/llama-cli -m Llama-3.2-3B-Instruct-Q4_0.gguf --device HTP0 -ngl 99 -p "what is the most popular cookie in the world?"
|
||||
```
|
||||
161
docs/backend/snapdragon/windows.md
Normal file
161
docs/backend/snapdragon/windows.md
Normal file
@@ -0,0 +1,161 @@
|
||||
## Overview
|
||||
|
||||
The document covers procedures for installing the latest GPU and NPU drivers, and OpenCL and Hexagon SDKs.
|
||||
|
||||
|
||||
In order to use Hexagon NPU on Snapdragon Windows devices the underlying HTP Ops libraries (e.g libggml-htp-v73.so)
|
||||
must be included in the .cat file digitally signed with a trusted certificate.
|
||||
|
||||
This document covers details on how to generate personal certificate files (.pfx) and how to configure the system
|
||||
to allow for test signatures (aka test-signing).
|
||||
|
||||
## Install the latest Adreno OpenCL SDK
|
||||
|
||||
Either use the trimmed down version (optimized for CI) from
|
||||
|
||||
https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz
|
||||
|
||||
Or download the complete official version from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Adreno_OpenCL_SDK?version=2.3.2
|
||||
|
||||
Unzip/untar the archive into
|
||||
```
|
||||
c:\Qualcomm\OpenCL_SDK\2.3.2
|
||||
```
|
||||
|
||||
## Install the latest Hexagon SDK Community Edition
|
||||
|
||||
Either use the trimmed down version (optimized for CI) from
|
||||
|
||||
https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
|
||||
|
||||
Or download the complete official version from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
|
||||
|
||||
Unzip/untar the archive into
|
||||
```
|
||||
c:\Qualcomm\Hexagon_SDK\6.4.0.2
|
||||
```
|
||||
|
||||
## Install the latest Adreno GPU driver
|
||||
|
||||
Download the driver from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Windows_Graphics_Driver
|
||||
|
||||
After the automated installation and reboot please make sure that the GPU device shows up in the `Device Manager` (under 'Display Adapters`)
|
||||
|
||||
## Install the latest Qualcomm NPU driver
|
||||
|
||||
Download the driver from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Qualcomm_HND
|
||||
|
||||
After the automated installation and reboot please make sure that the Hexagon NPU device shows up in the `Device Manager` (under `Neural Processors`).
|
||||
|
||||
If the device is not available you can try installing all components (`qcnspmcdm8380`, `qcnspmcdm8380_ext`) manually.
|
||||
The components are extracted into
|
||||
```
|
||||
c:\QCDrivers\qcnspmcdm...
|
||||
```
|
||||
|
||||
## Enable NPU driver test signatures
|
||||
|
||||
Please note that the following steps are required only for the Hexagon NPU.
|
||||
Adreno GPU backend does not require test signatures.
|
||||
|
||||
### Enable testsigning
|
||||
|
||||
Use `bcdedit` to enable test-signing
|
||||
```
|
||||
> bcdedit /set TESTSIGNING ON
|
||||
```
|
||||
(Secure Boot may need to be disabled for this to work)
|
||||
|
||||
Make sure test-signing is enabled after reboot
|
||||
```
|
||||
> bcdedit /enum
|
||||
...
|
||||
testsigning Yes
|
||||
...
|
||||
```
|
||||
For additional details see Microsoft guide at
|
||||
|
||||
https://learn.microsoft.com/en-us/windows-hardware/drivers/install/the-testsigning-boot-configuration-option
|
||||
|
||||
### Create personal certificate
|
||||
|
||||
The tools required for this procedure are available as part of Windows SDK and Windows Driver Kit which should be
|
||||
installed as part of the MS Visual Studio.
|
||||
They are typically located at
|
||||
```
|
||||
c:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0
|
||||
```
|
||||
(replace 10.0.26100.0 with correct version).
|
||||
|
||||
To create personal self-signed certificate run the following commands (either from cmd or power-shell):
|
||||
```
|
||||
> cd c:\Users\MyUser
|
||||
> mkdir Certs
|
||||
> cd Certs
|
||||
> makecert -r -pe -ss PrivateCertStore -n CN=GGML.HTP.v1 -eku 1.3.6.1.5.5.7.3.3 -sv ggml-htp-v1.pvk ggml-htp-v1.cer
|
||||
> pvk2pfx.exe -pvk ggml-htp-v1.pvk -spc ggml-htp-v1.cer -pfx ggml-htp-v1.pfx
|
||||
```
|
||||
(replace `MyUser` with your username).
|
||||
|
||||
Add this certificate to `Trusted Root Certification Authorities` and `Trusted Publishers` stores.
|
||||
This can be done using `certlm` Certificate Manager tool.
|
||||
Right click on the certificate store, select `All Tasks -> Import` and follow the prompts to import the certificate from the
|
||||
PFX file you created above.
|
||||
|
||||
For additional details see Microsoft guide at
|
||||
|
||||
https://learn.microsoft.com/en-us/windows-hardware/drivers/install/introduction-to-test-signing
|
||||
|
||||
Make sure to save the PFX file, you will need it for the build procedures.
|
||||
Please note that the same certificate can be used for signing any number of builds.
|
||||
|
||||
## Build Hexagon backend with signed HTP ops libraries
|
||||
|
||||
The overall Hexagon backend build procedure for Windows on Snapdragon is the same as for other platforms.
|
||||
However, additional settings are required for generating and signing HTP Ops libraries.
|
||||
```
|
||||
> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
|
||||
> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
|
||||
> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
|
||||
> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
|
||||
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
|
||||
|
||||
> cmake --preset arm64-windows-snapdragon-release -B build-wos
|
||||
...
|
||||
> cmake --install build-wos --prefix pkg-snapdragon
|
||||
```
|
||||
|
||||
Once the build is complete HTP ops libraries will be installed like this
|
||||
```
|
||||
> dir pkg-snapdragon/lib
|
||||
...
|
||||
-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v73.so
|
||||
-a---- 1/22/2026 6:01 PM 191752 libggml-htp-v75.so
|
||||
-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v79.so
|
||||
-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v81.so
|
||||
-a---- 1/22/2026 6:01 PM 4139 libggml-htp.cat
|
||||
```
|
||||
|
||||
The .cat file, the signature and proper certificate installation can be verified with
|
||||
|
||||
```
|
||||
> signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat
|
||||
Verifying: .\pkg-snapdragon\lib\libggml-htp.cat
|
||||
|
||||
Signature Index: 0 (Primary Signature)
|
||||
Hash of file (sha256): 9820C664DA59D5EAE31DBB664127FCDAEF59CDC31502496BC567544EC2F401CF
|
||||
|
||||
Signing Certificate Chain:
|
||||
Issued to: GGML.HTP.v1
|
||||
...
|
||||
Successfully verified: .\pkg-snapdragon\lib\libggml-htp.cat
|
||||
...
|
||||
```
|
||||
Reference in New Issue
Block a user