llama.cpp verification source 2026-05-22
Some checks are pending
Copilot Setup Steps / copilot-setup-steps (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
Python Type-Check / python type-check (push) Waiting to run
Update Operations Documentation / update-ops-docs (push) Waiting to run
Some checks are pending
Copilot Setup Steps / copilot-setup-steps (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
Python Type-Check / python type-check (push) Waiting to run
Update Operations Documentation / update-ops-docs (push) Waiting to run
This commit is contained in:
124
common/unicode.cpp
Normal file
124
common/unicode.cpp
Normal file
@@ -0,0 +1,124 @@
|
||||
#include "unicode.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// implementation adopted from src/unicode.cpp
|
||||
|
||||
size_t common_utf8_sequence_length(unsigned char first_byte) {
|
||||
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4;
|
||||
return lookup[highbits];
|
||||
}
|
||||
|
||||
utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset) {
|
||||
if (offset >= input.size()) {
|
||||
return utf8_parse_result(utf8_parse_result::INCOMPLETE);
|
||||
}
|
||||
|
||||
// ASCII fast path
|
||||
if (!(input[offset] & 0x80)) {
|
||||
return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1);
|
||||
}
|
||||
|
||||
// Invalid: continuation byte as first byte
|
||||
if (!(input[offset] & 0x40)) {
|
||||
return utf8_parse_result(utf8_parse_result::INVALID);
|
||||
}
|
||||
|
||||
// 2-byte sequence
|
||||
if (!(input[offset] & 0x20)) {
|
||||
if (offset + 1 >= input.size()) {
|
||||
return utf8_parse_result(utf8_parse_result::INCOMPLETE);
|
||||
}
|
||||
if ((input[offset + 1] & 0xc0) != 0x80) {
|
||||
return utf8_parse_result(utf8_parse_result::INVALID);
|
||||
}
|
||||
auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f);
|
||||
return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2);
|
||||
}
|
||||
|
||||
// 3-byte sequence
|
||||
if (!(input[offset] & 0x10)) {
|
||||
if (offset + 2 >= input.size()) {
|
||||
return utf8_parse_result(utf8_parse_result::INCOMPLETE);
|
||||
}
|
||||
if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) {
|
||||
return utf8_parse_result(utf8_parse_result::INVALID);
|
||||
}
|
||||
auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f);
|
||||
return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3);
|
||||
}
|
||||
|
||||
// 4-byte sequence
|
||||
if (!(input[offset] & 0x08)) {
|
||||
if (offset + 3 >= input.size()) {
|
||||
return utf8_parse_result(utf8_parse_result::INCOMPLETE);
|
||||
}
|
||||
if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) {
|
||||
return utf8_parse_result(utf8_parse_result::INVALID);
|
||||
}
|
||||
auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f);
|
||||
return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4);
|
||||
}
|
||||
|
||||
// Invalid first byte
|
||||
return utf8_parse_result(utf8_parse_result::INVALID);
|
||||
}
|
||||
|
||||
bool common_utf8_is_complete(const std::string & s) {
|
||||
if (s.empty()) {
|
||||
return true;
|
||||
}
|
||||
for (int i = 1; i <= std::min(4, (int)s.size()); i++) {
|
||||
unsigned char c = s[s.size() - i];
|
||||
if ((c & 0xC0) != 0x80) {
|
||||
int expected = (c >= 0xF0) ? 4 : (c >= 0xE0) ? 3 : (c >= 0xC0) ? 2 : 1;
|
||||
return i >= expected;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < cps.size(); ++i) {
|
||||
result.append(common_unicode_cpt_to_utf8(cps[i]));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string common_unicode_cpt_to_utf8(uint32_t cpt) {
|
||||
std::string result;
|
||||
|
||||
if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
|
||||
result.push_back(cpt);
|
||||
return result;
|
||||
}
|
||||
if (0x80 <= cpt && cpt <= 0x7ff) {
|
||||
result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
|
||||
result.push_back(0x80 | (cpt & 0x3f));
|
||||
return result;
|
||||
}
|
||||
if (0x800 <= cpt && cpt <= 0xffff) {
|
||||
result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
|
||||
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cpt & 0x3f));
|
||||
return result;
|
||||
}
|
||||
if (0x10000 <= cpt && cpt <= 0x10ffff) {
|
||||
result.push_back(0xf0 | ((cpt >> 18) & 0x07));
|
||||
result.push_back(0x80 | ((cpt >> 12) & 0x3f));
|
||||
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
|
||||
result.push_back(0x80 | (cpt & 0x3f));
|
||||
return result;
|
||||
}
|
||||
|
||||
throw std::invalid_argument("invalid codepoint");
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user