llama.cpp verification source 2026-05-22
Some checks are pending
Copilot Setup Steps / copilot-setup-steps (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
Python Type-Check / python type-check (push) Waiting to run
Update Operations Documentation / update-ops-docs (push) Waiting to run

This commit is contained in:
2026-05-22 16:44:08 +08:00
commit 8e5a449007
2740 changed files with 1155720 additions and 0 deletions

View File

@@ -0,0 +1,37 @@
#include "simple-tokenize.h"
// Splits `input` into consecutive chunks. A new chunk begins at every
// delimiter character (space, newline, tab and the punctuation listed below);
// the delimiter is kept as the first character of the chunk it opens, so
// concatenating the returned chunks reproduces `input`.
std::vector<std::string> simple_tokenize(const std::string & input) {
    // Characters that force a chunk boundary in front of them.
    static const std::string delimiters = " \n\t{},[\"].<>=/";

    std::vector<std::string> tokens;
    std::string pending;
    for (const char ch : input) {
        const bool opens_new_chunk = delimiters.find(ch) != std::string::npos;
        // Flush what has been collected so far before the delimiter is added.
        if (opens_new_chunk && !pending.empty()) {
            tokens.push_back(pending);
            pending.clear();
        }
        pending += ch;
    }
    // Emit the trailing chunk, if any.
    if (!pending.empty()) {
        tokens.push_back(pending);
    }
    return tokens;
}

View File

@@ -0,0 +1,6 @@
#pragma once
#include <string>
#include <vector>
// Splits the given string into consecutive chunks. A new chunk starts at every
// space, newline, tab or one of the characters { } , [ " ] . < > = / and that
// delimiter stays at the front of the chunk it starts.
std::vector<std::string> simple_tokenize(const std::string &);

View File

@@ -0,0 +1,471 @@
#include "peg-parser.h"
#include "tests.h"
// Covers the basic PEG building blocks: character classes, optionals, lenient
// (partial-input) parsing of literals, sequences, choices and repetitions, and
// recursive rule references.
void test_basic(testing & t) {
    t.test("chars", [](testing & t) {
        // Class made of the escaped newline, tab and backslash characters.
        const auto escape_class = [](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); };
        // Class with an escaped dash between 'a' and 'z'.
        const auto dash_class = [](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); };

        // Escape class: newline is accepted.
        t.test("escape_sequence_newline", [&](testing & t) {
            auto parser = build_peg_parser(escape_class);
            common_peg_parse_context ctx("\n");
            auto result = parser.parse(ctx);
            t.assert_equal("escape_sequence_newline", true, result.success());
        });

        // Escape class: tab is accepted.
        t.test("escape_sequence_tab", [&](testing & t) {
            auto parser = build_peg_parser(escape_class);
            common_peg_parse_context ctx("\t");
            auto result = parser.parse(ctx);
            t.assert_equal("escape_sequence_tab", true, result.success());
        });

        // Escape class: backslash is accepted.
        t.test("escape_sequence_backslash", [&](testing & t) {
            auto parser = build_peg_parser(escape_class);
            common_peg_parse_context ctx("\\");
            auto result = parser.parse(ctx);
            t.assert_equal("escape_sequence_backslash", true, result.success());
        });

        // Escape class: a space is not in the class, so parsing should fail.
        t.test("escape_sequence_space_fail", [&](testing & t) {
            auto parser = build_peg_parser(escape_class);
            common_peg_parse_context ctx(" ");
            auto result = parser.parse(ctx);
            t.assert_equal("escape_sequence_space_fail", true, result.fail());
        });

        // Escaped dash: 'a' is accepted.
        t.test("escaped_dash_a", [&](testing & t) {
            auto parser = build_peg_parser(dash_class);
            common_peg_parse_context ctx("a");
            auto result = parser.parse(ctx);
            t.assert_equal("escaped_dash_a", true, result.success());
        });

        // Escaped dash: the literal '-' is accepted.
        t.test("escaped_dash_literal", [&](testing & t) {
            auto parser = build_peg_parser(dash_class);
            common_peg_parse_context ctx("-");
            auto result = parser.parse(ctx);
            t.assert_equal("escaped_dash_literal", true, result.success());
        });

        // Escaped dash: 'z' is accepted.
        t.test("escaped_dash_z", [&](testing & t) {
            auto parser = build_peg_parser(dash_class);
            common_peg_parse_context ctx("z");
            auto result = parser.parse(ctx);
            t.assert_equal("escaped_dash_z", true, result.success());
        });

        // Escaped dash: 'b' must be rejected because \- is a literal dash, not a range.
        t.test("escaped_dash_b_fail", [&](testing & t) {
            auto parser = build_peg_parser(dash_class);
            common_peg_parse_context ctx("b");
            auto result = parser.parse(ctx);
            t.assert_equal("escaped_dash_b_fail", true, result.fail());
        });
    });

    t.test("optional", [](testing & t) {
        // "hello" followed by an optional " world".
        const auto hello_opt_world = [](common_peg_parser_builder & p) {
            return p.literal("hello") + p.optional(p.literal(" world"));
        };

        // Optional part present: the whole input is consumed.
        t.test("optional_present", [&](testing & t) {
            auto parser = build_peg_parser(hello_opt_world);
            common_peg_parse_context ctx("hello world");
            auto result = parser.parse(ctx);
            t.assert_equal("optional_present", true, result.success());
            t.assert_equal("optional_present_end", 11u, result.end);
        });

        // Optional part absent: the match ends after "hello".
        t.test("optional_absent", [&](testing & t) {
            auto parser = build_peg_parser(hello_opt_world);
            common_peg_parse_context ctx("hello");
            auto result = parser.parse(ctx);
            t.assert_equal("optional_absent", true, result.success());
            t.assert_equal("optional_absent_end", 5u, result.end);
        });

        // Lenient mode with a dangling space: more input is needed to decide
        // whether the optional part matches.
        t.test("partial_match_need_more", [&](testing & t) {
            auto parser = build_peg_parser(hello_opt_world);
            common_peg_parse_context ctx("hello ", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("partial_match_need_more", true, result.need_more_input());
        });
    });

    t.test("partial parsing", [](testing & t) {
        // Literal: exact match succeeds.
        t.test("literal_success", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("hello");
            });
            common_peg_parse_context ctx("hello");
            auto result = parser.parse(ctx);
            t.assert_equal("literal_success", true, result.success());
        });

        // Char class a-z: lowercase letter succeeds.
        t.test("char_class_lowercase_success", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.chars("a-z");
            });
            common_peg_parse_context ctx("a");
            auto result = parser.parse(ctx);
            t.assert_equal("char_class_lowercase_success", true, result.success());
        });

        // Char class a-z: uppercase letter fails.
        t.test("char_class_uppercase_fail", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.chars("a-z");
            });
            common_peg_parse_context ctx("A");
            auto result = parser.parse(ctx);
            t.assert_equal("char_class_uppercase_fail", true, result.fail());
        });

        // Char class a-z plus trailing dash: lowercase letter succeeds.
        t.test("char_class_with_dash_lowercase", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.chars("a-z-");
            });
            common_peg_parse_context ctx("f");
            auto result = parser.parse(ctx);
            t.assert_equal("char_class_with_dash_lowercase", true, result.success());
        });

        // Char class a-z plus trailing dash: the dash itself succeeds.
        t.test("char_class_with_dash_literal_dash", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.chars("a-z-");
            });
            common_peg_parse_context ctx("-");
            auto result = parser.parse(ctx);
            t.assert_equal("char_class_with_dash_literal_dash", true, result.success());
        });

        // Char class a-z plus trailing dash: uppercase letter fails.
        t.test("char_class_with_dash_uppercase_fail", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.chars("a-z-");
            });
            common_peg_parse_context ctx("A");
            auto result = parser.parse(ctx);
            t.assert_equal("char_class_with_dash_uppercase_fail", true, result.fail());
        });

        // Sequence: input stops inside the first literal.
        t.test("sequence_partial_match_1", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("<think>") + p.literal("</think>");
            });
            common_peg_parse_context ctx("<thi", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("sequence_partial_match_1", true, result.need_more_input());
        });

        // Sequence: input stops right after the first literal.
        t.test("sequence_partial_match_2", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("begin") + p.literal("end");
            });
            common_peg_parse_context ctx("begin", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("sequence_partial_match_2", true, result.need_more_input());
        });

        // Sequence: input stops inside the second literal.
        t.test("sequence_partial_match_3", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("<think>") + p.literal("</think>");
            });
            common_peg_parse_context ctx("<think></", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("sequence_partial_match_3", true, result.need_more_input());
        });

        // Sequence: both literals present.
        t.test("sequence_full_match", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("hello") + p.literal("world");
            });
            common_peg_parse_context ctx("helloworld");
            auto result = parser.parse(ctx);
            t.assert_equal("sequence_full_match", true, result.success());
        });

        // Sequence: text after the first literal cannot start the second one.
        t.test("sequence_no_match", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("<think>") + p.literal("</think>");
            });
            common_peg_parse_context ctx("<think>I am common_chat_combinator_parser", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("sequence_no_match", true, result.fail());
        });

        // Choice: input is a shared prefix of both alternatives.
        t.test("choices_partial_match_1", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("option1") | p.literal("option2");
            });
            common_peg_parse_context ctx("opt", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("choices_partial_match_1", true, result.need_more_input());
        });

        // Choice: a longer shared prefix of both alternatives.
        t.test("choices_partial_match_2", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("choice_a") | p.literal("choice_b");
            });
            common_peg_parse_context ctx("choice", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("choices_partial_match_2", true, result.need_more_input());
        });

        // Choice: first alternative matches.
        t.test("choices_full_match_1", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("first") | p.literal("second");
            });
            common_peg_parse_context ctx("first");
            auto result = parser.parse(ctx);
            t.assert_equal("choices_full_match_1", true, result.success());
        });

        // Choice: second alternative matches.
        t.test("choices_full_match_2", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("alpha") | p.literal("beta");
            });
            common_peg_parse_context ctx("beta");
            auto result = parser.parse(ctx);
            t.assert_equal("choices_full_match_2", true, result.success());
        });

        // Choice: neither alternative matches.
        t.test("choices_no_match", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.literal("good") | p.literal("better");
            });
            common_peg_parse_context ctx("best");
            auto result = parser.parse(ctx);
            t.assert_equal("choices_no_match", true, result.fail());
        });

        // Zero or more: input stops inside the first repetition.
        t.test("zero_or_more_partial_match_1", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.zero_or_more(p.literal("ab"));
            });
            common_peg_parse_context ctx("a", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("zero_or_more_partial_match_1", true, result.need_more_input());
        });

        // Zero or more: one full repetition followed by a partial one.
        t.test("zero_or_more_partial_match_2", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.zero_or_more(p.literal("xy"));
            });
            common_peg_parse_context ctx("xyx", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("zero_or_more_partial_match_2", true, result.need_more_input());
        });

        // Zero or more: a single full repetition.
        t.test("zero_or_more_full_match", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.zero_or_more(p.literal("test"));
            });
            common_peg_parse_context ctx("test");
            auto result = parser.parse(ctx);
            t.assert_equal("zero_or_more_full_match", true, result.success());
        });

        // One or more: input stops inside the first repetition.
        t.test("one_or_more_partial_match_1", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.one_or_more(p.literal("repeat"));
            });
            common_peg_parse_context ctx("rep", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("one_or_more_partial_match_1", true, result.need_more_input());
        });

        // One or more: one full repetition followed by a partial one.
        t.test("one_or_more_partial_match_2", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.one_or_more(p.literal("ab"));
            });
            common_peg_parse_context ctx("aba", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_equal("one_or_more_partial_match_2", true, result.need_more_input());
        });

        // One or more: a single full repetition.
        t.test("one_or_more_full_match", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.one_or_more(p.literal("single"));
            });
            common_peg_parse_context ctx("single");
            auto result = parser.parse(ctx);
            t.assert_equal("one_or_more_full_match", true, result.success());
        });

        // One or more: the literal never occurs in the input.
        t.test("one_or_more_no_match", [](testing & t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.one_or_more(p.literal("()"));
            });
            common_peg_parse_context ctx("success");
            auto result = parser.parse(ctx);
            t.assert_equal("one_or_more_no_match", true, result.fail());
        });
    });

    t.test("recursive rules", [](testing & t) {
        // value := number | list,  list := "[" value "]",  number := chars("0-9")
        const auto value_grammar = [](common_peg_parser_builder & p) {
            p.rule("number", p.chars("0-9"));
            p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
            return p.rule("value", p.ref("number") | p.ref("list"));
        };

        // A bare number.
        t.test("simple_number", [&](testing & t) {
            auto value_parser = build_peg_parser(value_grammar);
            common_peg_parse_context ctx("1");
            auto result = value_parser.parse(ctx);
            t.assert_equal("result_is_success", true, result.success());
        });

        // A number wrapped in one list.
        t.test("simple_list", [&](testing & t) {
            auto value_parser = build_peg_parser(value_grammar);
            common_peg_parse_context ctx("[1]");
            auto result = value_parser.parse(ctx);
            t.assert_equal("result_is_success", true, result.success());
        });

        // Two levels of nesting.
        t.test("nested_list", [&](testing & t) {
            auto value_parser = build_peg_parser(value_grammar);
            common_peg_parse_context ctx("[[2]]");
            auto result = value_parser.parse(ctx);
            t.assert_equal("result_is_success", true, result.success());
        });

        // Three levels of nesting.
        t.test("deeply_nested_list", [&](testing & t) {
            auto value_parser = build_peg_parser(value_grammar);
            common_peg_parse_context ctx("[[[3]]]");
            auto result = value_parser.parse(ctx);
            t.assert_equal("result_is_success", true, result.success());
        });

        // Lenient mode: only opening brackets so far.
        t.test("need_more_input_match", [&](testing & t) {
            auto value_parser = build_peg_parser(value_grammar);
            common_peg_parse_context ctx("[[", COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = value_parser.parse(ctx);
            t.assert_equal("result_is_need_more_input", true, result.need_more_input());
        });

        // A letter is neither a number nor a list.
        t.test("no_match", [&](testing & t) {
            auto value_parser = build_peg_parser(value_grammar);
            common_peg_parse_context ctx("[a]");
            auto result = value_parser.parse(ctx);
            t.assert_equal("result_is_fail", true, result.fail());
        });

        // marker() is expected to accept both [..] and <..> forms.
        t.test("marker", [](testing & t) {
            auto bracket_parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.marker();
            });
            common_peg_parse_context ctx_square("[marker]");
            common_peg_parse_context ctx_sharp("<marker>");
            auto result_square = bracket_parser.parse(ctx_square);
            auto result_sharp  = bracket_parser.parse(ctx_sharp);
            t.assert_true("result_square_is_success", result_square.success());
            t.assert_true("result_sharp_is_success", result_sharp.success());
        });
    });
}

View File

@@ -0,0 +1,370 @@
#include "tests.h"
#include "json-schema-to-grammar.h"
#include <regex>
// Returns a copy of `s` with the whitespace run at the very start of the
// string, and the whitespace run following each newline, removed. Because the
// run may itself contain newlines, blank lines are collapsed as well.
static std::string trim_leading_space(const std::string & s) {
    // Group 1 captures the anchor (start of input or a newline) and is kept;
    // the whitespace after it is dropped.
    static const std::regex indent_pattern(R"((^|\n)\s+)");
    std::string stripped = std::regex_replace(s, indent_pattern, "$1");
    return stripped;
}
// Compares two GBNF grammars after removing leading whitespace from both and
// records the outcome on `t` under the label "gbnf are equal".
static void assert_gbnf_equal(testing & t, const std::string & expected, const std::string & actual) {
    const std::string normalized_expected = trim_leading_space(expected);
    const std::string normalized_actual   = trim_leading_space(actual);
    t.assert_equal("gbnf are equal", normalized_expected, normalized_actual);
}
// Checks the GBNF text produced from PEG parsers. Every case builds a parser,
// renders it through build_grammar(), and compares it with the expected
// grammar (leading whitespace is ignored by assert_gbnf_equal).
void test_gbnf_generation(testing &t) {
    // A single literal becomes a quoted string.
    t.test("literal grammar generation", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("hello");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "hello"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // A character class with min = max = 1 is emitted without a repetition suffix.
    t.test("char class grammar", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.chars("[a-z]", 1, 1);
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= [a-z]
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // Sequences are emitted as space-separated items.
    t.test("sequence grammar", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("hello") + p.literal(" ") + p.literal("world");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "hello" " " "world"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // Choices are emitted with '|'.
    t.test("choice grammar", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("cat") | p.literal("dog");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "cat" | "dog"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // one_or_more maps to the '+' suffix.
    t.test("one_or_more grammar", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.one_or_more(p.literal("a"));
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "a"+
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // zero_or_more maps to the '*' suffix.
    t.test("zero_or_more grammar", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.zero_or_more(p.literal("a"));
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "a"*
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // optional maps to the '?' suffix.
    t.test("optional grammar", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("hello") + p.optional(p.literal(" world"));
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "hello" " world"?
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // until(delim) expands into a prefix-exclusion pattern over the delimiter.
    t.test("until grammar", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.until("</tag>");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])*
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // A choice under a repetition is wrapped in parentheses.
    t.test("complex expressions with parentheses", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.one_or_more(p.literal("a") | p.literal("b"));
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= ("a" | "b")+
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // Named rules are emitted separately and referenced by name.
    t.test("rule references", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            auto digit_rule = p.rule("digit", p.chars("[0-9]", 1, 1));
            return p.one_or_more(digit_rule);
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
digit ::= [0-9]
root ::= digit+
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // Newlines inside a literal are escaped in the output.
    t.test("escaping in literals", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("hello\nworld\n!");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "hello\nworld\n!"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // operator<< inserts a reference to the `space` rule between operands.
    t.test("operator<< (whitespace insertion)", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("hello") << p.literal("world");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "hello" space "world"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // The "orphan" rule is defined but never referenced, so it is not emitted.
    t.test("emit only reachable rules", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            p.rule("orphan", p.literal("orphan"));
            return p.literal("hello") + p.rule("child", p.literal(" world"));
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
child ::= " world"
root ::= "hello" child
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // A tag is transparent; the choice it wraps is parenthesized in a sequence.
    t.test("tagged choice inside sequence gets parenthesized", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("a") + p.tag("t", p.literal("b") | p.literal("c"));
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "a" ("b" | "c")
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // NOTE(review): despite the test name, the expected grammar below has no
    // parentheses around the tagged sequence.
    t.test("tagged sequence inside choice gets parenthesized", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.tag("t", p.literal("a") + p.literal("b")) | p.literal("c");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "a" "b" | "c"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // atomic is transparent; the choice it wraps is parenthesized under '+'.
    t.test("atomic choice inside repetition gets parenthesized", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.one_or_more(p.atomic(p.literal("a") | p.literal("b")));
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= ("a" | "b")+
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // gbnf(parser, "") contributes nothing to the emitted grammar.
    t.test("silent parser emits nothing in gbnf", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("hello") + p.gbnf(p.literal("world"), "");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "hello"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // A silenced choice in the middle of a sequence disappears entirely.
    t.test("silent choice inside sequence emits nothing", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("a") + p.gbnf(p.literal("b") | p.literal("c"), "") + p.literal("d");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "a" "d"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // Wrapping the silenced parser in a tag still emits nothing.
    t.test("silent wrapped in tag emits nothing", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("a") + p.tag("t", p.gbnf(p.literal("b"), ""));
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "a"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // A non-empty gbnf() override replaces the wrapped parser's grammar.
    t.test("gbnf parser emits custom grammar", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("a") + p.gbnf(p.literal("b"), "[a-z]+");
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "a" [a-z]+
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // tag(atomic(choice)) in a sequence: the inner choice is parenthesized.
    t.test("nested transparent wrappers get parenthesized", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("x") + p.tag("outer", p.atomic(p.literal("a") | p.literal("b")));
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= "x" ("a" | "b")
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
    });

    // rule-2 and rule-4 are registered with the extra `true` argument. The
    // regular build emits the full chain from rule-1; the build with the second
    // argument set to true roots the grammar at those two rules instead.
    t.test("emit only trigger rules (and references)", [](testing &t) {
        auto peg = build_peg_parser([](common_peg_parser_builder & p) {
            auto entry = p.rule("rule-1", p.literal("a") + p.ref("rule-2"));
            p.rule("rule-2", p.literal("b") + p.ref("rule-3"), true);
            p.rule("rule-3", p.literal("c") + p.ref("rule-4"));
            p.rule("rule-4", p.literal("d"), true);
            return entry;
        });
        auto grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb);
        });
        assert_gbnf_equal(t, R"""(
root ::= rule-1
rule-1 ::= "a" rule-2
rule-2 ::= "b" rule-3
rule-3 ::= "c" rule-4
rule-4 ::= "d"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", grammar);
        auto lazy_grammar = build_grammar([&](const common_grammar_builder & gb) {
            peg.build_grammar(gb, true);
        });
        assert_gbnf_equal(t, R"""(
root ::= rule-2 | rule-4
rule-2 ::= "b" rule-3
rule-3 ::= "c" rule-4
rule-4 ::= "d"
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""", lazy_grammar);
    });
}

View File

@@ -0,0 +1,109 @@
#include "tests.h"
// Checks the built-in JSON parser on complete documents, on truncated
// documents parsed in lenient mode, and on a single constrained object member.
void test_json_parser(testing &t) {
    // Grammar shared by the whole-document cases below; each case still builds
    // its own parser from it.
    const auto json_grammar = [](common_peg_parser_builder & p) { return p.json(); };

    // Flat object with string, number and boolean values.
    t.test("simple JSON object parsing", [&](testing &t) {
        auto parser = build_peg_parser(json_grammar);
        std::string input = R"({"name": "test", "value": 42, "flag": true})";
        common_peg_parse_context ctx(input);
        auto result = parser.parse(ctx);
        t.assert_equal("result_is_success", true, result.success());
        t.assert_equal("result_end", input.size(), result.end);
    });

    // Array mixing numbers, a string, a boolean and null.
    t.test("JSON array with mixed types", [&](testing &t) {
        auto parser = build_peg_parser(json_grammar);
        std::string input = R"([1, "hello", true, null, 3.14])";
        common_peg_parse_context ctx(input);
        auto result = parser.parse(ctx);
        t.assert_equal("result_is_success", true, result.success());
        t.assert_equal("result_end", input.size(), result.end);
    });

    // Objects and arrays nested inside each other.
    t.test("nested JSON with objects and arrays", [&](testing &t) {
        auto parser = build_peg_parser(json_grammar);
        std::string input =
            R"({"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], "count": 2, "metadata": {"version": "1.0", "tags": ["admin", "user"]}})";
        common_peg_parse_context ctx(input);
        auto result = parser.parse(ctx);
        t.assert_equal("result_is_success", true, result.success());
        t.assert_equal("result_end", input.size(), result.end);
    });

    // Lenient mode: object cut off where a value is expected.
    t.test("need_more_input() parsing - incomplete object", [&](testing &t) {
        auto parser = build_peg_parser(json_grammar);
        std::string input = R"({"name": "test", "value": )";
        common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
        auto result = parser.parse(ctx);
        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
    });

    // Lenient mode: array cut off after a separator.
    t.test("need_more_input() parsing - incomplete array", [&](testing &t) {
        auto parser = build_peg_parser(json_grammar);
        std::string input = R"([1, 2, 3, )";
        common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
        auto result = parser.parse(ctx);
        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
    });

    // Lenient mode: nested object cut off where a value is expected.
    t.test("need_more_input() parsing - incomplete nested structure", [&](testing &t) {
        auto parser = build_peg_parser(json_grammar);
        std::string input = R"({"data": {"nested": )";
        common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
        auto result = parser.parse(ctx);
        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
    });

    // json_member("name", ...) with a quoted lowercase value; one parser is
    // shared by the three sub-cases.
    t.test("object member", [](testing &t) {
        auto member_parser = build_peg_parser([](common_peg_parser_builder & p) {
            return p.json_member("name", "\"" + p.chars("[a-z]") + "\"");
        });

        // Complete member.
        t.test("success", [&](testing &t) {
            std::string input = R"("name": "bob")";
            common_peg_parse_context ctx(input);
            auto result = member_parser.parse(ctx);
            t.assert_true("success", result.success());
        });

        // Lenient mode: value string is not closed yet.
        t.test("partial", [&](testing &t) {
            std::string input = R"("name": "bo)";
            common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = member_parser.parse(ctx);
            t.assert_true("need more input", result.need_more_input());
        });

        // Input that is not a member at all.
        t.test("failed", [&](testing &t) {
            std::string input = R"([])";
            common_peg_parse_context ctx(input);
            auto result = member_parser.parse(ctx);
            t.assert_true("fail", result.fail());
        });
    });
}

View File

@@ -0,0 +1,28 @@
#include "tests.h"
// Round-trips a parser through its JSON form and checks that the deserialized
// parser behaves like the original on the same input; also benchmarks
// deserialization.
void test_json_serialization(testing &t) {
    auto original = build_peg_parser([](common_peg_parser_builder & p) {
        return "<tool_call>" + p.json() + "</tool_call>";
    });
    auto json_serialized = original.to_json().dump();

    t.test("compare before/after", [&](testing &t) {
        auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized));

        // The grammar requires the <tool_call> wrapper around the JSON payload.
        // Without it both parsers fail immediately and the comparisons below
        // would pass on two identical failures instead of two successes.
        std::string input = R"(<tool_call>{"name": "test", "values": [1, 2, 3], "nested": {"a": true}}</tool_call>)";
        common_peg_parse_context ctx1(input);
        common_peg_parse_context ctx2(input);
        auto result1 = original.parse(ctx1);
        auto result2 = deserialized.parse(ctx2);

        // Pin the original to success so "both_succeed" really means success.
        t.assert_true("original_succeeds", result1.success());
        t.assert_equal("both_succeed", result1.success(), result2.success());
        t.assert_equal("same_end_pos", result1.end, result2.end);
    });

    // Measure how long rebuilding the parser from its serialized form takes.
    t.bench("deserialize", [&]() {
        auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized));
    }, 100);
}

View File

@@ -0,0 +1,318 @@
#include "tests.h"
// Tests Python-literal parsing: p.python_value() on dicts, lists, quoting styles and
// keywords; p.string_content('\'') on single-quoted strings; and p.json() with and
// without a pre-registered "json-string" rule that accepts both quote styles.
void test_python_dict_parser(testing &t) {
    // One scenario executed against a freshly built p.python_value() parser.
    struct value_case {
        std::string label;           // sub-test name
        std::string text;            // input handed to the parser
        bool        expect_partial;  // true: lenient parse that must report need_more_input
    };

    const std::vector<value_case> value_cases = {
        // Simple Python dict object with single quotes
        { "simple Python dict object parsing", "{'name': 'test', 'value': 42, 'flag': True}", false },
        // Python array with mixed types
        { "Python array with mixed types", "[1, 'hello', True, None, 3.14]", false },
        // Nested Python dict with objects and arrays
        { "nested Python dict with objects and arrays",
          "{'users': [{'id': 1, 'name': 'Alice'}, {'id': 2, 'name': 'Bob'}], 'count': 2, 'metadata': {'version': '1.0', 'tags': ['admin', 'user']}}",
          false },
        // Escaped single quotes inside a single-quoted string
        { "Python dict with escaped single quotes", "{'message': 'It\\'s working!'}", false },
        // Double quotes inside single quotes
        { "Python dict with double quotes inside single quotes", "{'quote': 'He said \"Hello\"'}", false },
        // The example from the requirements
        { "complex Python dict example from requirements",
          "{ 'obj' : { 'something': 1, 'other \"something\"' : 'foo\\'s bar' } }", false },
        // Truncated input: incomplete object
        { "need_more_input() parsing - incomplete object", "{'name': 'test', 'value': ", true },
        // Truncated input: incomplete single-quoted string
        { "need_more_input() parsing - incomplete single-quoted string", "{'name': 'test", true },
        // Raw unicode inside strings
        { "unicode in Python dict strings", "{'message': 'Hello, 世界!'}", false },
        // Unicode escape sequences
        { "Python dict with unicode escapes", "{'unicode': 'Hello\\u0041'}", false },
        // Double-quoted strings are accepted too
        { "Python parser accepts double-quoted strings", "{\"name\": \"test\"}", false },
        // Mixed quote styles
        { "Python parser with mixed quote styles", "{\"name\": 'test', 'value': \"hello\"}", false },
    };

    for (const auto & vc : value_cases) {
        t.test(vc.label, [&vc](testing &t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });

            if (vc.expect_partial) {
                common_peg_parse_context ctx(vc.text, COMMON_PEG_PARSE_FLAG_LENIENT);
                auto result = parser.parse(ctx);
                t.assert_equal("result_is_need_more_input", true, result.need_more_input());
                return;
            }

            common_peg_parse_context ctx(vc.text);
            auto result = parser.parse(ctx);
            t.assert_equal("result_is_success", true, result.success());
            t.assert_equal("result_end", vc.text.size(), result.end);
        });
    }

    // Python keywords are accepted; their JSON spellings are not.
    t.test("Python True/False/None", [](testing &t) {
        auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); });

        for (const char * keyword : {"True", "False", "None"}) {
            t.test(keyword, [&](testing &t) {
                std::string input = keyword;
                common_peg_parse_context ctx(input);
                auto result = parser.parse(ctx);
                t.assert_true("success", result.success());
                t.assert_equal("end", input.size(), result.end);
            });
        }

        t.test("rejects JSON-style true/false/null", [&](testing &t) {
            for (const auto & kw : {"true", "false", "null"}) {
                std::string input = kw;
                common_peg_parse_context ctx(input);
                auto result = parser.parse(ctx);
                t.assert_true(std::string("rejects ") + kw, result.fail());
            }
        });
    });

    // Single-quoted string content parser used directly, wrapped in explicit quotes.
    t.test("single-quoted string content parser", [](testing &t) {
        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
            return p.sequence({ p.literal("'"), p.string_content('\''), p.literal("'"), p.space() });
        });

        struct quoted_case {
            std::string label;
            std::string text;
        };
        const std::vector<quoted_case> complete_strings = {
            { "simple string", "'hello'" },
            { "string with escaped single quote", "'it\\'s'" },
            { "string with double quotes", "'say \"hello\"'" },
        };

        for (const auto & qc : complete_strings) {
            t.test(qc.label, [&](testing &t) {
                std::string input = qc.text;
                common_peg_parse_context ctx(input);
                auto result = parser.parse(ctx);
                t.assert_true("success", result.success());
                t.assert_equal("end", input.size(), result.end);
            });
        }

        t.test("incomplete string", [&](testing &t) {
            std::string input = "'hello";
            common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT);
            auto result = parser.parse(ctx);
            t.assert_true("need_more_input", result.need_more_input());
        });
    });

    // json() with a pre-registered flexible json-string rule (python dict support)
    t.test("json() parser with flexible json-string rule", [](testing &t) {
        t.test("json() rejects single quotes by default", [&](testing &t) {
            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
                return p.json();
            });
            std::string input = "{'name': 'test'}";
            common_peg_parse_context ctx(input);
            auto result = parser.parse(ctx);
            t.assert_true("fail", result.fail());
        });

        // Grammar shared by the remaining sub-tests: pre-register json-string with
        // both quote styles, then build the regular json() parser on top of it.
        const auto flexible_json = [](common_peg_parser_builder & p) {
            p.rule("json-string", [&]() {
                return p.choice({ p.double_quoted_string(), p.single_quoted_string() });
            });
            return p.json();
        };

        struct flexible_case {
            std::string label;
            std::string text;
        };
        const std::vector<flexible_case> flexible_cases = {
            { "json() accepts single quotes with pre-registered flexible json-string rule", "{'name': 'test'}" },
            { "json() still accepts double quotes with flexible json-string rule", "{\"name\": \"test\"}" },
            { "json() accepts mixed quote styles with flexible json-string rule", "{\"name\": 'test', 'value': \"hello\"}" },
            { "complex nested structure with flexible json-string rule",
              "{ 'obj' : { 'something': 1, 'other \"something\"' : 'foo\\'s bar' } }" },
        };

        for (const auto & fc : flexible_cases) {
            t.test(fc.label, [&](testing &t) {
                auto parser = build_peg_parser(flexible_json);
                std::string input = fc.text;
                common_peg_parse_context ctx(input);
                auto result = parser.parse(ctx);
                t.assert_true("success", result.success());
                t.assert_equal("end", input.size(), result.end);
            });
        }
    });
}

View File

@@ -0,0 +1,446 @@
#include "tests.h"
#include "peg-parser.h"
#include <string>
#include <sstream>
#include <iomanip>
#include <cctype>
// Asserts that two parse result types match, comparing them through their
// common_peg_parse_result_type_name() representation.
static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
    const auto expected_name = common_peg_parse_result_type_name(expected);
    const auto actual_name   = common_peg_parse_result_type_name(actual);
    t.assert_equal(expected_name, actual_name);
}
// Renders a byte string for display: bytes that std::isprint() accepts are copied
// as-is, every other byte becomes a "\x" escape followed by two lowercase hex digits.
static std::string hex_dump(const std::string& str) {
    static const char hex_digits[] = "0123456789abcdef";

    std::string rendered;
    for (const unsigned char byte : str) {
        if (!std::isprint(byte)) {
            rendered += "\\x";
            rendered += hex_digits[byte >> 4];    // high nibble
            rendered += hex_digits[byte & 0x0F];  // low nibble
            continue;
        }
        rendered += static_cast<char>(byte);
    }
    return rendered;
}
// Tests how the PEG parser primitives any(), chars() with unicode ranges, until() and
// string_content() handle UTF-8 input: complete sequences, sequences truncated at the
// end of the input, and malformed bytes.
// Each group is table-driven: every entry becomes a sub-test named after its index
// and a hex dump of its input.
void test_unicode(testing &t) {
// One table entry shared by all groups below.
struct test_case {
// Raw bytes handed to the parser
std::string input;
// Expected input[result.start, result.end) (only checked where a group compares matched text)
std::string expected_text;
// Expected result type of the parse
common_peg_parse_result_type expected_result;
};
// any(): one or more characters followed by end of input
t.test("any", [](testing &t) {
std::vector<test_case> test_cases {
// Valid UTF-8 sequences
{"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
{std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
{std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
{std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
// Incomplete UTF-8 sequences (partial bytes at end)
{std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
{std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
{std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
// Invalid/malformed UTF-8 sequences
// (expected_text is not compared for FAIL results, see the check below)
{std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
{std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
{std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
};
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.sequence({p.one_or_more(p.any()), p.end()});
});
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
// NOTE(review): the lenient flag is used wherever NEED_MORE_INPUT is expected;
// presumably it is what enables that result -- confirm in peg-parser.h
common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
auto result = parser.parse(ctx);
// Assert result type matches
assert_result_equal(t, tc.expected_result, result.type);
// Assert matched text if success or need_more_input
if (result.success() || result.need_more_input()) {
std::string matched = tc.input.substr(result.start, result.end - result.start);
t.assert_equal(tc.expected_text, matched);
}
});
}
});
// chars(): character classes built from \u / \U code point ranges
t.test("char classes", [](testing &t) {
t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) {
std::vector<test_case> test_cases {
// Within range - CJK Unified Ideographs
{std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
{std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
{std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
{std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
// Outside range - should fail
{"a", "", COMMON_PEG_PARSE_RESULT_FAIL}, // ASCII
{std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+4DFF (before range)
{std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+A000 (after range)
// Incomplete sequences in range
{std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+4E00
{std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+597D
};
// Exactly one character from the range, then end of input
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
});
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
auto result = parser.parse(ctx);
// Assert result type matches
assert_result_equal(t, tc.expected_result, result.type);
// Assert matched text if success or need_more_input
if (result.success() || result.need_more_input()) {
std::string matched = tc.input.substr(result.start, result.end - result.start);
t.assert_equal(tc.expected_text, matched);
}
});
}
});
t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) {
std::vector<test_case> test_cases {
// Within range - Emoticons (all 4-byte UTF-8)
{std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
{std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
{std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
// Outside range
{std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
{std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
{std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
// Incomplete sequences
{std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
{std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Very incomplete
};
// The range uses the 8-digit \U escape form for code points above U+FFFF
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
});
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
auto result = parser.parse(ctx);
// Assert result type matches
assert_result_equal(t, tc.expected_result, result.type);
// Assert matched text if success or need_more_input
if (result.success() || result.need_more_input()) {
std::string matched = tc.input.substr(result.start, result.end - result.start);
t.assert_equal(tc.expected_text, matched);
}
});
}
});
t.test("mixed unicode ranges", [](testing &t) {
std::vector<test_case> test_cases {
// Match CJK
{std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
{std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
// Match emoticons
{std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
// Match ASCII digits
{"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
// Don't match outside any range
{"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
{std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
// Incomplete
{std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
{std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
};
// One class combining the CJK range, the emoticon range and ASCII digits
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
});
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
auto result = parser.parse(ctx);
// Assert result type matches
assert_result_equal(t, tc.expected_result, result.type);
// Assert matched text if success or need_more_input
if (result.success() || result.need_more_input()) {
std::string matched = tc.input.substr(result.start, result.end - result.start);
t.assert_equal(tc.expected_text, matched);
}
});
}
});
});
// until(): content up to the "</tag>" delimiter
t.test("until parser", [](testing &t) {
t.test("ASCII delimiter with Unicode content", [](testing &t) {
// Expected text is the content before the delimiter; the delimiter is not part of the match
std::vector<test_case> test_cases {
// CJK characters before delimiter
{std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
// Emoji before delimiter
{std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
// Mixed content
{std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
};
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.until("</tag>");
});
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
// Complete input: parsed with the default (non-lenient) context
common_peg_parse_context ctx(tc.input);
auto result = parser.parse(ctx);
assert_result_equal(t, tc.expected_result, result.type);
if (result.success()) {
std::string matched = tc.input.substr(result.start, result.end - result.start);
t.assert_equal(tc.expected_text, matched);
}
});
}
});
t.test("incomplete UTF-8 at end", [](testing &t) {
// No delimiter is present in any of these inputs, so every case expects NEED_MORE_INPUT
std::vector<test_case> test_cases {
// Incomplete emoji at end, no delimiter
{std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
// Incomplete CJK at end, no delimiter
{std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
// Complete content, no delimiter (should consume all valid UTF-8)
{std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
};
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.until("</tag>");
});
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
auto result = parser.parse(ctx);
assert_result_equal(t, tc.expected_result, result.type);
if (result.success() || result.need_more_input()) {
std::string matched = tc.input.substr(result.start, result.end - result.start);
t.assert_equal(tc.expected_text, matched);
}
});
}
});
t.test("malformed UTF-8", [](testing &t) {
// Only the result type is checked in this group; expected_text is unused
std::vector<test_case> test_cases {
// Invalid UTF-8 bytes
{std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
// Continuation byte without lead byte
{std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
// Invalid continuation byte
{std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
};
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.until("</tag>");
});
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
common_peg_parse_context ctx(tc.input);
auto result = parser.parse(ctx);
assert_result_equal(t, tc.expected_result, result.type);
});
}
});
});
// string_content('"'): the body of a double-quoted string (opening quote not included in the inputs)
t.test("json_string parser", [](testing &t) {
t.test("valid UTF-8 characters", [](testing &t) {
// Inputs end with the closing quote; expected text is the content without it
std::vector<test_case> test_cases {
// ASCII only
{"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
// 2-byte UTF-8 (accented characters)
{std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
// 3-byte UTF-8 (CJK)
{std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
// 4-byte UTF-8 (emoji)
{std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
// Mixed content
{std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
};
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
// The parser is rebuilt for every case in this group
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.sequence({p.string_content('"'), p.literal("\"")});
});
common_peg_parse_context ctx(tc.input);
auto result = parser.parse(ctx);
assert_result_equal(t, tc.expected_result, result.type);
if (result.success()) {
std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
t.assert_equal(tc.expected_text, matched);
}
});
}
});
t.test("incomplete UTF-8", [](testing &t) {
// Inputs have no closing quote and end in a truncated multi-byte sequence
std::vector<test_case> test_cases {
// Incomplete 2-byte sequence
{std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
// Incomplete 3-byte sequence
{std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
// Incomplete 4-byte sequence
{std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
// Incomplete at very start
{std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
};
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.string_content('"');
});
common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT);
auto result = parser.parse(ctx);
assert_result_equal(t, tc.expected_result, result.type);
// Matched text is only compared when the parser reports need_more_input
if (result.need_more_input()) {
std::string matched = tc.input.substr(result.start, result.end - result.start);
t.assert_equal(tc.expected_text, matched);
}
});
}
});
t.test("malformed UTF-8", [](testing &t) {
// Only the result type is checked in this group; expected_text is unused
std::vector<test_case> test_cases {
// Invalid UTF-8 bytes
{std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
// Continuation byte without lead byte
{std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
// Invalid continuation byte
{std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
};
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.string_content('"');
});
common_peg_parse_context ctx(tc.input);
auto result = parser.parse(ctx);
assert_result_equal(t, tc.expected_result, result.type);
});
}
});
t.test("escape sequences with UTF-8", [](testing &t) {
// Expected text keeps the escape sequences verbatim (backslash included), as written in the input
std::vector<test_case> test_cases {
// Unicode escape sequence
{"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
// Mix of UTF-8 and escape sequences
{std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
// Escaped quote in UTF-8 string
{std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
};
for (size_t i = 0; i < test_cases.size(); i++) {
const auto & tc = test_cases[i];
std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
t.test(test_name, [&](testing &t) {
auto parser = build_peg_parser([](common_peg_parser_builder& p) {
return p.sequence({p.string_content('"'), p.literal("\"")});
});
common_peg_parse_context ctx(tc.input);
auto result = parser.parse(ctx);
assert_result_equal(t, tc.expected_result, result.type);
if (result.success()) {
std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
t.assert_equal(tc.expected_text, matched);
}
});
}
});
});
}

25
tests/peg-parser/tests.h Normal file
View File

@@ -0,0 +1,25 @@
#pragma once
// Common includes for all test files
#include <nlohmann/json.hpp>
#include <string>
#include <vector>
#include "../testing.h"
#include "peg-parser.h"
#include "chat-peg-parser.h"
#include "simple-tokenize.h"
// Plain aggregate describing one tool call: an id, a name, and its arguments as ordered JSON.
// NOTE(review): the name suggests it is fixture data for benchmark tests; no use is visible here -- confirm.
struct bench_tool_call {
// Identifier of the call
std::string id;
// Name of the tool being called
std::string name;
// Call arguments; ordered_json keeps keys in insertion order
nlohmann::ordered_json args;
};
// Test function declarations
// Entry points of the PEG-parser test suites. Each takes the shared `testing`
// harness by reference and registers its sub-tests on it.
// NOTE(review): definitions presumably live in one source file per suite -- confirm against the build.
void test_basic(testing &t);
void test_json_parser(testing &t);
void test_gbnf_generation(testing &t);
void test_unicode(testing &t);
void test_json_serialization(testing &t);
void test_python_dict_parser(testing &t);