From 2e5c8aeab0134b4265df4c8ac48a645bc0b3bad7 Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 16:29:58 -0800
Subject: [PATCH 1/5] reserve space for codepoints

---
 llama.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index e599917a81eb1..08daaf8897b72 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6903,10 +6903,13 @@ struct llama_grammar_candidate {
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
+        size_t n_src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
+    // common english strings have the same number of codepoints and bytes.
+    code_points.reserve(n_src);
     uint32_t value = partial_start.value;
     int n_remain = partial_start.n_remain;
 
@@ -6957,6 +6960,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+    std::string src,
+    llama_partial_utf8 partial_start
+) {
+    return decode_utf8(src.c_str(), src.size(), partial_start);
+}
+
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
@@ -7580,7 +7590,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -7787,7 +7797,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
+    const auto decoded = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);

From 9d3ba0bacdb6705ec2c8ad1ce21e1dfb6f6a2db8 Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 17:27:18 -0800
Subject: [PATCH 2/5] improvement for the appended 0

---
 llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index ec23485fda22c..f2b5967d791e9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6425,8 +6425,8 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
-    // common english strings have the same number of codepoints and bytes.
-    code_points.reserve(n_src);
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(n_src + 1);
     uint32_t value = partial_start.value;
     int n_remain = partial_start.n_remain;
 

From f29add56d8d8fb8ba8e107a5096b726aeb47adf2 Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 20:47:01 -0800
Subject: [PATCH 3/5] changed allowed saving of pieces to reduce calls to
 llama_token_to_piece

---
 common/sampling.cpp |  2 +-
 llama.cpp           | 15 +++++++++++++--
 llama.h             |  4 +++-
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index 1317024c2c11c..8a4b1f4384f6b 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -166,7 +166,7 @@ llama_token llama_sampling_sample(
     }
 
     if (ctx_sampling->grammar != NULL) {
-        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar, nullptr);
     }
 
     if (temp < 0.0) {
diff --git a/llama.cpp b/llama.cpp
index f2b5967d791e9..0e87da50a4363 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7106,7 +7106,11 @@ void llama_sample_repetition_penalties(
     }
 }
 
-void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+void llama_sample_grammar(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        const struct llama_grammar * grammar,
+        char const * const * pieces) {
     GGML_ASSERT(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
@@ -7125,7 +7129,14 @@ void llama_sample_grammar(
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
+        std::string piece;
+
+        if (pieces != nullptr && pieces[id] != nullptr) {
+            piece = std::string(pieces[id]);
+        } else {
+            piece = llama_token_to_piece(ctx, id);
+        }
+
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
diff --git a/llama.h b/llama.h
index 1a62058d1406b..7f2a1a6c67dd3 100644
--- a/llama.h
+++ b/llama.h
@@ -722,7 +722,9 @@ extern "C" {
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
-            const struct llama_grammar * grammar);
+            const struct llama_grammar * grammar,
+            char const * const * pieces);
+
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.

From c788d1b57982498473de8ce3cc842f3a186e97ae Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 20:49:58 -0800
Subject: [PATCH 4/5] added docs

---
 llama.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama.h b/llama.h
index 7f2a1a6c67dd3..b15de2ccc119d 100644
--- a/llama.h
+++ b/llama.h
@@ -719,6 +719,7 @@ extern "C" {
            "use llama_sample_temp instead");
 
     /// @details Apply constraints from grammar
+    /// @param pieces an array of all null terminated strings obtained from calling llama_token_to_piece for the whole grammar. Can be nullptr in which case they will be computed.
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
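Illustration (not part of the patch series): per the @param doc added in PATCH 4/5, a caller that wants to avoid the per-candidate llama_token_to_piece calls builds the `pieces` array once per model and reuses it for every llama_sample_grammar call. A minimal caller-side sketch, assuming the llama_n_vocab accessor and the std::string-returning llama_token_to_piece helper from common/common.h; init_piece_cache, piece_storage, and piece_ptrs are illustrative names:

#include <string>
#include <vector>
#include "llama.h"
#include "common.h" // assumed: std::string llama_token_to_piece(ctx, token)

// Owns the piece bytes; reserved up front so the c_str() pointers below
// remain valid while entries are appended (no vector reallocation).
static std::vector<std::string>  piece_storage;
// The vocab-sized array of null terminated strings that llama_sample_grammar
// consumes; piece_ptrs[id] points at piece_storage[id].
static std::vector<const char *> piece_ptrs;

static void init_piece_cache(llama_context * ctx, const llama_model * model) {
    const int n_vocab = llama_n_vocab(model);
    piece_storage.reserve(n_vocab);
    piece_ptrs.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        piece_storage.push_back(llama_token_to_piece(ctx, id));
        piece_ptrs.push_back(piece_storage.back().c_str());
    }
}

// In the sampling loop, instead of passing nullptr as in PATCH 3/5:
//     llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar, piece_ptrs.data());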
From 3cc9682681afd5116252380bee9562ec8736e0e3 Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 20:50:30 -0800
Subject: [PATCH 5/5] terminology.

---
 llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.h b/llama.h
index b15de2ccc119d..9d6e1599f5988 100644
--- a/llama.h
+++ b/llama.h
@@ -719,7 +719,7 @@ extern "C" {
            "use llama_sample_temp instead");
 
     /// @details Apply constraints from grammar
-    /// @param pieces an array of all null terminated strings obtained from calling llama_token_to_piece for the whole grammar. Can be nullptr in which case they will be computed.
+    /// @param pieces an array of all null terminated strings obtained from calling llama_token_to_piece for the whole vocab. Can be nullptr in which case they will be computed.
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
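Illustration (not part of the patch series): the `+ 1` added in PATCH 2/5 covers the terminating 0 that decode_utf8 appends to every decoded sequence (the "Note terminating 0 in decoded string" comment above relies on it), so an n-byte ASCII piece decodes to n + 1 entries, and reserving only n_src would still cost one reallocation on the final push. A hypothetical check of that invariant, assuming the file-static decode_utf8 (the std::string overload from PATCH 1/5) is made visible to a test translation unit:

#include <cassert>
#include <string>

// Hypothetical test; decode_utf8 is file-static in llama.cpp, so this
// assumes it has been exposed (e.g. by including the .cpp) for testing.
static void test_decode_utf8_terminator(void) {
    const std::string piece = "hello"; // pure ASCII: one codepoint per byte
    const auto decoded = decode_utf8(piece, llama_partial_utf8{ 0, 0 });
    assert(decoded.first.size() == piece.size() + 1); // 5 codepoints + terminating 0
    assert(decoded.first.back() == 0);                // the appended terminator
}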