From 2e5c8aeab0134b4265df4c8ac48a645bc0b3bad7 Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 16:29:58 -0800
Subject: [PATCH 1/5] reserve space for codepoints

---
 llama.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index e599917a81eb1..08daaf8897b72 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6903,10 +6903,13 @@ struct llama_grammar_candidate {
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
+        size_t n_src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
+    // common english strings have the same number of codepoints and bytes.
+    code_points.reserve(n_src);
     uint32_t value = partial_start.value;
     int n_remain = partial_start.n_remain;
 
@@ -6957,6 +6960,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+    std::string src,
+    llama_partial_utf8 partial_start
+) {
+    return decode_utf8(src.c_str(), src.size(), partial_start);
+}
+
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
@@ -7580,7 +7590,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -7787,7 +7797,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
+    const auto decoded = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);

From 9d3ba0bacdb6705ec2c8ad1ce21e1dfb6f6a2db8 Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 17:27:18 -0800
Subject: [PATCH 2/5] improvement for the appended 0

---
 llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index ec23485fda22c..f2b5967d791e9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6425,8 +6425,8 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
-    // common english strings have the same number of codepoints and bytes.
-    code_points.reserve(n_src);
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(n_src + 1);
     uint32_t value = partial_start.value;
     int n_remain = partial_start.n_remain;
 

From f29add56d8d8fb8ba8e107a5096b726aeb47adf2 Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 20:47:01 -0800
Subject: [PATCH 3/5] changed allowed saving of pieces to reduce calls to
 llama_token_to_piece

---
 common/sampling.cpp |  2 +-
 llama.cpp           | 15 +++++++++++++--
 llama.h             |  4 +++-
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index 1317024c2c11c..8a4b1f4384f6b 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -166,7 +166,7 @@ llama_token llama_sampling_sample(
     }
 
     if (ctx_sampling->grammar != NULL) {
-        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar, nullptr);
     }
 
     if (temp < 0.0) {
diff --git a/llama.cpp b/llama.cpp
index f2b5967d791e9..0e87da50a4363 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7106,7 +7106,11 @@ void llama_sample_repetition_penalties(
     }
 }
 
-void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+void llama_sample_grammar(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        const struct llama_grammar * grammar,
+        char const * const * pieces) {
     GGML_ASSERT(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
@@ -7125,7 +7129,14 @@ void llama_sample_grammar(
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
+        std::string piece;
+
+        if (pieces != nullptr && pieces[id] != nullptr) {
+            piece = std::string(pieces[id]);
+        } else {
+            piece = llama_token_to_piece(ctx, id);
+        }
+
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
diff --git a/llama.h b/llama.h
index 1a62058d1406b..7f2a1a6c67dd3 100644
--- a/llama.h
+++ b/llama.h
@@ -722,7 +722,9 @@ extern "C" {
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
-            const struct llama_grammar * grammar);
+            const struct llama_grammar * grammar,
+            char const * const * pieces);
+
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.

From c788d1b57982498473de8ce3cc842f3a186e97ae Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 20:49:58 -0800
Subject: [PATCH 4/5] added docs

---
 llama.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama.h b/llama.h
index 7f2a1a6c67dd3..b15de2ccc119d 100644
--- a/llama.h
+++ b/llama.h
@@ -719,6 +719,7 @@ extern "C" {
            "use llama_sample_temp instead");
 
     /// @details Apply constraints from grammar
+    /// @param pieces an array of all null terminated strings obtained from calling llama_token_to_piece for the whole grammar. Can be nullptr in which case they will be computed.
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
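Illustration (not part of the patch series): per the @param doc added in PATCH 4/5, a caller that wants to avoid the per-candidate llama_token_to_piece calls builds the `pieces` array once per model and reuses it for every llama_sample_grammar call. A minimal caller-side sketch, assuming the llama_n_vocab accessor and the std::string-returning llama_token_to_piece helper from common/common.h; init_piece_cache, piece_storage, and piece_ptrs are illustrative names:

#include <string>
#include <vector>
#include "llama.h"
#include "common.h" // assumed: std::string llama_token_to_piece(ctx, token)

// Owns the piece bytes; reserved up front so the c_str() pointers below
// remain valid while entries are appended (no vector reallocation).
static std::vector<std::string>  piece_storage;
// The vocab-sized array of null terminated strings that llama_sample_grammar
// consumes; piece_ptrs[id] points at piece_storage[id].
static std::vector<const char *> piece_ptrs;

static void init_piece_cache(llama_context * ctx, const llama_model * model) {
    const int n_vocab = llama_n_vocab(model);
    piece_storage.reserve(n_vocab);
    piece_ptrs.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        piece_storage.push_back(llama_token_to_piece(ctx, id));
        piece_ptrs.push_back(piece_storage.back().c_str());
    }
}

// In the sampling loop, instead of passing nullptr as in PATCH 3/5:
//     llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar, piece_ptrs.data());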
From 3cc9682681afd5116252380bee9562ec8736e0e3 Mon Sep 17 00:00:00 2001
From: marcus
Date: Fri, 24 Nov 2023 20:50:30 -0800
Subject: [PATCH 5/5] terminology.

---
 llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.h b/llama.h
index b15de2ccc119d..9d6e1599f5988 100644
--- a/llama.h
+++ b/llama.h
@@ -719,7 +719,7 @@ extern "C" {
            "use llama_sample_temp instead");
 
     /// @details Apply constraints from grammar
-    /// @param pieces an array of all null terminated strings obtained from calling llama_token_to_piece for the whole grammar. Can be nullptr in which case they will be computed.
+    /// @param pieces an array of all null terminated strings obtained from calling llama_token_to_piece for the whole vocab. Can be nullptr in which case they will be computed.
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
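Illustration (not part of the patch series): the `+ 1` added in PATCH 2/5 covers the terminating 0 that decode_utf8 appends to every decoded sequence (the "Note terminating 0 in decoded string" comment above relies on it), so an n-byte ASCII piece decodes to n + 1 entries, and reserving only n_src would still cost one reallocation on the final push. A hypothetical check of that invariant, assuming the file-static decode_utf8 (the std::string overload from PATCH 1/5) is made visible to a test translation unit:

#include <cassert>
#include <string>

// Hypothetical test; decode_utf8 is file-static in llama.cpp, so this
// assumes it has been exposed (e.g. by including the .cpp) for testing.
static void test_decode_utf8_terminator(void) {
    const std::string piece = "hello"; // pure ASCII: one codepoint per byte
    const auto decoded = decode_utf8(piece, llama_partial_utf8{ 0, 0 });
    assert(decoded.first.size() == piece.size() + 1); // 5 codepoints + terminating 0
    assert(decoded.first.back() == 0);                // the appended terminator
}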