patches.diff (llama.cpp patches, from a fork of dusty-nv/jetson-containers)
diff --git a/ggml.h b/ggml.h
index bdbd128..28a806a 100644
--- a/ggml.h
+++ b/ggml.h
@@ -255,9 +255,11 @@
extern "C" {
#endif
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON) && !defined(__CUDACC__)
// we use the built-in 16-bit float type
typedef __fp16 ggml_fp16_t;
+#elif defined(__ARM_NEON) && defined(__CUDACC__)
+ typedef half ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
#endif
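
A note on the hunk above: `__fp16` is an Arm C extension that gcc/clang provide on AArch64, but nvcc's front end does not accept it when ggml.h is included from CUDA translation units, so under `__CUDACC__` the patch substitutes CUDA's `half`, which has the same 2-byte storage and keeps the tensor layout unchanged. A minimal sketch (not part of the patch) of how the typedef resolves, assuming `<cuda_fp16.h>` is available when compiling with nvcc:

/* Illustrative only: which concrete type ggml_fp16_t becomes per compiler. */
#if defined(__ARM_NEON) && !defined(__CUDACC__)
typedef __fp16 ggml_fp16_t;      /* native Arm half-float (gcc/clang) */
#elif defined(__ARM_NEON) && defined(__CUDACC__)
#include <cuda_fp16.h>           /* provides CUDA's `half` */
typedef half ggml_fp16_t;        /* nvcc path: same 2-byte storage */
#else
#include <stdint.h>
typedef uint16_t ggml_fp16_t;    /* raw bits, converted explicitly elsewhere */
#endif

#include <cstdio>

int main() {
    /* Whichever branch is taken, the type stays 2 bytes wide, so model
       files remain interchangeable across toolchains. */
    std::printf("sizeof(ggml_fp16_t) = %zu\n", sizeof(ggml_fp16_t));
    return 0;
}
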
diff --git a/llama.cpp b/llama.cpp
index c8ab313..2e9cc90 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -572,6 +572,13 @@ struct llama_file_loader {
uint32_t len = file.read_u32();
std::string word = file.read_string(len);
+ if (i == 0)
+ word = "<unk>";
+ else if (i == 1)
+ word = "<s>";
+ else if (i == 2)
+ word = "</s>";
+
float score = 0.0f;
file.read_raw(&score, sizeof(score));
@@ -580,6 +587,8 @@ struct llama_file_loader {
auto & tok_score = vocab.id_to_token[i];
tok_score.tok = std::move(word);
tok_score.score = score;
+
+ //printf("vocab token %u %s score=%f\n", i, tok_score.tok.c_str(), tok_score.score);
}
}
void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
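
The two hunks above pin the text of vocab ids 0-2 while the vocabulary is read. My reading, which the patch itself does not state: some converted GGML files carry placeholder strings for SentencePiece's three reserved control tokens, so the loader overrides them by id to guarantee the exact strings that the special-token parsing added further down matches on; the commented-out printf is a leftover debugging aid. The same override, factored out as a hypothetical helper:

/* Sketch only: the id -> text override applied while reading the vocab. */
#include <cstdint>
#include <string>

static std::string override_special(uint32_t i, std::string word) {
    switch (i) {
        case 0:  return "<unk>";  /* unknown */
        case 1:  return "<s>";    /* beginning of sequence */
        case 2:  return "</s>";   /* end of sequence */
        default: return word;     /* every other id keeps the file's text */
    }
}
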
@@ -1975,18 +1984,19 @@ struct llama_sp_bigram {
struct llama_tokenizer {
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ void tokenize(const char * text, size_t len, std::vector<llama_vocab::id> & output) {
+ symbols_.clear();
// split string into utf8 chars
int index = 0;
size_t offs = 0;
- while (offs < text.size()) {
+ while (offs < len) {
llama_sp_symbol sym;
- size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
- sym.text = text.c_str() + offs;
+ size_t char_len = std::min(len - offs, utf8_len(text[offs]));
+ sym.text = text + offs;
sym.n = char_len;
offs += char_len;
sym.prev = index - 1;
- sym.next = offs == text.size() ? -1 : index + 1;
+ sym.next = offs == len ? -1 : index + 1;
index++;
symbols_.emplace_back(sym);
}
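
The hunk above changes `tokenize` from a `std::string` to a raw `(pointer, length)` interface so the special-token pass below can tokenize substrings of the prompt in place, without copying, and `symbols_.clear()` makes one tokenizer instance safe to call repeatedly, which that pass does. A sketch of the UTF-8 splitting the loop performs; `utf8_len` here is paraphrased from the lookup-table helper llama.cpp already defines (the top nibble of the lead byte gives the sequence length):

/* Sketch: walking a (text, len) buffer one UTF-8 character at a time,
   as the patched loop does. */
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>

static size_t utf8_len(char src) {
    static const size_t lookup[] = {1,1,1,1,1,1,1,1,1,1,1,1,2,2,3,4};
    return lookup[static_cast<uint8_t>(src) >> 4];
}

int main() {
    const char * text = "h\xC3\xA9llo";  /* "héllo": the 'é' is 2 bytes */
    size_t len = std::strlen(text);
    for (size_t offs = 0; offs < len; ) {
        /* std::min guards against a truncated multi-byte sequence at the end. */
        size_t char_len = std::min(len - offs, utf8_len(text[offs]));
        std::printf("symbol at byte %zu spans %zu byte(s)\n", offs, char_len);
        offs += char_len;
    }
    return 0;
}
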
@@ -2089,7 +2099,47 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
output.push_back(llama_token_bos());
}
- tokenizer.tokenize(text, output);
+ // add parsing of special BOS/EOS tokens (https://github.com/ggerganov/llama.cpp/pull/1931)
+ std::unordered_map<llama_vocab::token, llama_vocab::id> special_token_to_id;
+
+ special_token_to_id["<unk>"] = 0;
+ special_token_to_id["<s>"] = 1;
+ special_token_to_id["</s>"] = 2;
+
+ size_t delim_start = 0;
+ size_t last_delim_end = 0;
+
+ while (delim_start < text.size()) {
+ size_t delim_end = 0;
+ llama_vocab::id token_id = -1;
+
+ for (const auto & mit : special_token_to_id) {
+ const std::string & delimiter = mit.first;
+ size_t end = delim_start + delimiter.size();
+ if (end <= text.size() && text.compare(delim_start, delimiter.size(), delimiter) == 0) {
+ if (token_id == -1 || end > delim_end) {
+ token_id = mit.second;
+ delim_end = end;
+ }
+ }
+ }
+
+ if (token_id != -1) {
+ if (last_delim_end < delim_start) {
+ tokenizer.tokenize(text.c_str() + last_delim_end, delim_start - last_delim_end, output);
+ }
+ output.push_back(token_id);
+ delim_start = delim_end;
+ last_delim_end = delim_end;
+ } else {
+ delim_start++;
+ }
+ }
+
+ if (last_delim_end < text.size()) {
+ tokenizer.tokenize(text.c_str() + last_delim_end, text.size() - last_delim_end, output);
+ }
+
return output;
}
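
End-to-end behavior of the added loop: it scans the prompt byte by byte, checking every special token at each position and preferring the longest match; the text between matches goes through the ordinary tokenizer, while each match is emitted directly as its token id. A self-contained sketch of the same scan under the same three-token map, printing segments instead of calling the real tokenizer:

/* Sketch: the special-token scan from the patch, on a plain string. */
#include <cstdio>
#include <string>
#include <unordered_map>

int main() {
    const std::unordered_map<std::string, int> special = {
        {"<unk>", 0}, {"<s>", 1}, {"</s>", 2},
    };
    const std::string text = "<s>Hello world</s>";

    size_t delim_start = 0;
    size_t last_delim_end = 0;
    while (delim_start < text.size()) {
        size_t delim_end = 0;
        int token_id = -1;
        for (const auto & kv : special) {
            const size_t end = delim_start + kv.first.size();
            /* Keep the longest special token matching at this position. */
            if (end <= text.size() &&
                text.compare(delim_start, kv.first.size(), kv.first) == 0 &&
                (token_id == -1 || end > delim_end)) {
                token_id = kv.second;
                delim_end = end;
            }
        }
        if (token_id != -1) {
            if (last_delim_end < delim_start) {
                /* The patch calls tokenizer.tokenize() on this span. */
                std::printf("literal: \"%s\"\n",
                    text.substr(last_delim_end, delim_start - last_delim_end).c_str());
            }
            std::printf("special token id: %d\n", token_id);
            delim_start = last_delim_end = delim_end;
        } else {
            delim_start++;
        }
    }
    if (last_delim_end < text.size()) {
        std::printf("literal: \"%s\"\n", text.substr(last_delim_end).c_str());
    }
    return 0;
}

For "<s>Hello world</s>" this prints id 1, the literal "Hello world", then id 2, which is why the vocab override earlier in the patch matters: without the exact `<s>`/`</s>` strings in the vocabulary, the delimiters would be tokenized as ordinary text.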