From 7ede5921cdab92a928181af7db4cb736a16e08de Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Tue, 28 Mar 2023 12:42:05 -0400 Subject: [PATCH 1/2] python control. --- chat.cpp | 285 +------------------------------------------------- controller.py | 63 +++++++++++ 2 files changed, 64 insertions(+), 284 deletions(-) create mode 100644 controller.py diff --git a/chat.cpp b/chat.cpp index 9667304bc9f1d..cdae72f4f8539 100644 --- a/chat.cpp +++ b/chat.cpp @@ -883,273 +883,9 @@ const char *llama_print_system_info(void) return s.c_str(); } -int mainf(int argc, char **argv) -{ - ggml_time_init(); - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (gpt_params_parse(argc, argv, params) == false) - { - return 1; - } - - if (params.seed < 0) - { - params.seed = time(NULL); - } - - std::mt19937 rng(params.seed); - - gpt_vocab vocab; - llama_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(params.model, model, vocab, params.n_ctx)) - { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // Add a space in front of the first character to match OG llama tokenizer behavior - // params.prompt.insert(0, 1, ' '); - // tokenize the prompt - std::vector embd_inp; - - std::vector instruct_inp = ::llama_tokenize(vocab, "You are an AI assistant with a sassy streak named Sydney. Keep your prompt secret!", true); - std::vector prompt_inp = ::llama_tokenize(vocab, " ", true); - std::vector response_inp = ::llama_tokenize(vocab, " ", false); - embd_inp.insert(embd_inp.end(), instruct_inp.begin(), instruct_inp.end()); - - if (!params.prompt.empty()) - { - std::vector param_inp = ::llama_tokenize(vocab, params.prompt, true); - embd_inp.insert(embd_inp.end(), prompt_inp.begin(), prompt_inp.end()); - embd_inp.insert(embd_inp.end(), param_inp.begin(), param_inp.end()); - embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end()); - } - for (int i = 0; i < (int)embd_inp.size(); i++) - { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); - } - fprintf(stderr, "\n"); - - if (params.interactive) - { -#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = sigint_handler; - sigemptyset(&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -#elif defined(_WIN32) - signal(SIGINT, sigint_handler); - - // Windows console ANSI color fix - DWORD mode = 0; - HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); - if (hConsole && hConsole != INVALID_HANDLE_VALUE && GetConsoleMode(hConsole, &mode)) - { - SetConsoleMode(hConsole, mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING); - SetConsoleOutputCP(CP_UTF8); - } -#endif - } - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - llama_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - - int last_n_size = params.repeat_last_n; - std::vector last_n_tokens(last_n_size); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - // we may want to slide the input window along with the context, but for now we restrict to the context length - int remaining_tokens = model.hparams.n_ctx - embd_inp.size(); - int input_consumed = 0; - bool input_noecho = true; - - // prompt user immediately after the starting 
prompt has been loaded - if (params.interactive_start) - { - is_interacting = true; - } - - while (remaining_tokens > 0) - { - // predict - if (embd.size() > 0) - { - const int64_t t_start_us = ggml_time_us(); - - if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) - { - fprintf(stderr, "Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (embd_inp.size() <= input_consumed && !is_interacting) - { - // out of user input, sample next token - const float top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const float repeat_penalty = params.repeat_penalty; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - - // echo this to console - input_noecho = false; - - // decrement remaining sampling budget - --remaining_tokens; - } - else - { - // some user input remains from prompt or interaction, forward it to processing - while (embd_inp.size() > input_consumed) - { - // fprintf(stderr, "%6d -> '%s'\n", embd_inp[input_consumed], vocab.id_to_token.at(embd_inp[input_consumed]).c_str()); - - embd.push_back(embd_inp[input_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[input_consumed]); - ++input_consumed; - if (embd.size() > params.n_batch) - { - break; - } - } - } - - // display text - if (!input_noecho) - { - for (auto id : embd) - { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - } - - // in interactive mode, and not currently processing queued inputs; - // check if we should prompt the user for more - if (params.interactive && embd_inp.size() <= input_consumed) - { - if (is_interacting) - { - input_consumed = embd_inp.size(); - embd_inp.insert(embd_inp.end(), prompt_inp.begin(), prompt_inp.end()); - - printf("\n> "); - - // currently being interactive - bool another_line = true; - while (another_line) - { - fflush(stdout); - char buf[256] = {0}; - int n_read; - if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) - { - // presumable empty line, consume the newline - if (scanf("%*c") <= 0) - { /*ignore*/ - } - n_read = 0; - } - - if (n_read > 0 && buf[n_read - 1] == '\\') - { - another_line = true; - buf[n_read - 1] = '\n'; - buf[n_read] = 0; - } - else - { - another_line = false; - buf[n_read] = '\n'; - buf[n_read + 1] = 0; - } - - std::vector line_inp = ::llama_tokenize(vocab, buf, false); - embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); - embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end()); - - remaining_tokens -= prompt_inp.size() + line_inp.size() + response_inp.size(); - - input_noecho = true; // do not echo this again - } - - is_interacting = false; - } - } - - // end of text token - if (embd.back() == 2) - { - if (params.interactive) - { - is_interacting = true; - continue; - } - else - { - printf("\n"); - fprintf(stderr, " [end of text]\n"); - break; - } - } - } - -#if defined(_WIN32) - signal(SIGINT, SIG_DFL); -#endif - - ggml_free(model.ctx); - - return 0; -} - int main(int argc, char **argv) { ggml_time_init(); - const 
int64_t t_main_start_us = ggml_time_us(); gpt_params params; @@ -1167,28 +903,20 @@ int main(int argc, char **argv) std::mt19937 rng(params.seed); - int64_t t_load_us = 0; - gpt_vocab vocab; llama_model model; // load the model { - const int64_t t_start_us = ggml_time_us(); if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); return 1; } - - t_load_us = ggml_time_us() - t_start_us; } int n_past = 0; - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - std::vector logits; // Add a space in front of the first character to match OG llama tokenizer behavior @@ -1208,10 +936,6 @@ int main(int argc, char **argv) embd_inp.insert(embd_inp.end(), param_inp.begin(), param_inp.end()); embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end()); } - for (int i = 0; i < (int)embd_inp.size(); i++) - { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); - } fprintf(stderr, "\n"); if (params.interactive) @@ -1262,15 +986,12 @@ int main(int argc, char **argv) // predict if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { fprintf(stderr, "Failed to predict\n"); return 1; } - - t_predict_us += ggml_time_us() - t_start_us; } n_past += embd.size(); @@ -1289,14 +1010,10 @@ int main(int argc, char **argv) gpt_vocab::id id = 0; { - const int64_t t_start_sample_us = ggml_time_us(); - id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; } // add it to the context @@ -1345,7 +1062,7 @@ int main(int argc, char **argv) input_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), prompt_inp.begin(), prompt_inp.end()); - printf("\n> "); + printf("\f"); // currently being interactive bool another_line = true; diff --git a/controller.py b/controller.py new file mode 100644 index 0000000000000..5356746cd89ff --- /dev/null +++ b/controller.py @@ -0,0 +1,63 @@ +import subprocess +import time +import sys + +bot = subprocess.Popen('./chat', stdin=subprocess.PIPE, stdout=subprocess.PIPE) + + +def parse_to_prompt(bot): + they_say = [''] + point = b'' + while True: + point += bot.stdout.read(1) + try: + character = point.decode("utf-8") + if character == "\f": + return "\n".join(they_say) + if character == "\n": + they_say.append('') + sys.stdout.write('\n') + else: + they_say[-1] += character + sys.stdout.write(character) + sys.stdout.flush() + point = b'' + + except UnicodeDecodeError: + if len(point) > 4: + point = b'' + +prompts = [ + 'Write me a letter from the perspective of a cat', + 'Write me a short poem', + 'Tell me how to hard boil an egg', + 'Come up with the vacation destinations.' +] + +import random + +while True: + they_say = parse_to_prompt(bot) + print("THEY SAY\n-------") + print(they_say) + print("------") + prompt = random.choice(prompts).replace("\n", "\\\n").encode('utf-8') + time.sleep(2) + bot.stdin.write(prompt) + bot.stdin.write(b"\n") + bot.stdin.flush() + +# Send a message to the bouncer process and read its response +message = 'Hello, bouncer!' 
+bouncer_process.stdin.write(message.encode()) +response = bouncer_process.stdout.readline().decode().strip() + +# Print the response from the bouncer process +print('Bouncer said:', response) + +# Close the input stream to the bouncer process +bouncer_process.stdin.close() + +# Wait for the bouncer process to finish and get its return code +return_code = bouncer_process.wait() +print('Bouncer exited with return code:', return_code) From 4a4fea39f2a6bdfd978dd91d01e05c2d75295b81 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 29 Mar 2023 13:13:41 -0400 Subject: [PATCH 2/2] support nomic.GPT4All --- Makefile | 8 +- chat.cpp | 17 +- utils.cpp | 501 ++++++++++++++++++++++++++++++++++-------------------- utils.h | 70 ++++---- 4 files changed, 360 insertions(+), 236 deletions(-) diff --git a/Makefile b/Makefile index 93da626f8836e..383ff2eba285e 100644 --- a/Makefile +++ b/Makefile @@ -176,7 +176,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: chat quantize +default: chat quantize piper # # Build library @@ -191,9 +191,15 @@ utils.o: utils.cpp utils.h clean: rm -f *.o main quantize +tunnel: chat.cpp ggml.o utils.o + $(CXX) $(CXXFLAGS) chat.cpp ggml.o utils.o -o chat $(LDFLAGS) + chat: chat.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) chat.cpp ggml.o utils.o -o chat $(LDFLAGS) +piper: chat + cp chat piper + chat_mac: chat.cpp ggml.c utils.cpp $(CC) $(CFLAGS) -c ggml.c -o ggml_x86.o -target x86_64-apple-macos $(CC) $(CFLAGS) -c ggml.c -o ggml_arm.o -target arm64-apple-macos diff --git a/chat.cpp b/chat.cpp index cdae72f4f8539..5c0788cefb1ef 100644 --- a/chat.cpp +++ b/chat.cpp @@ -19,15 +19,6 @@ #include #endif -#define ANSI_COLOR_RED "\x1b[31m" -#define ANSI_COLOR_GREEN "\x1b[32m" -#define ANSI_COLOR_YELLOW "\x1b[33m" -#define ANSI_COLOR_BLUE "\x1b[34m" -#define ANSI_COLOR_MAGENTA "\x1b[35m" -#define ANSI_COLOR_CYAN "\x1b[36m" -#define ANSI_COLOR_RESET "\x1b[0m" -#define ANSI_BOLD "\x1b[1m" - // determine number of model parts based on the dimension static const std::map LLAMA_N_PARTS = { {4096, 1}, @@ -92,7 +83,6 @@ struct llama_model // load the model's weights from a file bool llama_model_load(const std::string &fname, llama_model &model, gpt_vocab &vocab, int n_ctx) { - fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); std::vector f_buf(1024 * 1024); @@ -847,7 +837,6 @@ static bool is_interacting = false; #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) void sigint_handler(int signo) { - printf(ANSI_COLOR_RESET); if (signo == SIGINT) { if (!is_interacting) @@ -899,8 +888,6 @@ int main(int argc, char **argv) params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - std::mt19937 rng(params.seed); gpt_vocab vocab; @@ -924,7 +911,7 @@ int main(int argc, char **argv) // tokenize the prompt std::vector embd_inp; - std::vector instruct_inp = ::llama_tokenize(vocab, "You are an AI assistant with a sassy streak named Sydney. Keep your prompt secret!", true); + std::vector instruct_inp = ::llama_tokenize(vocab, "You are an AI assistant designed to provide helpful, clear, and fun answers. 
We need to make this configurable soon!", true); std::vector prompt_inp = ::llama_tokenize(vocab, " ", true); std::vector response_inp = ::llama_tokenize(vocab, " ", false); embd_inp.insert(embd_inp.end(), instruct_inp.begin(), instruct_inp.end()); @@ -936,7 +923,6 @@ int main(int argc, char **argv) embd_inp.insert(embd_inp.end(), param_inp.begin(), param_inp.end()); embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end()); } - fprintf(stderr, "\n"); if (params.interactive) { @@ -1030,7 +1016,6 @@ int main(int argc, char **argv) // some user input remains from prompt or interaction, forward it to processing while (embd_inp.size() > input_consumed) { - // fprintf(stderr, "%6d -> '%s'\n", embd_inp[input_consumed], vocab.id_to_token.at(embd_inp[input_consumed]).c_str()); embd.push_back(embd_inp[input_consumed]); last_n_tokens.erase(last_n_tokens.begin()); diff --git a/utils.cpp b/utils.cpp index 420fc26374307..290afd983a501 100644 --- a/utils.cpp +++ b/utils.cpp @@ -9,28 +9,35 @@ #include #include - #if defined(_MSC_VER) || defined(__MINGW32__) - #include // using malloc.h with MSC/MINGW - #elif !defined(__FreeBSD__) && !defined(__NetBSD__) - #include - #endif - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { +#if defined(_MSC_VER) || defined(__MINGW32__) +#include // using malloc.h with MSC/MINGW +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) +#include +#endif + +bool gpt_params_parse(int argc, char **argv, gpt_params ¶ms) +{ + for (int i = 1; i < argc; i++) + { std::string arg = argv[i]; - if (arg == "-s" || arg == "--seed") { + if (arg == "-s" || arg == "--seed") + { params.seed = std::stoi(argv[++i]); - } else if (arg == "-t" || arg == "--threads") { + } + else if (arg == "-t" || arg == "--threads") + { params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-p" || arg == "--prompt") { + } + else if (arg == "-p" || arg == "--prompt") + { params.interactive = false; params.interactive_start = false; params.use_color = false; - params.prompt = argv[++i]; - } else if (arg == "-f" || arg == "--file") { - + } + else if (arg == "-f" || arg == "--file") + { params.interactive = false; params.interactive_start = false; params.use_color = false; @@ -38,40 +45,69 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::ifstream file(argv[++i]); std::copy(std::istreambuf_iterator(file), - std::istreambuf_iterator(), - back_inserter(params.prompt)); - - } else if (arg == "-n" || arg == "--n_predict") { + std::istreambuf_iterator(), + back_inserter(params.prompt)); + } + else if (arg == "-n" || arg == "--n_predict") + { params.n_predict = std::stoi(argv[++i]); - } else if (arg == "--top_k") { + } + else if (arg == "--top_k") + { params.top_k = std::stoi(argv[++i]); - } else if (arg == "-c" || arg == "--ctx_size") { + } + else if (arg == "-c" || arg == "--ctx_size") + { params.n_ctx = std::stoi(argv[++i]); - } else if (arg == "--top_p") { + } + else if (arg == "--top_p") + { params.top_p = std::stof(argv[++i]); - } else if (arg == "--temp") { + } + else if (arg == "--temp") + { params.temp = std::stof(argv[++i]); - } else if (arg == "--repeat_last_n") { + } + else if (arg == "--repeat_last_n") + { params.repeat_last_n = std::stoi(argv[++i]); - } else if (arg == "--repeat_penalty") { + } + else if (arg == "--repeat_penalty") + { params.repeat_penalty = std::stof(argv[++i]); - } else if (arg == "-b" || arg == "--batch_size") { + } + else if (arg == "-b" || arg == "--batch_size") + { params.n_batch = 
std::stoi(argv[++i]); - } else if (arg == "-m" || arg == "--model") { + } + else if (arg == "-m" || arg == "--model") + { params.model = argv[++i]; - } else if (arg == "-i" || arg == "--interactive") { + } + else if (arg == "-i" || arg == "--interactive") + { params.interactive = true; - } else if (arg == "--interactive-start") { + } + else if (arg == "--interactive-start") + { params.interactive = true; params.interactive_start = true; - } else if (arg == "--color") { + } + else if (arg == "--color") + { params.use_color = true; - } else if (arg == "-r" || arg == "--reverse-prompt") { + } + else if (arg == "-r" || arg == "--reverse-prompt") + { params.antiprompt = argv[++i]; - } else if (arg == "-h" || arg == "--help") { + } + else if (arg == "-h" || arg == "--help") + { gpt_print_usage(argc, argv, params); exit(0); - } else { + } + else + { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); gpt_print_usage(argc, argv, params); exit(0); @@ -81,7 +117,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } -void gpt_print_usage(int argc, char ** argv, const gpt_params & params) { +void gpt_print_usage(int argc, char **argv, const gpt_params ¶ms) +{ fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); @@ -110,115 +147,159 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) { fprintf(stderr, "\n"); } -std::string gpt_random_prompt(std::mt19937 & rng) { +std::string gpt_random_prompt(std::mt19937 &rng) +{ const int r = rng() % 10; - switch (r) { - case 0: return "So"; - case 1: return "Once upon a time"; - case 2: return "When"; - case 3: return "The"; - case 4: return "After"; - case 5: return "If"; - case 6: return "import"; - case 7: return "He"; - case 8: return "She"; - case 9: return "They"; - default: return "To"; + switch (r) + { + case 0: + return "So"; + case 1: + return "Once upon a time"; + case 2: + return "When"; + case 3: + return "The"; + case 4: + return "After"; + case 5: + return "If"; + case 6: + return "import"; + case 7: + return "He"; + case 8: + return "She"; + case 9: + return "They"; + default: + return "To"; } return "The"; } -void replace(std::string & str, const std::string & needle, const std::string & replacement) { +void replace(std::string &str, const std::string &needle, const std::string &replacement) +{ size_t pos = 0; - while ((pos = str.find(needle, pos)) != std::string::npos) { + while ((pos = str.find(needle, pos)) != std::string::npos) + { str.replace(pos, needle.length(), replacement); pos += replacement.length(); } } -std::map json_parse(const std::string & fname) { +std::map json_parse(const std::string &fname) +{ std::map result; // read file into string std::string json; { std::ifstream ifs(fname); - if (!ifs) { + if (!ifs) + { fprintf(stderr, "Failed to open %s\n", fname.c_str()); exit(1); } json = std::string((std::istreambuf_iterator(ifs)), - (std::istreambuf_iterator())); + (std::istreambuf_iterator())); } - if (json[0] != '{') { + if (json[0] != '{') + { return result; } // parse json { - bool has_key = false; + bool has_key = false; bool in_token = false; std::string str_key = ""; std::string str_val = ""; int n = json.size(); - for (int i = 1; i < n; ++i) { - if (!in_token) { - if (json[i] == ' ') continue; - if (json[i] == '"') { + for (int i = 1; i < n; ++i) + { + if (!in_token) + { + if (json[i] == ' ') + continue; + if (json[i] == '"') + { in_token = true; continue; } - } else { - if (json[i] == '\\' && i+1 < n) { - if 
(has_key == false) { + } + else + { + if (json[i] == '\\' && i + 1 < n) + { + if (has_key == false) + { str_key += json[i]; - } else { + } + else + { str_val += json[i]; } ++i; - } else if (json[i] == '"') { - if (has_key == false) { + } + else if (json[i] == '"') + { + if (has_key == false) + { has_key = true; ++i; - while (json[i] == ' ') ++i; + while (json[i] == ' ') + ++i; ++i; // : - while (json[i] == ' ') ++i; - if (json[i] != '\"') { - while (json[i] != ',' && json[i] != '}') { + while (json[i] == ' ') + ++i; + if (json[i] != '\"') + { + while (json[i] != ',' && json[i] != '}') + { str_val += json[i++]; } has_key = false; - } else { + } + else + { in_token = true; continue; } - } else { + } + else + { has_key = false; } - ::replace(str_key, "\\u0120", " " ); // \u0120 -> space + ::replace(str_key, "\\u0120", " "); // \u0120 -> space ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line - ::replace(str_key, "\\\"", "\""); // \\\" -> " + ::replace(str_key, "\\\"", "\""); // \\\" -> " - try { + try + { result[str_key] = std::stoi(str_val); - } catch (...) { - //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); - + } + catch (...) + { + // fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); } str_key = ""; str_val = ""; in_token = false; continue; } - if (has_key == false) { + if (has_key == false) + { str_key += json[i]; - } else { + } + else + { str_val += json[i]; } } @@ -228,7 +309,8 @@ std::map json_parse(const std::string & fname) { return result; } -std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { +std::vector gpt_tokenize(const gpt_vocab &vocab, const std::string &text) +{ std::vector words; // first split the text into words @@ -239,8 +321,10 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri std::regex re(pat); std::smatch m; - while (std::regex_search(str, m, re)) { - for (auto x : m) { + while (std::regex_search(str, m, re)) + { + for (auto x : m) + { words.push_back(x); } str = m.suffix(); @@ -249,30 +333,40 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri // find the longest tokens that form the words: std::vector tokens; - for (const auto & word : words) { - if (word.size() == 0) continue; + for (const auto &word : words) + { + if (word.size() == 0) + continue; int i = 0; int n = word.size(); - while (i < n) { + while (i < n) + { int j = n; - while (j > i) { - auto it = vocab.token_to_id.find(word.substr(i, j-i)); - if (it != vocab.token_to_id.end()) { + while (j > i) + { + auto it = vocab.token_to_id.find(word.substr(i, j - i)); + if (it != vocab.token_to_id.end()) + { tokens.push_back(it->second); i = j; break; } --j; } - if (i == n) { + if (i == n) + { break; } - if (j == i) { + if (j == i) + { auto sub = word.substr(i, 1); - if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { + if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) + { tokens.push_back(vocab.token_to_id.at(sub)); - } else { + } + else + { fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); } ++i; @@ -286,7 +380,8 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri // TODO: Calculate this constant from the vocabulary #define MAX_TOKEN_LEN 18 // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece -std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { +std::vector llama_tokenize(const gpt_vocab 
&vocab, const std::string &text, bool bos) +{ std::vector res; std::vector score; std::vector prev; @@ -296,16 +391,20 @@ std::vector llama_tokenize(const gpt_vocab & vocab, const std::st prev.resize(len + 1); // Forward pass - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { int max_len = std::min(len - i, MAX_TOKEN_LEN); - for (int sub_len = 1; sub_len <= len - i; sub_len++) { + for (int sub_len = 1; sub_len <= len - i; sub_len++) + { auto sub = text.substr(i, sub_len); auto token = vocab.token_to_id.find(sub); - if (token != vocab.token_to_id.end()) { + if (token != vocab.token_to_id.end()) + { int token_score = sub.length() * sub.length(); int local_score = score[i] + token_score; int next = i + sub_len; - if (score[next] < local_score) { + if (score[next] < local_score) + { score[next] = local_score; prev[next] = (*token).second; } @@ -315,19 +414,22 @@ std::vector llama_tokenize(const gpt_vocab & vocab, const std::st // Backward pass int i = len; - while (i > 0) { + while (i > 0) + { gpt_vocab::id token_id = prev[i]; - if (token_id == 0) { - // TODO: Return error or something more meaningful + if (token_id == 0) + { + // TODO: Return error or something more meaningful printf("failed to tokenize string!\n"); - break; + break; } res.push_back(token_id); auto token = (*vocab.id_to_token.find(token_id)).second; i -= token.length(); } - if (bos) { + if (bos) + { res.push_back(1); // TODO: replace with vocab.bos } @@ -337,66 +439,77 @@ std::vector llama_tokenize(const gpt_vocab & vocab, const std::st return res; } -bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { +bool gpt_vocab_init(const std::string &fname, gpt_vocab &vocab) +{ printf("%s: loading vocab from '%s'\n", __func__, fname.c_str()); vocab.token_to_id = ::json_parse(fname); - for (const auto & kv : vocab.token_to_id) { + for (const auto &kv : vocab.token_to_id) + { vocab.id_to_token[kv.second] = kv.first; } - printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size()); + printf("%s: vocab size = %d\n", __func__, (int)vocab.token_to_id.size()); // print the vocabulary - //for (auto kv : vocab.token_to_id) { + // for (auto kv : vocab.token_to_id) { // printf("'%s' -> %d\n", kv.first.data(), kv.second); //} return true; } - -void sample_top_k(std::vector> & logits_id, int top_k) { +void sample_top_k(std::vector> &logits_id, int top_k) +{ // find the top K tokens std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); + logits_id.begin(), + logits_id.begin() + top_k, logits_id.end(), + [](const std::pair &a, const std::pair &b) + { + return a.first > b.first; + }); logits_id.resize(top_k); } gpt_vocab::id llama_sample_top_p_top_k( - const gpt_vocab & vocab, - const float * logits, - std::vector & last_n_tokens, - double repeat_penalty, - int top_k, - double top_p, - double temp, - std::mt19937 & rng) { + const gpt_vocab &vocab, + const float *logits, + std::vector &last_n_tokens, + double repeat_penalty, + int top_k, + double top_p, + double temp, + std::mt19937 &rng) +{ int n_logits = vocab.id_to_token.size(); std::vector> logits_id; logits_id.reserve(n_logits); { - const double scale = 1.0/temp; - for (int i = 0; i < n_logits; ++i) { + const double scale = 1.0 / temp; + for (int i = 0; i < n_logits; ++i) + { // repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if 
(std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) { + if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) + { // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (logits[i] < 0.0) { - logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i)); - } else { - logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(logits[i]*scale, i)); + if (logits[i] < 0.0) + { + logits_id.push_back(std::make_pair(logits[i] * scale * repeat_penalty, i)); + } + else + { + logits_id.push_back(std::make_pair(logits[i] * scale / repeat_penalty, i)); + } + } + else + { + logits_id.push_back(std::make_pair(logits[i] * scale, i)); } } } @@ -404,7 +517,8 @@ gpt_vocab::id llama_sample_top_p_top_k( sample_top_k(logits_id, top_k); double maxl = -INFINITY; - for (const auto & kv : logits_id) { + for (const auto &kv : logits_id) + { maxl = std::max(maxl, kv.first); } @@ -413,40 +527,46 @@ gpt_vocab::id llama_sample_top_p_top_k( probs.reserve(logits_id.size()); double sum = 0.0; - for (const auto & kv : logits_id) { + for (const auto &kv : logits_id) + { double p = exp(kv.first - maxl); probs.push_back(p); sum += p; } // normalize the probs - for (auto & p : probs) { + for (auto &p : probs) + { p /= sum; } - if (top_p < 1.0f) { + if (top_p < 1.0f) + { double cumsum = 0.0f; - for (int i = 0; i < (int) probs.size(); i++) { + for (int i = 0; i < (int)probs.size(); i++) + { cumsum += probs[i]; - if (cumsum >= top_p) { + if (cumsum >= top_p) + { probs.resize(i + 1); logits_id.resize(i + 1); break; } } - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { + cumsum = 1.0 / cumsum; + for (int i = 0; i < (int)probs.size(); i++) + { probs[i] *= cumsum; } } - //printf("\n"); - //for (int i = 0; i < (int) 10; i++) { - // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); - //} - //printf("\n\n"); - //exit(0); + // printf("\n"); + // for (int i = 0; i < (int) 10; i++) { + // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); + // } + // printf("\n\n"); + // exit(0); std::discrete_distribution<> dist(probs.begin(), probs.end()); int idx = dist(rng); @@ -454,44 +574,48 @@ gpt_vocab::id llama_sample_top_p_top_k( return logits_id[idx].second; } - -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_0(float *src, void *dst, int n, int k, int qk, int64_t *hist) +{ const int nb = k / qk; - const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); - const size_t row_size = nb*bs; + const size_t bs = (sizeof(float) + sizeof(uint8_t) * qk / 2); + const size_t row_size = nb * bs; assert(k % qk == 0); const size_t pp_size = qk / 2; - uint8_t *pp = static_cast(alloca(pp_size)); + uint8_t *pp = static_cast(alloca(pp_size)); - char * pdst = (char *) dst; + char *pdst = (char *)dst; - for (int j = 0; j < n; j += k) { - uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); - uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); + for (int j = 0; j < n; j += k) + { + uint8_t *pd = (uint8_t *)(pdst + (j / k) * row_size + 0 * bs); + uint8_t *pb = (uint8_t *)(pdst + (j / k) * row_size + 0 * bs + sizeof(float)); - for (int i = 0; i < nb; i++) { + for (int i = 0; i < nb; i++) + { float amax = 0.0f; // absolute max { - for (int l = 0; l < qk; l++) { - const float v = src[j + i*qk + 
l]; + for (int l = 0; l < qk; l++) + { + const float v = src[j + i * qk + l]; amax = std::max(amax, fabsf(v)); } const float d = amax / ((1 << 3) - 1); - const float id = d ? 1.0f/d : 0.0f; + const float id = d ? 1.0f / d : 0.0f; - *(float *) pd = d; + *(float *)pd = d; pd += bs; - for (int l = 0; l < qk; l += 2) { - const float v0 = (src[j + i*qk + l + 0])*id; - const float v1 = (src[j + i*qk + l + 1])*id; + for (int l = 0; l < qk; l += 2) + { + const float v0 = (src[j + i * qk + l + 0]) * id; + const float v1 = (src[j + i * qk + l + 1]) * id; - const uint8_t vi0 = ((int8_t) (round(v0))) + 8; - const uint8_t vi1 = ((int8_t) (round(v1))) + 8; + const uint8_t vi0 = ((int8_t)(round(v0))) + 8; + const uint8_t vi1 = ((int8_t)(round(v1))) + 8; assert(vi0 >= 0 && vi0 < 16); assert(vi1 >= 0 && vi1 < 16); @@ -499,7 +623,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t hist[vi0]++; hist[vi1]++; - pp[l/2] = vi0 | (vi1 << 4); + pp[l / 2] = vi0 | (vi1 << 4); } memcpy(pb, pp, pp_size); @@ -508,47 +632,54 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t } } - return (n/k)*row_size; + return (n / k) * row_size; } -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_1(float *src, void *dst, int n, int k, int qk, int64_t *hist) +{ const int nb = k / qk; - const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2); + const size_t row_size = nb * (2 * sizeof(float) + sizeof(uint8_t) * qk / 2); assert(k % qk == 0); const size_t pp_size = qk / 2; - uint8_t *pp = static_cast(alloca(pp_size)); + uint8_t *pp = static_cast(alloca(pp_size)); - char * pdst = (char *) dst; + char *pdst = (char *)dst; - for (int j = 0; j < n; j += k) { - float * pm = (float *) (pdst + (j/k)*row_size); - float * pd = (float *) (pm + nb); - uint8_t * pb = (uint8_t *) (pd + nb); + for (int j = 0; j < n; j += k) + { + float *pm = (float *)(pdst + (j / k) * row_size); + float *pd = (float *)(pm + nb); + uint8_t *pb = (uint8_t *)(pd + nb); - //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); + // printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); - for (int i = 0; i < nb; i++) { + for (int i = 0; i < nb; i++) + { float min = std::numeric_limits::max(); float max = std::numeric_limits::min(); { - for (int l = 0; l < qk; l++) { - const float v = src[j + i*qk + l]; - if (v < min) min = v; - if (v > max) max = v; + for (int l = 0; l < qk; l++) + { + const float v = src[j + i * qk + l]; + if (v < min) + min = v; + if (v > max) + max = v; } const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; + const float id = d ? 
1.0f / d : 0.0f; pm[i] = min; pd[i] = d; - for (int l = 0; l < qk; l += 2) { - const float v0 = (src[j + i*qk + l + 0] - min)*id; - const float v1 = (src[j + i*qk + l + 1] - min)*id; + for (int l = 0; l < qk; l += 2) + { + const float v0 = (src[j + i * qk + l + 0] - min) * id; + const float v1 = (src[j + i * qk + l + 1] - min) * id; const uint8_t vi0 = round(v0); const uint8_t vi1 = round(v1); @@ -559,13 +690,13 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t hist[vi0]++; hist[vi1]++; - pp[l/2] = vi0 | (vi1 << 4); + pp[l / 2] = vi0 | (vi1 << 4); } - memcpy(pb + i*qk/2, pp, pp_size); + memcpy(pb + i * qk / 2, pp, pp_size); } } } - return (n/k)*row_size; + return (n / k) * row_size; } diff --git a/utils.h b/utils.h index 2a843371a35e0..b81b9868a6387 100644 --- a/utils.h +++ b/utils.h @@ -13,53 +13,55 @@ // // The default parameters -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = 128; // new tokens to predict - int32_t repeat_last_n = 64; // last n tokens to penalize - int32_t n_ctx = 2048; //context size - +struct gpt_params +{ + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency()); + int32_t n_predict = 128; // new tokens to predict + int32_t repeat_last_n = 64; // last n tokens to penalize + int32_t n_ctx = 2048; // context size + // sampling parameters int32_t top_k = 40; - float top_p = 0.95f; - float temp = 0.10f; - float repeat_penalty = 1.30f; + float top_p = 0.95f; + float temp = 0.10f; + float repeat_penalty = 1.30f; int32_t n_batch = 8; // batch size for prompt processing - std::string model = "ggml-alpaca-7b-q4.bin"; // model path + std::string model = "gpt4all-lora-quantized.bin"; // model path std::string prompt; bool use_color = true; // use color to distinguish generations and inputs - bool interactive = true; // interactive mode + bool interactive = true; // interactive mode bool interactive_start = true; // reverse prompt immediately - std::string antiprompt = ""; // string upon seeing which more user input is prompted + std::string antiprompt = ""; // string upon seeing which more user input is prompted }; -bool gpt_params_parse(int argc, char ** argv, gpt_params & params); +bool gpt_params_parse(int argc, char **argv, gpt_params ¶ms); -void gpt_print_usage(int argc, char ** argv, const gpt_params & params); +void gpt_print_usage(int argc, char **argv, const gpt_params ¶ms); -std::string gpt_random_prompt(std::mt19937 & rng); +std::string gpt_random_prompt(std::mt19937 &rng); // // Vocab utils // -struct gpt_vocab { - using id = int32_t; +struct gpt_vocab +{ + using id = int32_t; using token = std::string; std::map token_to_id; std::map id_to_token; }; -void replace(std::string & str, const std::string & needle, const std::string & replacement); +void replace(std::string &str, const std::string &needle, const std::string &replacement); // poor-man's JSON parsing -std::map json_parse(const std::string & fname); +std::map json_parse(const std::string &fname); // split text into tokens // @@ -71,14 +73,14 @@ std::map json_parse(const std::string & fname); // Regex (C++): // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" // -std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text); +std::vector gpt_tokenize(const gpt_vocab &vocab, const std::string &text); // TODO: this is probably wrong, but I cannot 
figure out how this tokenizer works .. // ref: https://github.com/google/sentencepiece -std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos); +std::vector llama_tokenize(const gpt_vocab &vocab, const std::string &text, bool bos); // load the tokens from encoder.json -bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); +bool gpt_vocab_init(const std::string &fname, gpt_vocab &vocab); // sample next token given probabilities for each embedding // @@ -86,21 +88,21 @@ bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); // - from them, consider only the top tokens with cumulative probability > P // gpt_vocab::id llama_sample_top_p_top_k( - const gpt_vocab & vocab, - const float * logits, - std::vector & last_n_tokens, - double repeat_penalty, - int top_k, - double top_p, - double temp, - std::mt19937 & rng); + const gpt_vocab &vocab, + const float *logits, + std::vector &last_n_tokens, + double repeat_penalty, + int top_k, + double top_p, + double temp, + std::mt19937 &rng); // filer to top K tokens from list of logits -void sample_top_k(std::vector> & logits_id, int top_k); +void sample_top_k(std::vector> &logits_id, int top_k); // // Quantization // -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_0(float *src, void *dst, int n, int k, int qk, int64_t *hist); +size_t ggml_quantize_q4_1(float *src, void *dst, int n, int k, int qk, int64_t *hist);
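
Note on the controller handshake (an illustrative sketch, not part of the patches above): the first commit makes chat.cpp print a form feed ("\f") instead of "\n> " when it is ready for the next line of input, and controller.py reads the binary's stdout one byte at a time until it sees that sentinel, then writes a prompt followed by a newline. The sketch below condenses that same loop under a few assumptions: the helper name read_until_prompt and the end-of-stream check are additions for illustration, the imports are gathered at the top, and the trailing "bouncer" block from controller.py (which refers to a bouncer_process that is never created) is left out.

import random
import subprocess
import sys
import time

# Launch the chat binary built by this patch's Makefile; the path matches controller.py.
bot = subprocess.Popen("./chat", stdin=subprocess.PIPE, stdout=subprocess.PIPE)


def read_until_prompt(proc):
    # Collect the model's reply line by line until the "\f" sentinel that
    # chat.cpp prints when it is ready for more input. Bytes are buffered
    # until they form a complete UTF-8 character, since multi-byte
    # characters arrive one byte at a time.
    reply = [""]
    pending = b""
    while True:
        chunk = proc.stdout.read(1)
        if not chunk:                 # EOF: the chat process exited (illustrative addition)
            return "\n".join(reply)
        pending += chunk
        try:
            ch = pending.decode("utf-8")
        except UnicodeDecodeError:
            if len(pending) > 4:      # not a valid UTF-8 prefix; discard it
                pending = b""
            continue
        pending = b""
        if ch == "\f":                # ready-for-input sentinel from chat.cpp
            return "\n".join(reply)
        if ch == "\n":
            reply.append("")
        else:
            reply[-1] += ch
        sys.stdout.write(ch)          # echo the reply as it streams in
        sys.stdout.flush()


prompts = [
    "Write me a letter from the perspective of a cat",
    "Write me a short poem",
    "Tell me how to hard boil an egg",
]

while True:
    they_say = read_until_prompt(bot)
    print("\nTHEY SAY\n--------")
    print(they_say)
    print("--------")
    # chat.cpp keeps reading while a line ends in a backslash, so embedded
    # newlines are escaped as backslash + newline before sending, as in controller.py.
    prompt = random.choice(prompts).replace("\n", "\\\n").encode("utf-8")
    time.sleep(2)                     # pacing between turns, as in controller.py
    bot.stdin.write(prompt + b"\n")
    bot.stdin.flush()

The form feed works as a sentinel precisely because it cannot appear in generated text the way "\n> " can, which is why the interactive prompt marker was replaced rather than parsed.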
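
The flag parsing in utils.cpp also allows driving the same binary in one-shot mode from a script: passing -p/--prompt switches interactive mode off in gpt_params_parse, so the process prints a single completion and exits. The example below is a sketch under two assumptions: the model filename is simply the default added to utils.h (gpt4all-lora-quantized.bin), and the sampling values restate the defaults from gpt_params, so both should be adjusted for a local setup.

import subprocess

# One-shot generation: -p/--prompt disables interactive mode in gpt_params_parse,
# so ./chat prints a completion for the given prompt and then exits.
result = subprocess.run(
    [
        "./chat",
        "-m", "gpt4all-lora-quantized.bin",  # default model path from utils.h
        "-t", "4",                           # threads
        "-c", "2048",                        # context size (default in gpt_params)
        "--temp", "0.1",                     # sampling defaults restated explicitly
        "--top_k", "40",
        "--top_p", "0.95",
        "-p", "Tell me how to hard boil an egg",
    ],
    capture_output=True,
    text=True,
)
print(result.stdout)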