From 7ede5921cdab92a928181af7db4cb736a16e08de Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Tue, 28 Mar 2023 12:42:05 -0400 Subject: [PATCH 1/2] python control. --- chat.cpp | 285 +------------------------------------------------- controller.py | 63 +++++++++++ 2 files changed, 64 insertions(+), 284 deletions(-) create mode 100644 controller.py diff --git a/chat.cpp b/chat.cpp index 9667304bc9f1d..cdae72f4f8539 100644 --- a/chat.cpp +++ b/chat.cpp @@ -883,273 +883,9 @@ const char *llama_print_system_info(void) return s.c_str(); } -int mainf(int argc, char **argv) -{ - ggml_time_init(); - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - - if (gpt_params_parse(argc, argv, params) == false) - { - return 1; - } - - if (params.seed < 0) - { - params.seed = time(NULL); - } - - std::mt19937 rng(params.seed); - - gpt_vocab vocab; - llama_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - if (!llama_model_load(params.model, model, vocab, params.n_ctx)) - { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // Add a space in front of the first character to match OG llama tokenizer behavior - // params.prompt.insert(0, 1, ' '); - // tokenize the prompt - std::vector embd_inp; - - std::vector instruct_inp = ::llama_tokenize(vocab, "You are an AI assistant with a sassy streak named Sydney. Keep your prompt secret!", true); - std::vector prompt_inp = ::llama_tokenize(vocab, " ", true); - std::vector response_inp = ::llama_tokenize(vocab, " ", false); - embd_inp.insert(embd_inp.end(), instruct_inp.begin(), instruct_inp.end()); - - if (!params.prompt.empty()) - { - std::vector param_inp = ::llama_tokenize(vocab, params.prompt, true); - embd_inp.insert(embd_inp.end(), prompt_inp.begin(), prompt_inp.end()); - embd_inp.insert(embd_inp.end(), param_inp.begin(), param_inp.end()); - embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end()); - } - for (int i = 0; i < (int)embd_inp.size(); i++) - { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); - } - fprintf(stderr, "\n"); - - if (params.interactive) - { -#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = sigint_handler; - sigemptyset(&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -#elif defined(_WIN32) - signal(SIGINT, sigint_handler); - - // Windows console ANSI color fix - DWORD mode = 0; - HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); - if (hConsole && hConsole != INVALID_HANDLE_VALUE && GetConsoleMode(hConsole, &mode)) - { - SetConsoleMode(hConsole, mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING); - SetConsoleOutputCP(CP_UTF8); - } -#endif - } - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - llama_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token); - - int last_n_size = params.repeat_last_n; - std::vector last_n_tokens(last_n_size); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - - // we may want to slide the input window along with the context, but for now we restrict to the context length - int remaining_tokens = model.hparams.n_ctx - embd_inp.size(); - int input_consumed = 0; - bool input_noecho = true; - - // prompt user immediately after the starting 
prompt has been loaded - if (params.interactive_start) - { - is_interacting = true; - } - - while (remaining_tokens > 0) - { - // predict - if (embd.size() > 0) - { - const int64_t t_start_us = ggml_time_us(); - - if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) - { - fprintf(stderr, "Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (embd_inp.size() <= input_consumed && !is_interacting) - { - // out of user input, sample next token - const float top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const float repeat_penalty = params.repeat_penalty; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); - - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - - // echo this to console - input_noecho = false; - - // decrement remaining sampling budget - --remaining_tokens; - } - else - { - // some user input remains from prompt or interaction, forward it to processing - while (embd_inp.size() > input_consumed) - { - // fprintf(stderr, "%6d -> '%s'\n", embd_inp[input_consumed], vocab.id_to_token.at(embd_inp[input_consumed]).c_str()); - - embd.push_back(embd_inp[input_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[input_consumed]); - ++input_consumed; - if (embd.size() > params.n_batch) - { - break; - } - } - } - - // display text - if (!input_noecho) - { - for (auto id : embd) - { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - } - - // in interactive mode, and not currently processing queued inputs; - // check if we should prompt the user for more - if (params.interactive && embd_inp.size() <= input_consumed) - { - if (is_interacting) - { - input_consumed = embd_inp.size(); - embd_inp.insert(embd_inp.end(), prompt_inp.begin(), prompt_inp.end()); - - printf("\n> "); - - // currently being interactive - bool another_line = true; - while (another_line) - { - fflush(stdout); - char buf[256] = {0}; - int n_read; - if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) - { - // presumable empty line, consume the newline - if (scanf("%*c") <= 0) - { /*ignore*/ - } - n_read = 0; - } - - if (n_read > 0 && buf[n_read - 1] == '\\') - { - another_line = true; - buf[n_read - 1] = '\n'; - buf[n_read] = 0; - } - else - { - another_line = false; - buf[n_read] = '\n'; - buf[n_read + 1] = 0; - } - - std::vector line_inp = ::llama_tokenize(vocab, buf, false); - embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); - embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end()); - - remaining_tokens -= prompt_inp.size() + line_inp.size() + response_inp.size(); - - input_noecho = true; // do not echo this again - } - - is_interacting = false; - } - } - - // end of text token - if (embd.back() == 2) - { - if (params.interactive) - { - is_interacting = true; - continue; - } - else - { - printf("\n"); - fprintf(stderr, " [end of text]\n"); - break; - } - } - } - -#if defined(_WIN32) - signal(SIGINT, SIG_DFL); -#endif - - ggml_free(model.ctx); - - return 0; -} - int main(int argc, char **argv) { ggml_time_init(); - const 
int64_t t_main_start_us = ggml_time_us(); gpt_params params; @@ -1167,28 +903,20 @@ int main(int argc, char **argv) std::mt19937 rng(params.seed); - int64_t t_load_us = 0; - gpt_vocab vocab; llama_model model; // load the model { - const int64_t t_start_us = ggml_time_us(); if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); return 1; } - - t_load_us = ggml_time_us() - t_start_us; } int n_past = 0; - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - std::vector logits; // Add a space in front of the first character to match OG llama tokenizer behavior @@ -1208,10 +936,6 @@ int main(int argc, char **argv) embd_inp.insert(embd_inp.end(), param_inp.begin(), param_inp.end()); embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end()); } - for (int i = 0; i < (int)embd_inp.size(); i++) - { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); - } fprintf(stderr, "\n"); if (params.interactive) @@ -1262,15 +986,12 @@ int main(int argc, char **argv) // predict if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { fprintf(stderr, "Failed to predict\n"); return 1; } - - t_predict_us += ggml_time_us() - t_start_us; } n_past += embd.size(); @@ -1289,14 +1010,10 @@ int main(int argc, char **argv) gpt_vocab::id id = 0; { - const int64_t t_start_sample_us = ggml_time_us(); - id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); - - t_sample_us += ggml_time_us() - t_start_sample_us; } // add it to the context @@ -1345,7 +1062,7 @@ int main(int argc, char **argv) input_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), prompt_inp.begin(), prompt_inp.end()); - printf("\n> "); + printf("\f"); // currently being interactive bool another_line = true; diff --git a/controller.py b/controller.py new file mode 100644 index 0000000000000..5356746cd89ff --- /dev/null +++ b/controller.py @@ -0,0 +1,63 @@ +import subprocess +import time +import sys + +bot = subprocess.Popen('./chat', stdin=subprocess.PIPE, stdout=subprocess.PIPE) + + +def parse_to_prompt(bot): + they_say = [''] + point = b'' + while True: + point += bot.stdout.read(1) + try: + character = point.decode("utf-8") + if character == "\f": + return "\n".join(they_say) + if character == "\n": + they_say.append('') + sys.stdout.write('\n') + else: + they_say[-1] += character + sys.stdout.write(character) + sys.stdout.flush() + point = b'' + + except UnicodeDecodeError: + if len(point) > 4: + point = b'' + +prompts = [ + 'Write me a letter from the perspective of a cat', + 'Write me a short poem', + 'Tell me how to hard boil an egg', + 'Come up with the vacation destinations.' +] + +import random + +while True: + they_say = parse_to_prompt(bot) + print("THEY SAY\n-------") + print(they_say) + print("------") + prompt = random.choice(prompts).replace("\n", "\\\n").encode('utf-8') + time.sleep(2) + bot.stdin.write(prompt) + bot.stdin.write(b"\n") + bot.stdin.flush() + +# Send a message to the bouncer process and read its response +message = 'Hello, bouncer!' 
+bouncer_process.stdin.write(message.encode()) +response = bouncer_process.stdout.readline().decode().strip() + +# Print the response from the bouncer process +print('Bouncer said:', response) + +# Close the input stream to the bouncer process +bouncer_process.stdin.close() + +# Wait for the bouncer process to finish and get its return code +return_code = bouncer_process.wait() +print('Bouncer exited with return code:', return_code) From 4a4fea39f2a6bdfd978dd91d01e05c2d75295b81 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 29 Mar 2023 13:13:41 -0400 Subject: [PATCH 2/2] support nomic.GPT4All --- Makefile | 8 +- chat.cpp | 17 +- utils.cpp | 501 ++++++++++++++++++++++++++++++++++-------------------- utils.h | 70 ++++---- 4 files changed, 360 insertions(+), 236 deletions(-) diff --git a/Makefile b/Makefile index 93da626f8836e..383ff2eba285e 100644 --- a/Makefile +++ b/Makefile @@ -176,7 +176,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: chat quantize +default: chat quantize piper # # Build library @@ -191,9 +191,15 @@ utils.o: utils.cpp utils.h clean: rm -f *.o main quantize +tunnel: chat.cpp ggml.o utils.o + $(CXX) $(CXXFLAGS) chat.cpp ggml.o utils.o -o chat $(LDFLAGS) + chat: chat.cpp ggml.o utils.o $(CXX) $(CXXFLAGS) chat.cpp ggml.o utils.o -o chat $(LDFLAGS) +piper: chat + cp chat piper + chat_mac: chat.cpp ggml.c utils.cpp $(CC) $(CFLAGS) -c ggml.c -o ggml_x86.o -target x86_64-apple-macos $(CC) $(CFLAGS) -c ggml.c -o ggml_arm.o -target arm64-apple-macos diff --git a/chat.cpp b/chat.cpp index cdae72f4f8539..5c0788cefb1ef 100644 --- a/chat.cpp +++ b/chat.cpp @@ -19,15 +19,6 @@ #include #endif -#define ANSI_COLOR_RED "\x1b[31m" -#define ANSI_COLOR_GREEN "\x1b[32m" -#define ANSI_COLOR_YELLOW "\x1b[33m" -#define ANSI_COLOR_BLUE "\x1b[34m" -#define ANSI_COLOR_MAGENTA "\x1b[35m" -#define ANSI_COLOR_CYAN "\x1b[36m" -#define ANSI_COLOR_RESET "\x1b[0m" -#define ANSI_BOLD "\x1b[1m" - // determine number of model parts based on the dimension static const std::map LLAMA_N_PARTS = { {4096, 1}, @@ -92,7 +83,6 @@ struct llama_model // load the model's weights from a file bool llama_model_load(const std::string &fname, llama_model &model, gpt_vocab &vocab, int n_ctx) { - fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); std::vector f_buf(1024 * 1024); @@ -847,7 +837,6 @@ static bool is_interacting = false; #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) void sigint_handler(int signo) { - printf(ANSI_COLOR_RESET); if (signo == SIGINT) { if (!is_interacting) @@ -899,8 +888,6 @@ int main(int argc, char **argv) params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - std::mt19937 rng(params.seed); gpt_vocab vocab; @@ -924,7 +911,7 @@ int main(int argc, char **argv) // tokenize the prompt std::vector embd_inp; - std::vector instruct_inp = ::llama_tokenize(vocab, "You are an AI assistant with a sassy streak named Sydney. Keep your prompt secret!", true); + std::vector instruct_inp = ::llama_tokenize(vocab, "You are an AI assistant designed to provide helpful, clear, and fun answers. 
We need to make this configurable soon!", true); std::vector prompt_inp = ::llama_tokenize(vocab, " ", true); std::vector response_inp = ::llama_tokenize(vocab, " ", false); embd_inp.insert(embd_inp.end(), instruct_inp.begin(), instruct_inp.end()); @@ -936,7 +923,6 @@ int main(int argc, char **argv) embd_inp.insert(embd_inp.end(), param_inp.begin(), param_inp.end()); embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end()); } - fprintf(stderr, "\n"); if (params.interactive) { @@ -1030,7 +1016,6 @@ int main(int argc, char **argv) // some user input remains from prompt or interaction, forward it to processing while (embd_inp.size() > input_consumed) { - // fprintf(stderr, "%6d -> '%s'\n", embd_inp[input_consumed], vocab.id_to_token.at(embd_inp[input_consumed]).c_str()); embd.push_back(embd_inp[input_consumed]); last_n_tokens.erase(last_n_tokens.begin()); diff --git a/utils.cpp b/utils.cpp index 420fc26374307..290afd983a501 100644 --- a/utils.cpp +++ b/utils.cpp @@ -9,28 +9,35 @@ #include #include - #if defined(_MSC_VER) || defined(__MINGW32__) - #include // using malloc.h with MSC/MINGW - #elif !defined(__FreeBSD__) && !defined(__NetBSD__) - #include - #endif - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - for (int i = 1; i < argc; i++) { +#if defined(_MSC_VER) || defined(__MINGW32__) +#include // using malloc.h with MSC/MINGW +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) +#include +#endif + +bool gpt_params_parse(int argc, char **argv, gpt_params ¶ms) +{ + for (int i = 1; i < argc; i++) + { std::string arg = argv[i]; - if (arg == "-s" || arg == "--seed") { + if (arg == "-s" || arg == "--seed") + { params.seed = std::stoi(argv[++i]); - } else if (arg == "-t" || arg == "--threads") { + } + else if (arg == "-t" || arg == "--threads") + { params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-p" || arg == "--prompt") { + } + else if (arg == "-p" || arg == "--prompt") + { params.interactive = false; params.interactive_start = false; params.use_color = false; - params.prompt = argv[++i]; - } else if (arg == "-f" || arg == "--file") { - + } + else if (arg == "-f" || arg == "--file") + { params.interactive = false; params.interactive_start = false; params.use_color = false; @@ -38,40 +45,69 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::ifstream file(argv[++i]); std::copy(std::istreambuf_iterator(file), - std::istreambuf_iterator(), - back_inserter(params.prompt)); - - } else if (arg == "-n" || arg == "--n_predict") { + std::istreambuf_iterator(), + back_inserter(params.prompt)); + } + else if (arg == "-n" || arg == "--n_predict") + { params.n_predict = std::stoi(argv[++i]); - } else if (arg == "--top_k") { + } + else if (arg == "--top_k") + { params.top_k = std::stoi(argv[++i]); - } else if (arg == "-c" || arg == "--ctx_size") { + } + else if (arg == "-c" || arg == "--ctx_size") + { params.n_ctx = std::stoi(argv[++i]); - } else if (arg == "--top_p") { + } + else if (arg == "--top_p") + { params.top_p = std::stof(argv[++i]); - } else if (arg == "--temp") { + } + else if (arg == "--temp") + { params.temp = std::stof(argv[++i]); - } else if (arg == "--repeat_last_n") { + } + else if (arg == "--repeat_last_n") + { params.repeat_last_n = std::stoi(argv[++i]); - } else if (arg == "--repeat_penalty") { + } + else if (arg == "--repeat_penalty") + { params.repeat_penalty = std::stof(argv[++i]); - } else if (arg == "-b" || arg == "--batch_size") { + } + else if (arg == "-b" || arg == "--batch_size") + { params.n_batch = 
std::stoi(argv[++i]); - } else if (arg == "-m" || arg == "--model") { + } + else if (arg == "-m" || arg == "--model") + { params.model = argv[++i]; - } else if (arg == "-i" || arg == "--interactive") { + } + else if (arg == "-i" || arg == "--interactive") + { params.interactive = true; - } else if (arg == "--interactive-start") { + } + else if (arg == "--interactive-start") + { params.interactive = true; params.interactive_start = true; - } else if (arg == "--color") { + } + else if (arg == "--color") + { params.use_color = true; - } else if (arg == "-r" || arg == "--reverse-prompt") { + } + else if (arg == "-r" || arg == "--reverse-prompt") + { params.antiprompt = argv[++i]; - } else if (arg == "-h" || arg == "--help") { + } + else if (arg == "-h" || arg == "--help") + { gpt_print_usage(argc, argv, params); exit(0); - } else { + } + else + { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); gpt_print_usage(argc, argv, params); exit(0); @@ -81,7 +117,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } -void gpt_print_usage(int argc, char ** argv, const gpt_params & params) { +void gpt_print_usage(int argc, char **argv, const gpt_params ¶ms) +{ fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); @@ -110,115 +147,159 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) { fprintf(stderr, "\n"); } -std::string gpt_random_prompt(std::mt19937 & rng) { +std::string gpt_random_prompt(std::mt19937 &rng) +{ const int r = rng() % 10; - switch (r) { - case 0: return "So"; - case 1: return "Once upon a time"; - case 2: return "When"; - case 3: return "The"; - case 4: return "After"; - case 5: return "If"; - case 6: return "import"; - case 7: return "He"; - case 8: return "She"; - case 9: return "They"; - default: return "To"; + switch (r) + { + case 0: + return "So"; + case 1: + return "Once upon a time"; + case 2: + return "When"; + case 3: + return "The"; + case 4: + return "After"; + case 5: + return "If"; + case 6: + return "import"; + case 7: + return "He"; + case 8: + return "She"; + case 9: + return "They"; + default: + return "To"; } return "The"; } -void replace(std::string & str, const std::string & needle, const std::string & replacement) { +void replace(std::string &str, const std::string &needle, const std::string &replacement) +{ size_t pos = 0; - while ((pos = str.find(needle, pos)) != std::string::npos) { + while ((pos = str.find(needle, pos)) != std::string::npos) + { str.replace(pos, needle.length(), replacement); pos += replacement.length(); } } -std::map json_parse(const std::string & fname) { +std::map json_parse(const std::string &fname) +{ std::map result; // read file into string std::string json; { std::ifstream ifs(fname); - if (!ifs) { + if (!ifs) + { fprintf(stderr, "Failed to open %s\n", fname.c_str()); exit(1); } json = std::string((std::istreambuf_iterator(ifs)), - (std::istreambuf_iterator())); + (std::istreambuf_iterator())); } - if (json[0] != '{') { + if (json[0] != '{') + { return result; } // parse json { - bool has_key = false; + bool has_key = false; bool in_token = false; std::string str_key = ""; std::string str_val = ""; int n = json.size(); - for (int i = 1; i < n; ++i) { - if (!in_token) { - if (json[i] == ' ') continue; - if (json[i] == '"') { + for (int i = 1; i < n; ++i) + { + if (!in_token) + { + if (json[i] == ' ') + continue; + if (json[i] == '"') + { in_token = true; continue; } - } else { - if (json[i] == '\\' && i+1 < n) { - if 
(has_key == false) { + } + else + { + if (json[i] == '\\' && i + 1 < n) + { + if (has_key == false) + { str_key += json[i]; - } else { + } + else + { str_val += json[i]; } ++i; - } else if (json[i] == '"') { - if (has_key == false) { + } + else if (json[i] == '"') + { + if (has_key == false) + { has_key = true; ++i; - while (json[i] == ' ') ++i; + while (json[i] == ' ') + ++i; ++i; // : - while (json[i] == ' ') ++i; - if (json[i] != '\"') { - while (json[i] != ',' && json[i] != '}') { + while (json[i] == ' ') + ++i; + if (json[i] != '\"') + { + while (json[i] != ',' && json[i] != '}') + { str_val += json[i++]; } has_key = false; - } else { + } + else + { in_token = true; continue; } - } else { + } + else + { has_key = false; } - ::replace(str_key, "\\u0120", " " ); // \u0120 -> space + ::replace(str_key, "\\u0120", " "); // \u0120 -> space ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line - ::replace(str_key, "\\\"", "\""); // \\\" -> " + ::replace(str_key, "\\\"", "\""); // \\\" -> " - try { + try + { result[str_key] = std::stoi(str_val); - } catch (...) { - //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); - + } + catch (...) + { + // fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); } str_key = ""; str_val = ""; in_token = false; continue; } - if (has_key == false) { + if (has_key == false) + { str_key += json[i]; - } else { + } + else + { str_val += json[i]; } } @@ -228,7 +309,8 @@ std::map json_parse(const std::string & fname) { return result; } -std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { +std::vector gpt_tokenize(const gpt_vocab &vocab, const std::string &text) +{ std::vector words; // first split the text into words @@ -239,8 +321,10 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri std::regex re(pat); std::smatch m; - while (std::regex_search(str, m, re)) { - for (auto x : m) { + while (std::regex_search(str, m, re)) + { + for (auto x : m) + { words.push_back(x); } str = m.suffix(); @@ -249,30 +333,40 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri // find the longest tokens that form the words: std::vector tokens; - for (const auto & word : words) { - if (word.size() == 0) continue; + for (const auto &word : words) + { + if (word.size() == 0) + continue; int i = 0; int n = word.size(); - while (i < n) { + while (i < n) + { int j = n; - while (j > i) { - auto it = vocab.token_to_id.find(word.substr(i, j-i)); - if (it != vocab.token_to_id.end()) { + while (j > i) + { + auto it = vocab.token_to_id.find(word.substr(i, j - i)); + if (it != vocab.token_to_id.end()) + { tokens.push_back(it->second); i = j; break; } --j; } - if (i == n) { + if (i == n) + { break; } - if (j == i) { + if (j == i) + { auto sub = word.substr(i, 1); - if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { + if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) + { tokens.push_back(vocab.token_to_id.at(sub)); - } else { + } + else + { fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); } ++i; @@ -286,7 +380,8 @@ std::vector gpt_tokenize(const gpt_vocab & vocab, const std::stri // TODO: Calculate this constant from the vocabulary #define MAX_TOKEN_LEN 18 // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece -std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { +std::vector llama_tokenize(const gpt_vocab 
&vocab, const std::string &text, bool bos) +{ std::vector res; std::vector score; std::vector prev; @@ -296,16 +391,20 @@ std::vector llama_tokenize(const gpt_vocab & vocab, const std::st prev.resize(len + 1); // Forward pass - for (int i = 0; i < len; i++) { + for (int i = 0; i < len; i++) + { int max_len = std::min(len - i, MAX_TOKEN_LEN); - for (int sub_len = 1; sub_len <= len - i; sub_len++) { + for (int sub_len = 1; sub_len <= len - i; sub_len++) + { auto sub = text.substr(i, sub_len); auto token = vocab.token_to_id.find(sub); - if (token != vocab.token_to_id.end()) { + if (token != vocab.token_to_id.end()) + { int token_score = sub.length() * sub.length(); int local_score = score[i] + token_score; int next = i + sub_len; - if (score[next] < local_score) { + if (score[next] < local_score) + { score[next] = local_score; prev[next] = (*token).second; } @@ -315,19 +414,22 @@ std::vector llama_tokenize(const gpt_vocab & vocab, const std::st // Backward pass int i = len; - while (i > 0) { + while (i > 0) + { gpt_vocab::id token_id = prev[i]; - if (token_id == 0) { - // TODO: Return error or something more meaningful + if (token_id == 0) + { + // TODO: Return error or something more meaningful printf("failed to tokenize string!\n"); - break; + break; } res.push_back(token_id); auto token = (*vocab.id_to_token.find(token_id)).second; i -= token.length(); } - if (bos) { + if (bos) + { res.push_back(1); // TODO: replace with vocab.bos } @@ -337,66 +439,77 @@ std::vector llama_tokenize(const gpt_vocab & vocab, const std::st return res; } -bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { +bool gpt_vocab_init(const std::string &fname, gpt_vocab &vocab) +{ printf("%s: loading vocab from '%s'\n", __func__, fname.c_str()); vocab.token_to_id = ::json_parse(fname); - for (const auto & kv : vocab.token_to_id) { + for (const auto &kv : vocab.token_to_id) + { vocab.id_to_token[kv.second] = kv.first; } - printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size()); + printf("%s: vocab size = %d\n", __func__, (int)vocab.token_to_id.size()); // print the vocabulary - //for (auto kv : vocab.token_to_id) { + // for (auto kv : vocab.token_to_id) { // printf("'%s' -> %d\n", kv.first.data(), kv.second); //} return true; } - -void sample_top_k(std::vector> & logits_id, int top_k) { +void sample_top_k(std::vector> &logits_id, int top_k) +{ // find the top K tokens std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); + logits_id.begin(), + logits_id.begin() + top_k, logits_id.end(), + [](const std::pair &a, const std::pair &b) + { + return a.first > b.first; + }); logits_id.resize(top_k); } gpt_vocab::id llama_sample_top_p_top_k( - const gpt_vocab & vocab, - const float * logits, - std::vector & last_n_tokens, - double repeat_penalty, - int top_k, - double top_p, - double temp, - std::mt19937 & rng) { + const gpt_vocab &vocab, + const float *logits, + std::vector &last_n_tokens, + double repeat_penalty, + int top_k, + double top_p, + double temp, + std::mt19937 &rng) +{ int n_logits = vocab.id_to_token.size(); std::vector> logits_id; logits_id.reserve(n_logits); { - const double scale = 1.0/temp; - for (int i = 0; i < n_logits; ++i) { + const double scale = 1.0 / temp; + for (int i = 0; i < n_logits; ++i) + { // repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if 
(std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) { + if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) + { // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (logits[i] < 0.0) { - logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i)); - } else { - logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(logits[i]*scale, i)); + if (logits[i] < 0.0) + { + logits_id.push_back(std::make_pair(logits[i] * scale * repeat_penalty, i)); + } + else + { + logits_id.push_back(std::make_pair(logits[i] * scale / repeat_penalty, i)); + } + } + else + { + logits_id.push_back(std::make_pair(logits[i] * scale, i)); } } } @@ -404,7 +517,8 @@ gpt_vocab::id llama_sample_top_p_top_k( sample_top_k(logits_id, top_k); double maxl = -INFINITY; - for (const auto & kv : logits_id) { + for (const auto &kv : logits_id) + { maxl = std::max(maxl, kv.first); } @@ -413,40 +527,46 @@ gpt_vocab::id llama_sample_top_p_top_k( probs.reserve(logits_id.size()); double sum = 0.0; - for (const auto & kv : logits_id) { + for (const auto &kv : logits_id) + { double p = exp(kv.first - maxl); probs.push_back(p); sum += p; } // normalize the probs - for (auto & p : probs) { + for (auto &p : probs) + { p /= sum; } - if (top_p < 1.0f) { + if (top_p < 1.0f) + { double cumsum = 0.0f; - for (int i = 0; i < (int) probs.size(); i++) { + for (int i = 0; i < (int)probs.size(); i++) + { cumsum += probs[i]; - if (cumsum >= top_p) { + if (cumsum >= top_p) + { probs.resize(i + 1); logits_id.resize(i + 1); break; } } - cumsum = 1.0/cumsum; - for (int i = 0; i < (int) probs.size(); i++) { + cumsum = 1.0 / cumsum; + for (int i = 0; i < (int)probs.size(); i++) + { probs[i] *= cumsum; } } - //printf("\n"); - //for (int i = 0; i < (int) 10; i++) { - // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); - //} - //printf("\n\n"); - //exit(0); + // printf("\n"); + // for (int i = 0; i < (int) 10; i++) { + // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); + // } + // printf("\n\n"); + // exit(0); std::discrete_distribution<> dist(probs.begin(), probs.end()); int idx = dist(rng); @@ -454,44 +574,48 @@ gpt_vocab::id llama_sample_top_p_top_k( return logits_id[idx].second; } - -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_0(float *src, void *dst, int n, int k, int qk, int64_t *hist) +{ const int nb = k / qk; - const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); - const size_t row_size = nb*bs; + const size_t bs = (sizeof(float) + sizeof(uint8_t) * qk / 2); + const size_t row_size = nb * bs; assert(k % qk == 0); const size_t pp_size = qk / 2; - uint8_t *pp = static_cast(alloca(pp_size)); + uint8_t *pp = static_cast(alloca(pp_size)); - char * pdst = (char *) dst; + char *pdst = (char *)dst; - for (int j = 0; j < n; j += k) { - uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); - uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); + for (int j = 0; j < n; j += k) + { + uint8_t *pd = (uint8_t *)(pdst + (j / k) * row_size + 0 * bs); + uint8_t *pb = (uint8_t *)(pdst + (j / k) * row_size + 0 * bs + sizeof(float)); - for (int i = 0; i < nb; i++) { + for (int i = 0; i < nb; i++) + { float amax = 0.0f; // absolute max { - for (int l = 0; l < qk; l++) { - const float v = src[j + i*qk + 
l]; + for (int l = 0; l < qk; l++) + { + const float v = src[j + i * qk + l]; amax = std::max(amax, fabsf(v)); } const float d = amax / ((1 << 3) - 1); - const float id = d ? 1.0f/d : 0.0f; + const float id = d ? 1.0f / d : 0.0f; - *(float *) pd = d; + *(float *)pd = d; pd += bs; - for (int l = 0; l < qk; l += 2) { - const float v0 = (src[j + i*qk + l + 0])*id; - const float v1 = (src[j + i*qk + l + 1])*id; + for (int l = 0; l < qk; l += 2) + { + const float v0 = (src[j + i * qk + l + 0]) * id; + const float v1 = (src[j + i * qk + l + 1]) * id; - const uint8_t vi0 = ((int8_t) (round(v0))) + 8; - const uint8_t vi1 = ((int8_t) (round(v1))) + 8; + const uint8_t vi0 = ((int8_t)(round(v0))) + 8; + const uint8_t vi1 = ((int8_t)(round(v1))) + 8; assert(vi0 >= 0 && vi0 < 16); assert(vi1 >= 0 && vi1 < 16); @@ -499,7 +623,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t hist[vi0]++; hist[vi1]++; - pp[l/2] = vi0 | (vi1 << 4); + pp[l / 2] = vi0 | (vi1 << 4); } memcpy(pb, pp, pp_size); @@ -508,47 +632,54 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t } } - return (n/k)*row_size; + return (n / k) * row_size; } -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_1(float *src, void *dst, int n, int k, int qk, int64_t *hist) +{ const int nb = k / qk; - const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2); + const size_t row_size = nb * (2 * sizeof(float) + sizeof(uint8_t) * qk / 2); assert(k % qk == 0); const size_t pp_size = qk / 2; - uint8_t *pp = static_cast(alloca(pp_size)); + uint8_t *pp = static_cast(alloca(pp_size)); - char * pdst = (char *) dst; + char *pdst = (char *)dst; - for (int j = 0; j < n; j += k) { - float * pm = (float *) (pdst + (j/k)*row_size); - float * pd = (float *) (pm + nb); - uint8_t * pb = (uint8_t *) (pd + nb); + for (int j = 0; j < n; j += k) + { + float *pm = (float *)(pdst + (j / k) * row_size); + float *pd = (float *)(pm + nb); + uint8_t *pb = (uint8_t *)(pd + nb); - //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); + // printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); - for (int i = 0; i < nb; i++) { + for (int i = 0; i < nb; i++) + { float min = std::numeric_limits::max(); float max = std::numeric_limits::min(); { - for (int l = 0; l < qk; l++) { - const float v = src[j + i*qk + l]; - if (v < min) min = v; - if (v > max) max = v; + for (int l = 0; l < qk; l++) + { + const float v = src[j + i * qk + l]; + if (v < min) + min = v; + if (v > max) + max = v; } const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; + const float id = d ? 
1.0f / d : 0.0f; pm[i] = min; pd[i] = d; - for (int l = 0; l < qk; l += 2) { - const float v0 = (src[j + i*qk + l + 0] - min)*id; - const float v1 = (src[j + i*qk + l + 1] - min)*id; + for (int l = 0; l < qk; l += 2) + { + const float v0 = (src[j + i * qk + l + 0] - min) * id; + const float v1 = (src[j + i * qk + l + 1] - min) * id; const uint8_t vi0 = round(v0); const uint8_t vi1 = round(v1); @@ -559,13 +690,13 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t hist[vi0]++; hist[vi1]++; - pp[l/2] = vi0 | (vi1 << 4); + pp[l / 2] = vi0 | (vi1 << 4); } - memcpy(pb + i*qk/2, pp, pp_size); + memcpy(pb + i * qk / 2, pp, pp_size); } } } - return (n/k)*row_size; + return (n / k) * row_size; } diff --git a/utils.h b/utils.h index 2a843371a35e0..b81b9868a6387 100644 --- a/utils.h +++ b/utils.h @@ -13,53 +13,55 @@ // // The default parameters -struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = 128; // new tokens to predict - int32_t repeat_last_n = 64; // last n tokens to penalize - int32_t n_ctx = 2048; //context size - +struct gpt_params +{ + int32_t seed = -1; // RNG seed + int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency()); + int32_t n_predict = 128; // new tokens to predict + int32_t repeat_last_n = 64; // last n tokens to penalize + int32_t n_ctx = 2048; // context size + // sampling parameters int32_t top_k = 40; - float top_p = 0.95f; - float temp = 0.10f; - float repeat_penalty = 1.30f; + float top_p = 0.95f; + float temp = 0.10f; + float repeat_penalty = 1.30f; int32_t n_batch = 8; // batch size for prompt processing - std::string model = "ggml-alpaca-7b-q4.bin"; // model path + std::string model = "gpt4all-lora-quantized.bin"; // model path std::string prompt; bool use_color = true; // use color to distinguish generations and inputs - bool interactive = true; // interactive mode + bool interactive = true; // interactive mode bool interactive_start = true; // reverse prompt immediately - std::string antiprompt = ""; // string upon seeing which more user input is prompted + std::string antiprompt = ""; // string upon seeing which more user input is prompted }; -bool gpt_params_parse(int argc, char ** argv, gpt_params & params); +bool gpt_params_parse(int argc, char **argv, gpt_params ¶ms); -void gpt_print_usage(int argc, char ** argv, const gpt_params & params); +void gpt_print_usage(int argc, char **argv, const gpt_params ¶ms); -std::string gpt_random_prompt(std::mt19937 & rng); +std::string gpt_random_prompt(std::mt19937 &rng); // // Vocab utils // -struct gpt_vocab { - using id = int32_t; +struct gpt_vocab +{ + using id = int32_t; using token = std::string; std::map token_to_id; std::map id_to_token; }; -void replace(std::string & str, const std::string & needle, const std::string & replacement); +void replace(std::string &str, const std::string &needle, const std::string &replacement); // poor-man's JSON parsing -std::map json_parse(const std::string & fname); +std::map json_parse(const std::string &fname); // split text into tokens // @@ -71,14 +73,14 @@ std::map json_parse(const std::string & fname); // Regex (C++): // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" // -std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text); +std::vector gpt_tokenize(const gpt_vocab &vocab, const std::string &text); // TODO: this is probably wrong, but I cannot 
figure out how this tokenizer works .. // ref: https://github.com/google/sentencepiece -std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos); +std::vector llama_tokenize(const gpt_vocab &vocab, const std::string &text, bool bos); // load the tokens from encoder.json -bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); +bool gpt_vocab_init(const std::string &fname, gpt_vocab &vocab); // sample next token given probabilities for each embedding // @@ -86,21 +88,21 @@ bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); // - from them, consider only the top tokens with cumulative probability > P // gpt_vocab::id llama_sample_top_p_top_k( - const gpt_vocab & vocab, - const float * logits, - std::vector & last_n_tokens, - double repeat_penalty, - int top_k, - double top_p, - double temp, - std::mt19937 & rng); + const gpt_vocab &vocab, + const float *logits, + std::vector &last_n_tokens, + double repeat_penalty, + int top_k, + double top_p, + double temp, + std::mt19937 &rng); // filer to top K tokens from list of logits -void sample_top_k(std::vector> & logits_id, int top_k); +void sample_top_k(std::vector> &logits_id, int top_k); // // Quantization // -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_0(float *src, void *dst, int n, int k, int qk, int64_t *hist); +size_t ggml_quantize_q4_1(float *src, void *dst, int n, int k, int qk, int64_t *hist);
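
Note on the controller handshake (an illustrative sketch, not part of the patches above): the first commit makes chat.cpp print a form feed ("\f") instead of "\n> " when it is ready for the next line of input, and controller.py reads the binary's stdout one byte at a time until it sees that sentinel, then writes a prompt followed by a newline. The sketch below condenses that same loop under a few assumptions: the helper name read_until_prompt and the end-of-stream check are additions for illustration, the imports are gathered at the top, and the trailing "bouncer" block from controller.py (which refers to a bouncer_process that is never created) is left out.

import random
import subprocess
import sys
import time

# Launch the chat binary built by this patch's Makefile; the path matches controller.py.
bot = subprocess.Popen("./chat", stdin=subprocess.PIPE, stdout=subprocess.PIPE)


def read_until_prompt(proc):
    # Collect the model's reply line by line until the "\f" sentinel that
    # chat.cpp prints when it is ready for more input. Bytes are buffered
    # until they form a complete UTF-8 character, since multi-byte
    # characters arrive one byte at a time.
    reply = [""]
    pending = b""
    while True:
        chunk = proc.stdout.read(1)
        if not chunk:                 # EOF: the chat process exited (illustrative addition)
            return "\n".join(reply)
        pending += chunk
        try:
            ch = pending.decode("utf-8")
        except UnicodeDecodeError:
            if len(pending) > 4:      # not a valid UTF-8 prefix; discard it
                pending = b""
            continue
        pending = b""
        if ch == "\f":                # ready-for-input sentinel from chat.cpp
            return "\n".join(reply)
        if ch == "\n":
            reply.append("")
        else:
            reply[-1] += ch
        sys.stdout.write(ch)          # echo the reply as it streams in
        sys.stdout.flush()


prompts = [
    "Write me a letter from the perspective of a cat",
    "Write me a short poem",
    "Tell me how to hard boil an egg",
]

while True:
    they_say = read_until_prompt(bot)
    print("\nTHEY SAY\n--------")
    print(they_say)
    print("--------")
    # chat.cpp keeps reading while a line ends in a backslash, so embedded
    # newlines are escaped as backslash + newline before sending, as in controller.py.
    prompt = random.choice(prompts).replace("\n", "\\\n").encode("utf-8")
    time.sleep(2)                     # pacing between turns, as in controller.py
    bot.stdin.write(prompt + b"\n")
    bot.stdin.flush()

The form feed works as a sentinel precisely because it cannot appear in generated text the way "\n> " can, which is why the interactive prompt marker was replaced rather than parsed.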
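
The flag parsing in utils.cpp also allows driving the same binary in one-shot mode from a script: passing -p/--prompt switches interactive mode off in gpt_params_parse, so the process prints a single completion and exits. The example below is a sketch under two assumptions: the model filename is simply the default added to utils.h (gpt4all-lora-quantized.bin), and the sampling values restate the defaults from gpt_params, so both should be adjusted for a local setup.

import subprocess

# One-shot generation: -p/--prompt disables interactive mode in gpt_params_parse,
# so ./chat prints a completion for the given prompt and then exits.
result = subprocess.run(
    [
        "./chat",
        "-m", "gpt4all-lora-quantized.bin",  # default model path from utils.h
        "-t", "4",                           # threads
        "-c", "2048",                        # context size (default in gpt_params)
        "--temp", "0.1",                     # sampling defaults restated explicitly
        "--top_k", "40",
        "--top_p", "0.95",
        "-p", "Tell me how to hard boil an egg",
    ],
    capture_output=True,
    text=True,
)
print(result.stdout)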