From 338423439d53ad0714084a6287304e38e9d19185 Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Mon, 8 Jan 2024 23:52:58 -0800
Subject: [PATCH] Upgrade to cosmocc 3.2.4

We're now able to use the C++ exception code in the upstream llama.cpp,
json.h, and httplib.h projects. This is important since it means things
like invalid JSON requests won't cause the entire server to crash. This
change is also going to make merging from upstream easier going forward.

Fixes #116
---
 .gitignore                      |  2 +-
 build/config.mk                 |  8 +--
 llama.cpp/base64.h              |  6 +--
 llama.cpp/common.cpp            | 27 ++--------
 llama.cpp/grammar-parser.cpp    | 33 +++++-------
 llama.cpp/llama.cpp             | 96 +++++++++++----------------------
 llama.cpp/llava/clip.cpp        |  9 ++--
 llama.cpp/quantize/quantize.cpp |  4 --
 llama.cpp/runtime.h             |  7 ---
 llama.cpp/server/server.cpp     | 17 +-----
 llama.cpp/unicode.h             | 20 ++++---
 llamafile/cuda.c                | 12 ++---
 llamafile/llamafile.c           |  8 +--
 llamafile/rocm.sh               |  1 +
 14 files changed, 80 insertions(+), 170 deletions(-)
 delete mode 100644 llama.cpp/runtime.h

diff --git a/.gitignore b/.gitignore
index 523dcb0242..fcbdcd99cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 # -*- conf -*-
 /o
-/cosmocc
+/.cosmocc
 /TAGS
 /HTAGS
 /cosmocc
diff --git a/build/config.mk b/build/config.mk
index e508b776d4..9373e5e54a 100644
--- a/build/config.mk
+++ b/build/config.mk
@@ -2,7 +2,7 @@ #── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
 PREFIX = /usr/local
-COSMOCC = cosmocc/3.2.3
+COSMOCC = .cosmocc/3.2.4
 TOOLCHAIN = $(COSMOCC)/bin/cosmo
 AR = $(TOOLCHAIN)ar
@@ -13,7 +13,7 @@ MKDEPS = $(COSMOCC)/bin/mkdeps
 INSTALL = install
 ARFLAGS = rcsD
-CCFLAGS = -g -O3
+CCFLAGS = -g -O3 -fexceptions
 CPPFLAGS_ = -iquote. -mcosmo
 TARGET_ARCH = -Xx86_64-mssse3
@@ -50,5 +50,5 @@ clean:; rm -rf o
 .PHONY: distclean
 distclean:; rm -rf o cosmocc
-cosmocc/3.2.3:
-	build/download-cosmocc.sh $@ 3.2.3 6b2bffb9bc4ebff41a2e4657aab6b8e725729fb2fbd128b06196b2b28aedb322
+.cosmocc/3.2.4:
+	build/download-cosmocc.sh $@ 3.2.4 d2fa6dbf6f987310494581deff5b915dbdc5ca701f20f7613bb0dcf1de2ee511
diff --git a/llama.cpp/base64.h b/llama.cpp/base64.h
index 0234d2281c..ab4775d364 100644
--- a/llama.cpp/base64.h
+++ b/llama.cpp/base64.h
@@ -33,8 +33,6 @@ For more information, please refer to
 #include
 #include
-#include "runtime.h"
-
 class base64_error : public std::runtime_error
 {
 public:
@@ -237,7 +235,7 @@ class base64
             ++in_begin;

             if (c != '=') {
-                ThrowRuntimeError("invalid base64 character.");
+                throw std::runtime_error("invalid base64 character.");
             }
         }
     }
@@ -387,7 +385,7 @@ class base64
             }
         }

-        ThrowRuntimeError("invalid base64 character.");
+        throw std::runtime_error("invalid base64 character.");
     }
 };
diff --git a/llama.cpp/common.cpp b/llama.cpp/common.cpp
index 453ae69885..3102384411 100644
--- a/llama.cpp/common.cpp
+++ b/llama.cpp/common.cpp
@@ -4,7 +4,6 @@
 #include "llama.h"
 #include "ggml-cuda.h"
 #include "ggml-metal.h"
-#include "runtime.h"
 #include
 #include
@@ -124,21 +123,17 @@ void process_escapes(std::string& input) {

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     bool result = true;
-#ifndef _LIBCPP_NO_EXCEPTIONS
     try {
-#endif
         if (!gpt_params_parse_ex(argc, argv, params)) {
             gpt_print_usage(argc, argv, gpt_params());
             exit(0);
         }
-#ifndef _LIBCPP_NO_EXCEPTIONS
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         gpt_print_usage(argc, argv, gpt_params());
         exit(1);
     }
-#endif
     if (FLAG_gpu == LLAMAFILE_GPU_DISABLE) {
         params.n_gpu_layers = 0;
     }
@@ -669,7 +664,6 @@ bool gpt_params_parse_ex(int argc, char
** argv, gpt_params & params) { llama_token key; char sign = 0; std::string value_str; -#ifndef _LIBCPP_NO_EXCEPTIONS try { if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); @@ -680,19 +674,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#else - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - errno = 0; - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - if (errno) { - invalid_param = true; - break; - } - } else { - invalid_param = true; - break; - } -#endif } else if (arg == "-h" || arg == "--help") { return false; @@ -795,17 +776,17 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS } else { - ThrowInvalidArgument("error: unknown argument: " + arg); + throw std::invalid_argument("error: unknown argument: " + arg); } } if (invalid_param) { - ThrowInvalidArgument("error: invalid parameter for argument: " + arg); + throw std::invalid_argument("error: invalid parameter for argument: " + arg); } if (params.prompt_cache_all && (params.interactive || params.interactive_first || params.instruct)) { - ThrowInvalidArgument("error: --prompt-cache-all not supported in interactive mode yet\n"); + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } if (params.escape) { @@ -1081,7 +1062,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) { return GGML_TYPE_Q5_1; } - ThrowRuntimeError("Invalid cache type: " + s); + throw std::runtime_error("Invalid cache type: " + s); } struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { diff --git a/llama.cpp/grammar-parser.cpp b/llama.cpp/grammar-parser.cpp index 224e9c3e87..bf89a96f36 100644 --- a/llama.cpp/grammar-parser.cpp +++ b/llama.cpp/grammar-parser.cpp @@ -1,5 +1,4 @@ #include "grammar-parser.h" -#include "runtime.h" #include #include #include @@ -69,7 +68,7 @@ namespace grammar_parser { } } if (pos != end) { - ThrowRuntimeError("expecting " + std::to_string(size) + " hex chars at " + src); + throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src); } return std::make_pair(value, pos); } @@ -95,7 +94,7 @@ namespace grammar_parser { pos++; } if (pos == src) { - ThrowRuntimeError(std::string("expecting name at ") + src); + throw std::runtime_error(std::string("expecting name at ") + src); } return pos; } @@ -115,12 +114,12 @@ namespace grammar_parser { case ']': return std::make_pair(src[1], src + 2); default: - ThrowRuntimeError(std::string("unknown escape at ") + src); + throw std::runtime_error(std::string("unknown escape at ") + src); } } else if (*src) { return decode_utf8(src); } - ThrowRuntimeError("unexpected end of input"); + throw std::runtime_error("unexpected end of input"); } const char * parse_alternates( @@ -186,12 +185,12 @@ namespace grammar_parser { // output reference to synthesized rule out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); if (*pos != ')') { - ThrowRuntimeError(std::string("expecting ')' at ") + pos); + throw std::runtime_error(std::string("expecting ')' at ") + pos); } pos = parse_space(pos + 1, is_nested); } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator if (last_sym_start == out_elements.size()) 
{ - ThrowRuntimeError(std::string("expecting preceding item to */+/? at ") + pos); + throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos); } // apply transformation to previous symbol (last_sym_start to end) according to @@ -256,7 +255,7 @@ namespace grammar_parser { const std::string name(src, name_len); if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) { - ThrowRuntimeError(std::string("expecting ::= at ") + pos); + throw std::runtime_error(std::string("expecting ::= at ") + pos); } pos = parse_space(pos + 3, true); @@ -267,27 +266,23 @@ namespace grammar_parser { } else if (*pos == '\n') { pos++; } else if (*pos) { - ThrowRuntimeError(std::string("expecting newline or end at ") + pos); + throw std::runtime_error(std::string("expecting newline or end at ") + pos); } return parse_space(pos, true); } parse_state parse(const char * src) { -#ifndef _LIBCPP_NO_EXCEPTIONS try { -#endif parse_state state; const char * pos = parse_space(src, true); while (*pos) { pos = parse_rule(state, pos); } return state; -#ifndef _LIBCPP_NO_EXCEPTIONS } catch (const std::exception & err) { fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what()); return parse_state(); } -#endif } static void print_grammar_char(FILE * file, uint32_t c) { @@ -345,7 +340,7 @@ namespace grammar_parser { const std::vector & rule, const std::map & symbol_id_names) { if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) { - ThrowRuntimeError( + throw std::runtime_error( "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id)); } fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str()); @@ -353,7 +348,7 @@ namespace grammar_parser { llama_grammar_element elem = rule[i]; switch (elem.type) { case LLAMA_GRETYPE_END: - ThrowRuntimeError( + throw std::runtime_error( "unexpected end of rule: " + std::to_string(rule_id) + "," + std::to_string(i)); case LLAMA_GRETYPE_ALT: @@ -372,7 +367,7 @@ namespace grammar_parser { break; case LLAMA_GRETYPE_CHAR_RNG_UPPER: if (i == 0 || !is_char_element(rule[i - 1])) { - ThrowRuntimeError( + throw std::runtime_error( "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " + std::to_string(rule_id) + "," + std::to_string(i)); } @@ -381,7 +376,7 @@ namespace grammar_parser { break; case LLAMA_GRETYPE_CHAR_ALT: if (i == 0 || !is_char_element(rule[i - 1])) { - ThrowRuntimeError( + throw std::runtime_error( "LLAMA_GRETYPE_CHAR_ALT without preceding char: " + std::to_string(rule_id) + "," + std::to_string(i)); } @@ -402,9 +397,7 @@ namespace grammar_parser { } void print_grammar(FILE * file, const parse_state & state) { -#ifndef _LIBCPP_NO_EXCEPTIONS try { -#endif std::map symbol_id_names; for (const auto & kv : state.symbol_ids) { symbol_id_names[kv.second] = kv.first; @@ -415,11 +408,9 @@ namespace grammar_parser { print_rule(file, uint32_t(i), state.rules[i], symbol_id_names); // fprintf(file, "\n"); } -#ifndef _LIBCPP_NO_EXCEPTIONS } catch (const std::exception & err) { fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what()); } -#endif } std::vector parse_state::c_rules() { diff --git a/llama.cpp/llama.cpp b/llama.cpp/llama.cpp index b0bbb26675..fd71e1850d 100644 --- a/llama.cpp/llama.cpp +++ b/llama.cpp/llama.cpp @@ -6,7 +6,6 @@ #include "llama.h" #include "unicode.h" -#include "runtime.h" #include "ggml.h" @@ -86,19 +85,6 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, // helpers // -void ThrowRuntimeError(std::string message) { -#ifndef _LIBCPP_NO_EXCEPTIONS - 
ThrowRuntimeError(message); -#else - fprintf(stderr, "error: %s\n", message.c_str()); - exit(1); -#endif -} - -void ThrowInvalidArgument(std::string message) { - ThrowRuntimeError(message); -} - static size_t utf8_len(char src) { const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; uint8_t highbits = static_cast(src) >> 4; @@ -122,7 +108,7 @@ static void replace_all(std::string & s, const std::string & search, const std:: static bool is_float_close(float a, float b, float abs_tol) { // Check for non-negative tolerance if (abs_tol < 0.0) { - ThrowInvalidArgument("Tolerance must be non-negative"); + throw std::invalid_argument("Tolerance must be non-negative"); } // Exact equality check @@ -748,7 +734,7 @@ struct llama_file { llama_file(const char * fname, const char * mode) { file = llamafile_open_gguf(fname, mode); if (!file) { - ThrowRuntimeError(format("failed to open %s: %s", fname, strerror(errno))); + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); } } @@ -766,10 +752,10 @@ struct llama_file { } long rc = llamafile_read(file, ptr, len); if (rc == -1) { - ThrowRuntimeError(format("read error: %s", strerror(errno))); + throw std::runtime_error(format("read error: %s", strerror(errno))); } if (!rc) { - ThrowRuntimeError("unexpectedly reached end of file"); + throw std::runtime_error("unexpectedly reached end of file"); } } @@ -783,7 +769,7 @@ struct llama_file { errno = 0; long rc = llamafile_write(file, ptr, len); if (rc == -1) { - ThrowRuntimeError(format("write error: %s", strerror(errno))); + throw std::runtime_error(format("write error: %s", strerror(errno))); } } @@ -832,7 +818,7 @@ struct llama_mmap { if (numa) { prefetch = 0; } addr = mmap(NULL, size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0); if (addr == MAP_FAILED) { - ThrowRuntimeError(format("mmap failed: %s", strerror(errno))); + throw std::runtime_error(format("mmap failed: %s", strerror(errno))); } if (prefetch > 0) { @@ -1828,7 +1814,7 @@ namespace GGUFMeta { const enum gguf_type kt = gguf_get_kv_type(ctx, k); if (kt != GKV::gt) { - ThrowRuntimeError(format("key %s has wrong type %s but expected type %s", + throw std::runtime_error(format("key %s has wrong type %s but expected type %s", gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt))); } return GKV::getter(ctx, k); @@ -1860,7 +1846,7 @@ namespace GGUFMeta { } break; default: // Shouldn't be possible to end up here, but just in case... - ThrowRuntimeError( + throw std::runtime_error( format("Unsupported attempt to override %s type for metadata key %s\n", override_type_to_str(override->tag), override->key)); } @@ -1908,7 +1894,7 @@ namespace GGUFMeta { (void)override; if (!override) { return false; } // Currently, we should never end up here so it would be a bug if we do. - ThrowRuntimeError(format("Unsupported attempt to override string type for metadata key %s\n", + throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", override ? 
override->key : "NULL")); } @@ -1968,7 +1954,7 @@ struct llama_model_loader { ctx_gguf = gguf_init_from_file(file.file, params); if (!ctx_gguf) { - ThrowRuntimeError(format("%s: failed to load model from %s\n", __func__, fname.c_str())); + throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } llamafile_seek(file.file, 0, SEEK_SET); @@ -2094,7 +2080,7 @@ struct llama_model_loader { if (kid < 0) { if (required) { - ThrowRuntimeError(format("key not found in model: %s", key.c_str())); + throw std::runtime_error(format("key not found in model: %s", key.c_str())); } return false; } @@ -2123,7 +2109,7 @@ struct llama_model_loader { const bool found = GGUFMeta::GKV::set(ctx_gguf, key, result, override); if (required && !found) { - ThrowRuntimeError(format("key not found in model: %s", key.c_str())); + throw std::runtime_error(format("key not found in model: %s", key.c_str())); } return found; @@ -2171,12 +2157,12 @@ struct llama_model_loader { if (!required) { return NULL; } - ThrowRuntimeError(format("%s: tensor '%s' not found", __func__, name.c_str())); + throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } if (backend == GGML_BACKEND_GPU_SPLIT) { if (ne.size() == 1) { - ThrowRuntimeError(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); + throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); } } @@ -2189,7 +2175,7 @@ struct llama_model_loader { } } if (!is_ok) { - ThrowRuntimeError( + throw std::runtime_error( format("%s: tensor '%s' has wrong shape; expected %s, got %s", __func__, name.c_str(), llama_format_tensor_shape(ne).c_str(), @@ -2202,7 +2188,7 @@ struct llama_model_loader { void done_getting_tensors() const { if (n_created != n_tensors) { - ThrowRuntimeError(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); + throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); } } @@ -2210,7 +2196,7 @@ struct llama_model_loader { const int idx = gguf_find_tensor(ctx_gguf, name); if (idx < 0) { - ThrowRuntimeError(format("%s: tensor '%s' not found in the file", __func__, name)); + throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); } return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); @@ -2424,7 +2410,7 @@ static const char * llama_model_type_name(e_model type) { static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); if (model.arch == LLM_ARCH_UNKNOWN) { - ThrowRuntimeError("unknown model architecture: '" + ml.get_arch_name() + "'"); + throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'"); } } @@ -2502,7 +2488,7 @@ static void llm_load_hparams( if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) { if (hparams.n_rot != hparams.n_embd / hparams.n_head) { - ThrowRuntimeError(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head)); + throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head)); } } // gpt-neox n_rot = rotary_pct * (n_embd / n_head) @@ -2675,7 +2661,7 @@ static void llm_load_vocab( const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); if (token_idx == -1) { - ThrowRuntimeError("cannot find tokenizer vocab in model file\n"); + throw 
std::runtime_error("cannot find tokenizer vocab in model file\n"); } const float * scores = nullptr; @@ -2711,7 +2697,7 @@ static void llm_load_vocab( // read bpe merges and populate bpe ranks const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); if (merges_keyidx == -1) { - ThrowRuntimeError("cannot find tokenizer merges in model file\n"); + throw std::runtime_error("cannot find tokenizer merges in model file\n"); } const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); @@ -2995,7 +2981,7 @@ static bool llm_load_tensors( model.ctx = ggml_init(params); if (!model.ctx) { - ThrowRuntimeError(format("ggml_init() failed")); + throw std::runtime_error(format("ggml_init() failed")); } } @@ -3675,7 +3661,7 @@ static bool llm_load_tensors( } } break; default: - ThrowRuntimeError("unknown architecture"); + throw std::runtime_error("unknown architecture"); } } @@ -3798,9 +3784,7 @@ static bool llm_load_tensors( // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { -#ifndef _LIBCPP_NO_EXCEPTIONS try { -#endif llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); model.hparams.vocab_only = params.vocab_only; @@ -3812,7 +3796,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons llm_load_print_meta(ml, model); if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { - ThrowRuntimeError("vocab size mismatch"); + throw std::runtime_error("vocab size mismatch"); } if (params.vocab_only) { @@ -3826,12 +3810,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons )) { return -2; } -#ifndef _LIBCPP_NO_EXCEPTIONS } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return -1; } -#endif return 0; } @@ -7033,7 +7015,7 @@ struct llm_tokenizer_bpe { std::string byte_str(1, *j); auto token_multibyte = vocab.token_to_id.find(byte_str); if (token_multibyte == vocab.token_to_id.end()) { - ThrowRuntimeError("ERROR: byte not found in vocab"); + throw std::runtime_error("ERROR: byte not found in vocab"); } output.push_back((*token_multibyte).second); } @@ -8663,10 +8645,10 @@ static void llama_convert_tensor_internal( if (ggml_is_quantized(tensor->type)) { qtype = ggml_internal_get_type_traits(tensor->type); if (qtype.to_float == NULL) { - ThrowRuntimeError(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); + throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); } } else if (tensor->type != GGML_TYPE_F16) { - ThrowRuntimeError(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type))); + throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type))); } if (nthread < 2) { @@ -8823,7 +8805,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; - default: ThrowRuntimeError("\nUnsupported tensor size encountered\n"); + default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); } LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); ++qs.n_fallback; @@ -8856,7 +8838,7 @@ 
static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; - default: ThrowRuntimeError(format("invalid output file type %d\n", ftype)); + default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } int nthread = params->nthread; @@ -8999,7 +8981,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (tensor->type == GGML_TYPE_F32) { f32_data = (float *) tensor->data; } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { - ThrowRuntimeError(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); + throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); } else { llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread); f32_data = (float *) f32_conv_buf.data(); @@ -9271,7 +9253,7 @@ static int llama_apply_lora_from_file_internal( #if !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { if (dest_t->type != GGML_TYPE_F16) { - ThrowRuntimeError(format( + throw std::runtime_error(format( "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type)); } offload_func = ggml_cuda_assign_buffers; @@ -9832,43 +9814,31 @@ uint32_t llama_model_quantize( const char * fname_inp, const char * fname_out, const llama_model_quantize_params * params) { -#ifndef _LIBCPP_NO_EXCEPTIONS try { -#endif llama_model_quantize_internal(fname_inp, fname_out, params); return 0; -#ifndef _LIBCPP_NO_EXCEPTIONS } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what()); return 1; } -#endif } int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { -#ifndef _LIBCPP_NO_EXCEPTIONS try { -#endif return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads); -#ifndef _LIBCPP_NO_EXCEPTIONS } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; } -#endif } int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { -#ifndef _LIBCPP_NO_EXCEPTIONS try { -#endif return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); -#ifndef _LIBCPP_NO_EXCEPTIONS } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; } -#endif } struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { @@ -10421,16 +10391,12 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c } bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { -#ifndef _LIBCPP_NO_EXCEPTIONS try { -#endif return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); -#ifndef _LIBCPP_NO_EXCEPTIONS } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading session file: %s\n", err.what()); return 
false; } -#endif } bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { diff --git a/llama.cpp/llava/clip.cpp b/llama.cpp/llava/clip.cpp index 5e467dbf9d..0e6e236ae2 100644 --- a/llama.cpp/llava/clip.cpp +++ b/llama.cpp/llava/clip.cpp @@ -19,7 +19,6 @@ #include "clip.h" #include "llamafile/log.h" #include "llama.cpp/ggml.h" -#include "llama.cpp/runtime.h" #include "llama.cpp/ggml-cuda.h" #include "llama.cpp/ggml-metal.h" #include "llama.cpp/ggml-alloc.h" @@ -98,7 +97,7 @@ static int get_key_idx(const gguf_context * ctx, const char * key) { int i = gguf_find_key(ctx, key); if (i == -1) { fprintf(stderr, "key %s not found in file\n", key); - ThrowRuntimeError(format("Missing required key: %s", key)); + throw std::runtime_error(format("Missing required key: %s", key)); } return i; @@ -119,7 +118,7 @@ static float get_f32(const gguf_context * ctx, const std::string & key) { static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) { struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); if (!cur) { - ThrowRuntimeError(format("%s: unable to find tensor %s\n", __func__, name.c_str())); + throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str())); } return cur; @@ -142,7 +141,7 @@ static std::string get_ftype(int ftype) { case 8: return "q8_0"; default: - ThrowRuntimeError(format("%s: Unrecognized file type: %d\n", __func__, ftype)); + throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype)); } } @@ -483,7 +482,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { if (file) llamafile_close(file); if (!file || !ctx) { OpenFailed: - ThrowRuntimeError(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname)); + throw std::runtime_error(format("%s: failed to load CLIP model from %s. 
Does this file exist?\n", __func__, fname)); } if (verbosity >= 1) { diff --git a/llama.cpp/quantize/quantize.cpp b/llama.cpp/quantize/quantize.cpp index b46d3b18c7..0e23f3bc6c 100644 --- a/llama.cpp/quantize/quantize.cpp +++ b/llama.cpp/quantize/quantize.cpp @@ -177,17 +177,13 @@ int main(int argc, char ** argv) { // parse nthreads if (argc > arg_idx) { -#ifndef _LIBCPP_NO_EXCEPTIONS try { -#endif params.nthread = std::stoi(argv[arg_idx]); -#ifndef _LIBCPP_NO_EXCEPTIONS } catch (const std::exception & e) { fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what()); return 1; } -#endif } print_build_info(); diff --git a/llama.cpp/runtime.h b/llama.cpp/runtime.h deleted file mode 100644 index c2909206f1..0000000000 --- a/llama.cpp/runtime.h +++ /dev/null @@ -1,7 +0,0 @@ -// -*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*- -// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi -#pragma once -#include - -void ThrowRuntimeError(std::string) __attribute__((__noreturn__)); -void ThrowInvalidArgument(std::string) __attribute__((__noreturn__)); diff --git a/llama.cpp/server/server.cpp b/llama.cpp/server/server.cpp index 0e5cd788f2..a5fac3721e 100644 --- a/llama.cpp/server/server.cpp +++ b/llama.cpp/server/server.cpp @@ -14,17 +14,8 @@ #include "llamafile/version.h" #include "llamafile/log.h" -#define CPPHTTPLIB_NO_EXCEPTIONS 1 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 -#define JSON_THROW_USER(exception) \ - {std::clog << program_invocation_name \ - << ": error in " << __FILE__ << ":" << __LINE__ \ - << " (function " << __FUNCTION__ << ")" << std::endl \ - << (exception).what() << std::endl \ - << "Server terminated." << std::endl; \ - std::exit(1);} - #include "httplib.h" #include "json.h" @@ -36,6 +27,8 @@ #include #include #include +#include +#include #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -893,10 +886,8 @@ struct llama_server_context if (end_pos != std::string::npos) { std::string image_id = prompt.substr(pos, end_pos - pos); -#ifndef _LIBCPP_NO_EXCEPTIONS try { -#endif int img_id = std::stoi(image_id); bool found = false; for (slot_image &img : slot->images) @@ -913,13 +904,11 @@ struct llama_server_context slot->images.clear(); return false; } -#ifndef _LIBCPP_NO_EXCEPTIONS } catch (const std::invalid_argument& e) { LOG_TEE("Invalid image number id in prompt\n"); slot->images.clear(); return false; } -#endif } } slot->prompt = ""; @@ -3171,7 +3160,6 @@ int server_cli(int argc, char **argv) svr.set_logger(log_server_request); -#ifndef _LIBCPP_NO_EXCEPTIONS svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep) { const char fmt[] = "500 Internal Server Error\n%s"; @@ -3191,7 +3179,6 @@ int server_cli(int argc, char **argv) res.set_content(buf, "text/plain; charset=utf-8"); res.status = 500; }); -#endif svr.set_error_handler([](const httplib::Request &, httplib::Response &res) { diff --git a/llama.cpp/unicode.h b/llama.cpp/unicode.h index e4636726b2..aeca879ea6 100644 --- a/llama.cpp/unicode.h +++ b/llama.cpp/unicode.h @@ -1,7 +1,5 @@ #pragma once -#include "runtime.h" - #include #include #include @@ -245,7 +243,7 @@ static std::string codepoint_to_utf8(uint32_t cp) { result.push_back(0x80 | (cp & 0x3f)); } else { - ThrowInvalidArgument("invalid codepoint"); + throw std::invalid_argument("invalid codepoint"); } return result; } @@ -266,30 +264,30 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) { return result; } else if 
(!(utf8[offset + 0] & 0x40)) { - ThrowInvalidArgument("invalid character"); + throw std::invalid_argument("invalid character"); } else if (!(utf8[offset + 0] & 0x20)) { if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) - ThrowInvalidArgument("invalid character"); + throw std::invalid_argument("invalid character"); auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f); offset += 2; return result; } else if (!(utf8[offset + 0] & 0x10)) { if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) - ThrowInvalidArgument("invalid character"); + throw std::invalid_argument("invalid character"); auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f); offset += 3; return result; } else if (!(utf8[offset + 0] & 0x08)) { if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) - ThrowInvalidArgument("invalid character"); + throw std::invalid_argument("invalid character"); auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f); offset += 4; return result; } - ThrowInvalidArgument("invalid string"); + throw std::invalid_argument("invalid string"); } static std::vector codepoints_from_utf8(const std::string & utf8) { @@ -311,7 +309,7 @@ static std::vector codepoint_to_utf16(uint32_t cp) { result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff)); } else { - ThrowInvalidArgument("invalid codepoint"); + throw std::invalid_argument("invalid codepoint"); } return result; } @@ -334,12 +332,12 @@ static uint32_t codepoint_from_utf16(const std::vector & utf16, size_t } else { if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) - ThrowInvalidArgument("invalid character"); + throw std::invalid_argument("invalid character"); auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); offset += 2; return result; } - ThrowInvalidArgument("invalid string"); + throw std::invalid_argument("invalid string"); } static std::vector codepoints_from_utf16(const std::vector & utf16) { diff --git a/llamafile/cuda.c b/llamafile/cuda.c index e5aa9031f6..3ce825c62d 100644 --- a/llamafile/cuda.c +++ b/llamafile/cuda.c @@ -15,9 +15,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "x.h" -#include #include +#include #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include "llamafile/x.h" #include "llamafile/log.h" #include "llama.cpp/ggml-cuda.h" #include "llama.cpp/ggml-metal.h" @@ -485,10 +485,10 @@ static bool CompileAmdWindows(const char *clangxx, const char *dso, const char * "-mllvm", "-amdgpu-function-calls=false", "-mllvm", "-amdgpu-early-inline-all=true", FLAG_tinyblas ? 
"-DGGML_USE_TINYBLAS" : "-DIGNORE", - "-isystem", _gc(xasprintf("%s/include", hip_path)), - BLAS_ONLY("-l"), BLAS_ONLY(_gc(xasprintf("%s/lib/hipblas.%s", hip_path, lib))), - BLAS_ONLY("-l"), BLAS_ONLY(_gc(xasprintf("%s/lib/rocblas.%s", hip_path, lib))), - "-l", _gc(xasprintf("%s/lib/amdhip64.%s", hip_path, lib)), + "-isystem", gc(xasprintf("%s/include", hip_path)), + BLAS_ONLY("-l"), BLAS_ONLY(gc(xasprintf("%s/lib/hipblas.%s", hip_path, lib))), + BLAS_ONLY("-l"), BLAS_ONLY(gc(xasprintf("%s/lib/rocblas.%s", hip_path, lib))), + "-l", gc(xasprintf("%s/lib/amdhip64.%s", hip_path, lib)), "-lkernel32", NULL, }; diff --git a/llamafile/llamafile.c b/llamafile/llamafile.c index 0cccb9ff7c..883dca6c13 100644 --- a/llamafile/llamafile.c +++ b/llamafile/llamafile.c @@ -91,7 +91,7 @@ static struct llamafile *llamafile_open_zip(const char *prog, const char *fname, off = file->size - 65536; amt = file->size - off; } - if (!(bufdata = _gc(malloc(65536)))) { + if (!(bufdata = gc(malloc(65536)))) { goto Failure; } if (pread(fd, bufdata, amt, off) != amt) { @@ -134,7 +134,7 @@ static struct llamafile *llamafile_open_zip(const char *prog, const char *fname, // read the central directory cdirsize = amt; - if (!(cdirdata = _gc(malloc(cdirsize)))) { + if (!(cdirdata = gc(malloc(cdirsize)))) { goto Failure; } if (pread(fd, cdirdata, cdirsize, off) != (long)cdirsize) { @@ -168,7 +168,7 @@ static struct llamafile *llamafile_open_zip(const char *prog, const char *fname, !memcmp(fname, entry_name_bytes, fname_len)) : (entry_name_len > 5 && !memcasecmp(entry_name_bytes + entry_name_len - 5, ".gguf", 5)))) { - zip_name = _gc(strndup(entry_name_bytes, entry_name_len)); + zip_name = gc(strndup(entry_name_bytes, entry_name_len)); off = get_zip_cfile_offset(cdirdata + entry_offset); file->size = get_zip_cfile_compressed_size(cdirdata + entry_offset); cdir_offset = entry_offset; @@ -272,7 +272,7 @@ struct llamafile *llamafile_open_gguf(const char *fname, const char *mode) { // support filenames like `foo.zip@weights.gguf` const char *p; if ((p = strchr(fname, '@'))) { - return llamafile_open_zip(_gc(strndup(fname, p - fname)), p + 1, mode); + return llamafile_open_zip(gc(strndup(fname, p - fname)), p + 1, mode); } // open from file or from our own executable if it doesn't exist diff --git a/llamafile/rocm.sh b/llamafile/rocm.sh index aa4298112e..714d59f000 100755 --- a/llamafile/rocm.sh +++ b/llamafile/rocm.sh @@ -15,6 +15,7 @@ hipcc \ -use_fast_math \ -DGGML_BUILD=1 \ -DGGML_SHARED=1 \ + -Wno-return-type \ -Wno-unused-result \ -DGGML_CUDA_DMMV_X=32 \ -DGGML_CUDA_MMV_Y=1 \