From e0656ea190fa1687712c46641a721b02164e06d0 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@mozilla.com>
Date: Wed, 5 Jun 2024 04:31:45 -0700
Subject: [PATCH] Introduce new llamafile server
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

You can now build and run `o//llamafile/server/main` which launches an
HTTP server that currently supports a single endpoint at /tokenize. If
wrk sends it a request to tokenize a string that has 51 tokens then it
serves two million requests per second on my workstation, where 99 pct
latency is 179 µs. This server is designed to be crash proof, reliable
and preempting. Workers are able to be asynchronously canceled so the
supervisor thread can respawn them. Cosmo's new memory allocator helps
this server be high performance for llama.cpp's STL-heavy use case too.
---
 build/config.mk                         |   9 +-
 llamafile/BUILD.mk                      |  10 +-
 llamafile/debug.cpp                     |  24 +-
 llamafile/flags.cpp                     | 279 ++++++++++++-
 llamafile/gpu.c                         |   6 -
 llamafile/llamafile.h                   |  39 +-
 llamafile/log.c                         |   4 +-
 llamafile/server/.clang-format          |  13 +
 llamafile/server/BUILD.mk               |  34 ++
 llamafile/server/atob.cpp               |  32 ++
 llamafile/server/buffer.cpp             |  48 +++
 llamafile/server/buffer.h               |  30 ++
 llamafile/server/client.cpp             | 494 ++++++++++++++++++++++++
 llamafile/server/client.h               |  67 ++++
 llamafile/server/crash.cpp              | 128 ++++++
 llamafile/server/hexcpy.cpp             |  29 ++
 llamafile/server/hextoint.cpp           |  37 ++
 llamafile/server/json.cpp               | 172 +++++++++
 llamafile/server/json.h                 |  43 +++
 llamafile/server/json_test.cpp          | 128 ++++++
 llamafile/server/listen.cpp             | 114 ++++++
 llamafile/server/log.cpp                |  90 +++++
 llamafile/server/log.h                  |  36 ++
 llamafile/server/main.cpp               |  90 +++++
 llamafile/server/path.cpp               |  67 ++++
 llamafile/server/path.h                 |  28 ++
 llamafile/server/server.cpp             | 193 +++++++++
 llamafile/server/server.h               |  51 +++
 llamafile/server/signals.cpp            |  94 +++++
 llamafile/server/signals.h              |  28 ++
 llamafile/server/time.cpp               | 203 ++++++++++
 llamafile/{security.c => server/time.h} |  16 +-
 llamafile/server/utils.h                |  27 ++
 llamafile/server/worker.cpp             | 140 +++++++
 llamafile/server/worker.h               |  41 ++
 llamafile/sgemm.h                       |  20 +-
 llamafile/tinyblas_cpu_unsupported.cpp  |  10 +-
 llamafile/tokenize.cpp                  |  99 ++---
 llamafile/zipalign.c                    |  50 +--
 39 files changed, 2902 insertions(+), 121 deletions(-)
 create mode 100644 llamafile/server/.clang-format
 create mode 100644 llamafile/server/BUILD.mk
 create mode 100644 llamafile/server/atob.cpp
 create mode 100644 llamafile/server/buffer.cpp
 create mode 100644 llamafile/server/buffer.h
 create mode 100644 llamafile/server/client.cpp
 create mode 100644 llamafile/server/client.h
 create mode 100644 llamafile/server/crash.cpp
 create mode 100644 llamafile/server/hexcpy.cpp
 create mode 100644 llamafile/server/hextoint.cpp
 create mode 100644 llamafile/server/json.cpp
 create mode 100644 llamafile/server/json.h
 create mode 100644 llamafile/server/json_test.cpp
 create mode 100644 llamafile/server/listen.cpp
 create mode 100644 llamafile/server/log.cpp
 create mode 100644 llamafile/server/log.h
 create mode 100644 llamafile/server/main.cpp
 create mode 100644 llamafile/server/path.cpp
 create mode 100644 llamafile/server/path.h
 create mode 100644 llamafile/server/server.cpp
 create mode 100644 llamafile/server/server.h
 create mode 100644 llamafile/server/signals.cpp
 create mode 100644 llamafile/server/signals.h
 create mode 100644 llamafile/server/time.cpp
 rename llamafile/{security.c => server/time.h} (71%)
 create mode 100644 llamafile/server/utils.h
 create mode 100644 llamafile/server/worker.cpp
 create mode 100644 llamafile/server/worker.h

diff --git a/build/config.mk b/build/config.mk
index a7f255202c..0bf1af6b35 100644
--- a/build/config.mk
+++ b/build/config.mk
@@ -2,7 +2,7 @@
 #── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
 
 PREFIX = /usr/local
-COSMOCC = .cosmocc/3.3.10
+COSMOCC = .cosmocc/3.4.0
 TOOLCHAIN = $(COSMOCC)/bin/cosmo
 
 AR = $(TOOLCHAIN)ar
@@ -13,7 +13,8 @@ MKDEPS = $(COSMOCC)/bin/mkdeps
 INSTALL = install
 
 ARFLAGS = rcsD
-CCFLAGS = -g -O3 -fexceptions -fsignaling-nans
+CXXFLAGS = -frtti -std=gnu++23
+CCFLAGS = -g -ggdb -O3 -fexceptions -fsignaling-nans -ffunction-sections -fdata-sections
 CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM -Wno-attributes -DLLAMAFILE_DEBUG
 TARGET_ARCH = -Xx86_64-mavx -Xx86_64-mtune=znver4
 
@@ -51,5 +52,5 @@ clean:; rm -rf o
 .PHONY: distclean
 distclean:; rm -rf o .cosmocc
 
-.cosmocc/3.3.10:
-	build/download-cosmocc.sh $@ 3.3.10 00d61c1215667314f66e288c8285bae38cc6137fca083e5bba6c74e3a52439de
+.cosmocc/3.4.0:
+	build/download-cosmocc.sh $@ 3.4.0 475e24b84a18973312433f5280e267acbe1b4dac1b2e2ebb3cfce46051a8c08c
diff --git a/llamafile/BUILD.mk b/llamafile/BUILD.mk
index 30f4eab040..2be033db1b 100644
--- a/llamafile/BUILD.mk
+++ b/llamafile/BUILD.mk
@@ -12,11 +12,17 @@ LLAMAFILE_SRCS_CPP = $(filter %.cpp,$(LLAMAFILE_FILES))
 LLAMAFILE_SRCS = $(LLAMAFILE_SRCS_C) $(LLAMAFILE_SRCS_CPP) $(LLAMAFILE_SRCS_CU)
 LLAMAFILE_DOCS = $(filter %.1,$(LLAMAFILE_FILES))
 
-LLAMAFILE_OBJS =					\
+LLAMAFILE_OBJS :=					\
 	$(LLAMAFILE_SRCS_C:%.c=o/$(MODE)/%.o)		\
 	$(LLAMAFILE_SRCS_CPP:%.cpp=o/$(MODE)/%.o)	\
 	$(LLAMAFILE_FILES:%=o/$(MODE)/%.zip.o)
 
+# this executable defines its own malloc(), free(), etc.
+# therefore we want to avoid it going inside the .a file
+LLAMAFILE_OBJS := $(filter-out o/$(MODE)/llamafile/zipalign.o,$(LLAMAFILE_OBJS))
+
+include llamafile/server/BUILD.mk
+
 o/$(MODE)/llamafile/zipalign:				\
 		o/$(MODE)/llamafile/zipalign.o		\
 		o/$(MODE)/llamafile/help.o		\
@@ -29,7 +35,6 @@ o/$(MODE)/llamafile/zipcheck:				\
 
 o/$(MODE)/llamafile/simple:				\
 		o/$(MODE)/llamafile/simple.o		\
-		o/$(MODE)/llama.cpp/llava/llava.a	\
 		o/$(MODE)/llama.cpp/llama.cpp.a
 
 o/$(MODE)/llamafile/tokenize:				\
@@ -39,6 +44,7 @@ o/$(MODE)/llamafile/tokenize:				\
 .PHONY: o/$(MODE)/llamafile
 o/$(MODE)/llamafile:					\
 		$(LLAMAFILE_OBJS)			\
+		o/$(MODE)/llamafile/server		\
 		o/$(MODE)/llamafile/simple		\
 		o/$(MODE)/llamafile/zipalign		\
 		o/$(MODE)/llamafile/zipcheck		\
diff --git a/llamafile/debug.cpp b/llamafile/debug.cpp
index d44177f462..eb3acd698f 100644
--- a/llamafile/debug.cpp
+++ b/llamafile/debug.cpp
@@ -18,12 +18,12 @@
 #include "debug.h"
 #include "log.h"
 
+#include <atomic>
 #include <cosmo.h>
 #include <fenv.h>
 #include <libc/calls/struct/aarch64.internal.h>
 #include <libc/calls/struct/ucontext.internal.h>
 #include <signal.h>
-#include <stdatomic.h>
 #include <termios.h>
 #include <ucontext.h>
 #include <unistd.h>
@@ -36,7 +36,7 @@
 #define UNDERFLOW_DELAY 2
 
 bool FLAG_trap;
-static atomic_llong g_underflowed;
+static std::atomic_llong g_underflowed;
 static thread_local int g_enabled;
 thread_local int llamafile_debug_op_index;
 const struct ggml_cgraph *llamafile_debug_graph;
@@ -59,17 +59,17 @@ static long long millis(void) {
     return timespec_tomillis(timespec_real());
 }
 
-static inline void spinlock(atomic_uint *lock) {
+static inline void spinlock(std::atomic_uint *lock) {
     int x;
     for (;;) {
-        x = atomic_exchange_explicit(lock, 1, memory_order_acquire);
+        x = lock->exchange(1, std::memory_order_acquire);
         if (!x)
             break;
     }
 }
 
-static inline void spunlock(atomic_uint *lock) {
-    atomic_store_explicit(lock, 0, memory_order_release);
+static inline void spunlock(std::atomic_uint *lock) {
+    lock->store(0, std::memory_order_release);
 }
 
 static const char *describe_vertex(struct ggml_tensor *t) {
@@ -130,8 +130,7 @@ static void on_sigfpe(int sig, siginfo_t *si, void *arg) {
     if (reason == FPE_FLTUND) {
         if (g_terminal_buddy.is_terminal) {
             long long now = millis();
-            if ((now - atomic_exchange_explicit(&g_underflowed, now, memory_order_relaxed)) >
-                UNDERFLOW_DELAY) {
+            if ((now - g_underflowed.exchange(now, std::memory_order_relaxed)) > UNDERFLOW_DELAY) {
                 write(2, UNDERFLOW_ALARM, strlen(UNDERFLOW_ALARM));
             }
         }
@@ -139,7 +138,7 @@ static void on_sigfpe(int sig, siginfo_t *si, void *arg) {
         return;
     }
 
-    static atomic_uint lock;
+    static std::atomic_uint lock;
     spinlock(&lock);
 
     const char *issue;
@@ -205,7 +204,7 @@ static void setup_sigfpe(void) {
 }
 
 int llamafile_trapping_enabled(int delta) {
-    static atomic_uint once;
+    static _Atomic(uint32_t) once;
     bool was_enabled = g_enabled > 0;
     bool is_enabled = (g_enabled += delta) > 0;
     feclearexcept(FE_ALL_EXCEPT);
@@ -225,11 +224,10 @@ void llamafile_trapping_restore(void) {
         feenableexcept(TRAPS);
         long long last;
         if (g_terminal_buddy.is_terminal &&
-            (last = atomic_load_explicit(&g_underflowed, memory_order_relaxed))) {
+            (last = g_underflowed.load(std::memory_order_relaxed))) {
             long long now = millis();
             if (now - last > UNDERFLOW_DELAY &&
-                now - atomic_exchange_explicit(&g_underflowed, 0, memory_order_relaxed) >
-                    UNDERFLOW_DELAY) {
+                now - g_underflowed.exchange(0, std::memory_order_relaxed) > UNDERFLOW_DELAY) {
                 write(2, UNDERFLOW_RESET, strlen(UNDERFLOW_RESET));
             }
         }
diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp
index 975c026d06..c36f208a73 100644
--- a/llamafile/flags.cpp
+++ b/llamafile/flags.cpp
@@ -15,6 +15,283 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "debug.h"
 #include "llamafile.h"
 
-bool FLAG_precise;
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "llama.cpp/llama.h"
+
+bool FLAG_log_disable = false;
+bool FLAG_mlock = false;
+bool FLAG_mmap = true;
+bool FLAG_nocompile = false;
+bool FLAG_precise = false;
+bool FLAG_recompile = false;
+bool FLAG_tinyblas = false;
+bool FLAG_unsecure = false;
+const char *FLAG_file = nullptr;
+const char *FLAG_listen = "0.0.0.0:8080";
+const char *FLAG_model = nullptr;
+const char *FLAG_prompt = nullptr;
+float FLAG_temp = 0.8;
+int FLAG_batch = 2048;
+int FLAG_ctx = 512;
+int FLAG_flash_attn = false;
+int FLAG_gpu = 0;
+int FLAG_keepalive = 5;
+int FLAG_main_gpu = 0;
+int FLAG_n_gpu_layers = -1;
+int FLAG_seed = LLAMA_DEFAULT_SEED;
+int FLAG_split_mode = LLAMA_SPLIT_MODE_LAYER;
+int FLAG_threads = 64;
+int FLAG_ubatch = 512;
+int FLAG_verbose = 0;
+int FLAG_workers = 0;
+
+int cpu_get_num_math();
+
+static wontreturn void usage(int rc, int fd) { // prints the synopsis to fd, then exits with rc
+    tinyprint(fd, "usage: ", program_invocation_name, " -m MODEL -l [HOST:]PORT\n", NULL);
+    exit(rc);
+}
+
+static wontreturn void error(const char *message) { // prints "<prog>: <message>" to fd 2, exits 1
+    tinyprint(2, program_invocation_name, ": ", message, "\n", NULL);
+    exit(1);
+}
+
+static wontreturn void bad(const char *flag) { // flag was given an unacceptable value; exits 1
+    tinyprint(2, program_invocation_name, ": bad value for ", flag, "\n", NULL);
+    exit(1);
+}
+
+static wontreturn void missing(const char *flag) { // flag needs a value but none followed; exits 1
+    tinyprint(2, program_invocation_name, ": ", flag, " missing argument\n", NULL);
+    exit(1);
+}
+
+static wontreturn void required(const char *flag) { // mandatory flag was never supplied; exits 1
+    tinyprint(2, program_invocation_name, ": ", flag, " is required\n", NULL);
+    exit(1);
+}
+
+static wontreturn void unknown(const char *flag) { // flag is not one this parser recognizes; exits 1
+    tinyprint(2, program_invocation_name, ": ", flag, " unknown argument\n", NULL);
+    exit(1);
+}
+
+// Parses command line arguments into the global FLAG_* variables. Prints a
+// message and exits with status 1 on a bad flag or when --model is absent.
+void llamafile_get_flags(int argc, char **argv) {
+    FLAG_threads = cpu_get_num_math(); // default, unless -t overrides it
+    for (int i = 1; i < argc;) {
+        const char *flag = argv[i++];
+
+        if (*flag != '-')
+            break; // first argument not starting with '-' ends flag parsing
+
+        //////////////////////////////////////////////////////////////////////
+        // logging flags
+
+        if (!strcmp(flag, "--log-disable")) {
+            FLAG_log_disable = true;
+            continue;
+        }
+
+        if (!strcmp(flag, "-v") || !strcmp(flag, "--verbose")) {
+            FLAG_verbose++; // may be repeated; each occurrence adds one
+            continue;
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // server flags
+
+        if (!strcmp(flag, "-l") || !strcmp(flag, "--listen")) {
+            if (i == argc)
+                missing("--listen");
+            FLAG_listen = argv[i++];
+            continue;
+        }
+
+        if (!strcmp(flag, "-k") || !strcmp(flag, "--keepalive")) {
+            if (i == argc)
+                missing("--keepalive");
+            FLAG_keepalive = atoi(argv[i++]);
+            continue;
+        }
+
+        if (!strcmp(flag, "-w") || !strcmp(flag, "--workers")) {
+            if (i == argc)
+                missing("--workers");
+            FLAG_workers = atoi(argv[i++]);
+            continue;
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // model flags
+
+        if (!strcmp(flag, "-m") || !strcmp(flag, "--model")) {
+            if (i == argc)
+                missing("--model");
+            FLAG_model = argv[i++];
+            continue;
+        }
+
+        if (!strcmp(flag, "-f") || !strcmp(flag, "--file")) {
+            if (i == argc)
+                missing("--file");
+            FLAG_file = argv[i++];
+            continue;
+        }
+
+        if (!strcmp(flag, "--seed")) {
+            if (i == argc)
+                missing("--seed");
+            FLAG_seed = atoi(argv[i++]);
+            continue;
+        }
+
+        if (!strcmp(flag, "--temp")) {
+            if (i == argc)
+                missing("--temp");
+            FLAG_temp = atof(argv[i++]);
+            continue;
+        }
+
+        if (!strcmp(flag, "-t") || !strcmp(flag, "--threads")) {
+            if (i == argc)
+                missing("--threads");
+            FLAG_threads = atoi(argv[i++]);
+            continue;
+        }
+
+        if (!strcmp(flag, "-b") || !strcmp(flag, "--batch-size")) {
+            if (i == argc)
+                missing("--batch-size");
+            FLAG_batch = atoi(argv[i++]);
+            continue;
+        }
+
+        if (!strcmp(flag, "-ub") || !strcmp(flag, "--ubatch-size")) {
+            if (i == argc)
+                missing("--ubatch-size");
+            FLAG_ubatch = atoi(argv[i++]);
+            continue;
+        }
+
+        if (!strcmp(flag, "-fa") || !strcmp(flag, "--flash-attn")) {
+            FLAG_flash_attn = true; // boolean switch, takes no argument
+            continue;
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // cpu flags
+
+        if (!strcmp(flag, "--fast")) {
+            FLAG_precise = false;
+            continue;
+        }
+
+        if (!strcmp(flag, "--precise")) {
+            FLAG_precise = true;
+            continue;
+        }
+
+        if (!strcmp(flag, "--trap")) {
+            FLAG_trap = true;
+            FLAG_unsecure = true; // --trap also switches on FLAG_unsecure
+            llamafile_trapping_enabled(+1);
+            continue;
+        }
+
+        if (!strcmp(flag, "--mlock")) {
+            FLAG_mlock = true;
+            continue;
+        }
+
+        if (!strcmp(flag, "--no-mmap")) {
+            FLAG_mmap = false;
+            continue;
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // gpu flags
+
+        if (!strcmp(flag, "--tinyblas")) {
+            FLAG_tinyblas = true;
+            continue;
+        }
+
+        if (!strcmp(flag, "--nocompile")) {
+            FLAG_nocompile = true;
+            continue;
+        }
+
+        if (!strcmp(flag, "--recompile")) {
+            FLAG_recompile = true;
+            continue;
+        }
+
+        if (!strcmp(flag, "--gpu")) {
+            if (i == argc)
+                missing("--gpu");
+            FLAG_gpu = llamafile_gpu_parse(argv[i++]);
+            if (FLAG_gpu == LLAMAFILE_GPU_ERROR)
+                bad("--gpu");
+            continue;
+        }
+
+        if (!strcmp(flag, "-ngl") || //
+            !strcmp(flag, "--gpu-layers") || //
+            !strcmp(flag, "--n-gpu-layers")) {
+            if (i == argc)
+                missing("--n-gpu-layers");
+            FLAG_n_gpu_layers = atoi(argv[i++]);
+            if (FLAG_n_gpu_layers <= 0) // no layers offloaded: turn GPU off
+                FLAG_gpu = LLAMAFILE_GPU_DISABLE;
+            continue;
+        }
+
+        if (!strcmp(flag, "-mg") || !strcmp(flag, "--main-gpu")) {
+            if (i == argc)
+                missing("--main-gpu");
+            FLAG_main_gpu = atoi(argv[i++]);
+            continue;
+        }
+
+        if (!strcmp(flag, "-sm") || !strcmp(flag, "--split-mode")) {
+            if (i == argc)
+                missing("--split-mode");
+            const char *value = argv[i++]; // consume it so it isn't reparsed as a flag
+            if (!strcmp(value, "none"))
+                FLAG_split_mode = LLAMA_SPLIT_MODE_NONE;
+            else if (!strcmp(value, "layer"))
+                FLAG_split_mode = LLAMA_SPLIT_MODE_LAYER;
+            else if (!strcmp(value, "row"))
+                FLAG_split_mode = LLAMA_SPLIT_MODE_ROW;
+            else
+                bad("--split-mode");
+            continue;
+        }
+
+        //////////////////////////////////////////////////////////////////////
+        // security flags
+
+        if (!strcmp(flag, "--unsecure")) {
+            FLAG_unsecure = true;
+            continue;
+        }
+
+        unknown(flag);
+    }
+
+    if (!FLAG_model)
+        required("--model");
+
+    FLAG_n_gpu_layers = llamafile_gpu_layers(FLAG_n_gpu_layers);
+}
diff --git a/llamafile/gpu.c b/llamafile/gpu.c
index dd2c7d6721..757805f89a 100644
--- a/llamafile/gpu.c
+++ b/llamafile/gpu.c
@@ -24,12 +24,6 @@
 #include <string.h>
 #include <unistd.h>
 
-int FLAG_gpu;
-bool FLAG_nogpu;
-bool FLAG_tinyblas;
-bool FLAG_nocompile;
-bool FLAG_recompile;
-
 const char *llamafile_describe_gpu(void) {
     switch (FLAG_gpu) {
     case LLAMAFILE_GPU_AUTO:
diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h
index 3b2b7047a3..5b261b2033 100644
--- a/llamafile/llamafile.h
+++ b/llamafile/llamafile.h
@@ -6,6 +6,35 @@
 extern "C" {
 #endif
 
+extern bool FLAG_log_disable;
+extern bool FLAG_mlock;
+extern bool FLAG_mmap;
+extern bool FLAG_nocompile;
+extern bool FLAG_precise;
+extern bool FLAG_recompile;
+extern bool FLAG_tinyblas;
+extern bool FLAG_trap;
+extern bool FLAG_unsecure;
+extern const char *FLAG_file;
+extern const char *FLAG_listen;
+extern const char *FLAG_model;
+extern const char *FLAG_prompt;
+extern float FLAG_temp;
+extern int FLAG_batch;
+extern int FLAG_ctx;
+extern int FLAG_flash_attn;
+extern int FLAG_gpu;
+extern int FLAG_gpu;
+extern int FLAG_keepalive;
+extern int FLAG_main_gpu;
+extern int FLAG_n_gpu_layers;
+extern int FLAG_seed;
+extern int FLAG_split_mode;
+extern int FLAG_threads;
+extern int FLAG_ubatch;
+extern int FLAG_verbose;
+extern int FLAG_workers;
+
 struct llamafile;
 struct llamafile *llamafile_open_gguf(const char *, const char *);
 void llamafile_close(struct llamafile *);
@@ -18,6 +47,7 @@ size_t llamafile_size(struct llamafile *);
 FILE *llamafile_fp(struct llamafile *);
 void llamafile_ref(struct llamafile *);
 void llamafile_unref(struct llamafile *);
+char *llamafile_get_prompt(void);
 
 void llamafile_govern(void);
 void llamafile_check_cpu(void);
@@ -30,10 +60,7 @@ int llamafile_is_file_newer_than(const char *, const char *);
 void llamafile_schlep(const void *, size_t);
 void llamafile_get_app_dir(char *, size_t);
 void llamafile_launch_browser(const char *);
-
-extern bool FLAG_trap;
-extern bool FLAG_precise;
-extern bool FLAG_unsecure;
+void llamafile_get_flags(int, char **);
 
 #define LLAMAFILE_GPU_ERROR -2
 #define LLAMAFILE_GPU_DISABLE -1
@@ -41,10 +68,6 @@ extern bool FLAG_unsecure;
 #define LLAMAFILE_GPU_AMD 1
 #define LLAMAFILE_GPU_APPLE 2
 #define LLAMAFILE_GPU_NVIDIA 4
-extern int FLAG_gpu;
-extern bool FLAG_tinyblas;
-extern bool FLAG_nocompile;
-extern bool FLAG_recompile;
 bool llamafile_has_gpu(void);
 int llamafile_gpu_layers(int);
 bool llamafile_has_cuda(void);
diff --git a/llamafile/log.c b/llamafile/log.c
index b32f04319a..47301eb5bc 100644
--- a/llamafile/log.c
+++ b/llamafile/log.c
@@ -16,12 +16,12 @@
 // limitations under the License.
 
 #include "log.h"
+#include "llamafile.h"
+
 #include <pthread.h>
 #include <string.h>
 #include <unistd.h>
 
-bool FLAG_log_disable;
-
 void(tinylog)(const char *s, ...) {
     size_t n;
     int c, cs;
diff --git a/llamafile/server/.clang-format b/llamafile/server/.clang-format
new file mode 100644
index 0000000000..5c2c5d2398
--- /dev/null
+++ b/llamafile/server/.clang-format
@@ -0,0 +1,13 @@
+---
+BasedOnStyle: Mozilla
+IndentWidth: 4
+ColumnLimit: 80
+---
+Language: Cpp
+AllowShortFunctionsOnASingleLine: false
+AlignTrailingComments: false
+AlignEscapedNewlines: DontAlign
+AlwaysBreakTemplateDeclarations: true
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+FixNamespaceComments: true
+---
diff --git a/llamafile/server/BUILD.mk b/llamafile/server/BUILD.mk
new file mode 100644
index 0000000000..10cab9568d
--- /dev/null
+++ b/llamafile/server/BUILD.mk
@@ -0,0 +1,34 @@
+#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
+#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
+
+PKGS += LLAMAFILE_SERVER
+
+LLAMAFILE_SERVER_FILES := $(wildcard llamafile/server/*)
+LLAMAFILE_SERVER_HDRS = $(filter %.h,$(LLAMAFILE_SERVER_FILES))
+LLAMAFILE_SERVER_SRCS = $(filter %.cpp,$(LLAMAFILE_SERVER_FILES))
+LLAMAFILE_SERVER_OBJS = $(LLAMAFILE_SERVER_SRCS:%.cpp=o/$(MODE)/%.o)
+
+o/$(MODE)/llamafile/server/server.a:				\
+		$(filter-out %_test.o,$(LLAMAFILE_SERVER_OBJS))
+
+o/$(MODE)/llamafile/server/main:				\
+		o/$(MODE)/llamafile/server/main.o		\
+		o/$(MODE)/llamafile/server/server.a		\
+		o/$(MODE)/llama.cpp/llama.cpp.a			\
+		o/$(MODE)/llama.cpp/llava/llava.a		\
+		o/$(MODE)/double-conversion/double-conversion.a	\
+		o/$(MODE)/stb/stb.a				\
+		o/$(MODE)/llamafile/server/malloc.o		\
+
+$(LLAMAFILE_SERVER_OBJS): llamafile/server/BUILD.mk
+$(LLAMAFILE_SERVER_OBJS): private CCFLAGS += -O
+
+o/$(MODE)/llamafile/server/json_test:				\
+		o/$(MODE)/llamafile/server/json_test.o		\
+		o/$(MODE)/llamafile/server/json.o		\
+		o/$(MODE)/double-conversion/double-conversion.a	\
+
+.PHONY: o/$(MODE)/llamafile/server
+o/$(MODE)/llamafile/server:					\
+		o/$(MODE)/llamafile/server/main			\
+		o/$(MODE)/llamafile/server/json_test.runs	\
diff --git a/llamafile/server/atob.cpp b/llamafile/server/atob.cpp
new file mode 100644
index 0000000000..ee0dd3ee46
--- /dev/null
+++ b/llamafile/server/atob.cpp
@@ -0,0 +1,32 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "utils.h"
+
+// Maps the strings "0"/"false" to false and "1"/"true" to true (compared
+// with ==); every other value yields the caller-supplied fallback dflt.
+bool
+atob(ctl::string_view val, bool dflt)
+{
+    const bool is_false = val == "0" || val == "false";
+    const bool is_true = val == "1" || val == "true";
+    if (is_false)
+        return false;
+    if (is_true)
+        return true;
+    return dflt;
+}
diff --git a/llamafile/server/buffer.cpp b/llamafile/server/buffer.cpp
new file mode 100644
index 0000000000..65bfc34940
--- /dev/null
+++ b/llamafile/server/buffer.cpp
@@ -0,0 +1,48 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "buffer.h"
+
+#include <sys/auxv.h>
+#include <sys/mman.h>
+
+static int pagesz = getauxval(AT_PAGESZ);
+
+Buffer::Buffer(size_t capacity) noexcept // mmap()s capacity bytes or traps
+  : i(0)
+  , n(0)
+  , c(capacity - pagesz) // usable size: everything except the final page
+  , p((char*)mmap(nullptr,
+                  capacity,
+                  PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS,
+                  -1,
+                  0))
+{
+    if (p == MAP_FAILED) // mapping could not be created
+        __builtin_trap();
+    if (c & (pagesz - 1)) // usable size must be a multiple of the page size
+        __builtin_trap();
+    if (mprotect(p + c, pagesz, PROT_NONE)) // make the final page inaccessible
+        __builtin_trap();
+}
+
+Buffer::~Buffer() noexcept
+{
+    if (munmap(p, c + pagesz)) // c + pagesz equals the length given to mmap
+        __builtin_trap();
+}
diff --git a/llamafile/server/buffer.h b/llamafile/server/buffer.h
new file mode 100644
index 0000000000..757a5d1541
--- /dev/null
+++ b/llamafile/server/buffer.h
@@ -0,0 +1,30 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <stddef.h>
+
+struct Buffer
+{
+    size_t i; // starts at 0; NOTE(review): presumably a cursor - confirm
+    size_t n; // starts at 0; NOTE(review): presumably bytes in use - confirm
+    const size_t c; // constructor argument minus one page (see buffer.cpp)
+    char* const p; // base address returned by mmap() in the constructor
+
+    explicit Buffer(size_t) noexcept;
+    ~Buffer() noexcept;
+};
diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp
new file mode 100644
index 0000000000..b08b9f96a5
--- /dev/null
+++ b/llamafile/server/client.cpp
@@ -0,0 +1,494 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "client.h"
+
+#include <ctl/optional.h>
+#include <ctl/string.h>
+#include <errno.h>
+#include <libc/fmt/itoa.h>
+#include <libc/str/slice.h>
+#include <limits.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/uio.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "json.h"
+#include "llama.cpp/llama.h"
+#include "llamafile/llamafile.h"
+#include "llamafile/version.h"
+#include "log.h"
+#include "time.h"
+#include "utils.h"
+
+#define STANDARD_RESPONSE_HEADERS \
+    "Server: llamafile/" LLAMAFILE_VERSION_STRING "\r\n" \
+    "Referrer-Policy: origin\r\n" \
+    "Cache-Control: private, max-age=0\r\n" // directives are comma-separated
+
+#define HasHeader(H) (!!msg.headers[H].a) // nonzero start offset means present
+#define HeaderData(H) (ibuf.p + msg.headers[H].a) // header value inside ibuf
+#define HeaderLength(H) (msg.headers[H].b - msg.headers[H].a) // value size
+#define HeaderEqual(H, S) \
+    SlicesEqual(S, strlen(S), HeaderData(H), HeaderLength(H))
+#define HeaderEqualCase(H, S) \
+    SlicesEqualCase(S, strlen(S), HeaderData(H), HeaderLength(H))
+
+using namespace ctl;
+
+Client::Client() : ibuf(8 * 1024 * 1024), obuf(64 * 1024 * 1024)
+{ // ibuf maps 8 MiB and obuf maps 64 MiB; Buffer reserves one page of each
+    InitHttpMessage(&msg, 0);
+    url.params.p = nullptr; // clear() passes this pointer to free()
+}
+
+int
+Client::close() // releases request state, then closes the socket if open
+{
+    int rc = 0;
+    clear();
+    DestroyHttpMessage(&msg);
+    if (fd != -1) {
+        if (FLAG_verbose >= 2)
+            LOG("close");
+        rc = ::close(fd);
+        fd = -1; // -1 means no descriptor is held
+    }
+    return rc; // result of ::close(), or 0 when no descriptor was held
+}
+
+void
+Client::clear() // frees per-request allocations and resets request state
+{
+    free(url_memory); // assigned from ParseUrl() in transport()
+    url_memory = nullptr;
+    free(params_memory); // assigned from ParseParams() in read_payload()
+    params_memory = nullptr;
+    free(url.params.p);
+    url.params.p = nullptr;
+    close_connection = false;
+    payload = "";
+    unread = 0;
+}
+
+void
+Client::run() // serves requests on fd until an error or a requested close
+{
+    ibuf.n = 0;
+    for (;;) {
+
+        // reset per-request state, then read and parse the request head
+        clear();
+        if (!read_request())
+            break;
+
+        // validate the message and send a response
+        if (!transport())
+            break;
+
+        // consume whatever payload the handler left unread
+        if (close_connection)
+            break;
+        if (!read_payload())
+            break;
+
+        // move bytes received past this message to the start of ibuf
+        if (ibuf.n == ibuf.i) {
+            ibuf.n = 0;
+        } else {
+            memmove(ibuf.p, ibuf.p + ibuf.i, ibuf.n - ibuf.i);
+            ibuf.n -= ibuf.i;
+        }
+    }
+}
+
+bool
+Client::read_request() // reads from fd until ibuf parses as a request head
+{
+    int inmsglen;
+    ResetHttpMessage(&msg, kHttpRequest);
+    for (;;) {
+        inmsglen = ParseHttpMessage(&msg, ibuf.p, ibuf.n, ibuf.c);
+        if (inmsglen > 0) { // parsed; inmsglen becomes the offset after it
+            message_started = timespec_real();
+            ibuf.i = inmsglen;
+            return true;
+        }
+        if (inmsglen == -1) {
+            LOG("bad message %m");
+            return false;
+        }
+        if (ibuf.n) // a previous read did not yield a complete head
+            LOG("fragmented message with %zu bytes", ibuf.n);
+        ssize_t got;
+        got = read(fd, ibuf.p + ibuf.n, ibuf.c - ibuf.n);
+        if (!got && ibuf.n)
+            LOG("unexpected eof after %zu bytes", ibuf.n);
+        if (got == -1 && (ibuf.n || (errno != EAGAIN && errno != ECONNRESET)))
+            LOG("read failed %m");
+        if (got <= 0) // eof and errors both end the connection
+            return false;
+        ibuf.n += got;
+    }
+}
+
+bool
+Client::transport() // validates the parsed request, then calls dispatch()
+{
+    if (msg.version > 11) { // versions above 11 get 505
+        close_connection = true;
+        return send_error(505);
+    }
+
+    if (msg.method == kHttpConnect) { // CONNECT gets 501
+        close_connection = true;
+        return send_error(501);
+    }
+
+    if (!has_at_most_this_element(kHttpExpect, "100-continue")) {
+        close_connection = true;
+        return send_error(417);
+    }
+
+    if (HasHeader(kHttpTransferEncoding)) // only "identity" is accepted
+        if (!HeaderEqualCase(kHttpTransferEncoding, "identity")) {
+            close_connection = true;
+            return send_error(501, "Transfer-Encoding Not Implemented");
+        }
+
+    if (HasHeader(kHttpContentLength)) {
+        long cl;
+        cl = ParseContentLength(HeaderData(kHttpContentLength),
+                                HeaderLength(kHttpContentLength));
+        if (cl == -1) {
+            close_connection = true;
+            return send_error(400, "Bad Content-Length");
+        }
+        if (cl > ibuf.c - ibuf.i) { // payload would not fit after the head
+            close_connection = true;
+            return send_error(413);
+        }
+        unread = cl; // read_payload() consumes this many bytes
+    } else if (msg.method == kHttpPost || msg.method == kHttpPut) {
+        close_connection = true;
+        return send_error(411);
+    }
+
+    if (FLAG_verbose >= 1) // logs the request uri under the label "get"
+        LOG("get %#.*s", msg.uri.b - msg.uri.a, ibuf.p + msg.uri.a);
+
+    if (msg.version >= 11)
+        if (HeaderEqualCase(kHttpExpect, "100-continue"))
+            if (!send("HTTP/1.1 100 Continue\r\n\r\n"))
+                return false;
+
+    url_memory = ParseUrl(
+      ibuf.p + msg.uri.a, msg.uri.b - msg.uri.a, &url, kUrlPlus | kUrlLatin1);
+    if (!url_memory) // a null result from ParseUrl() is treated as fatal
+        __builtin_trap();
+
+    return dispatch();
+}
+
+bool
+Client::send_error(int code, const char* reason) // logs and sends an error
+{
+    if (!reason) // fall back to the reason phrase for this status code
+        reason = GetHttpReason(code);
+    LOG("error %d %s", code, reason);
+    char* p = start_response(obuf.p, code, reason);
+    return send_response(obuf.p, p, string(reason) + "\r\n"); // body = reason
+}
+
+char*
+Client::start_response(char* p, int code, const char* reason)
+{ // writes the status line and standard headers at p; returns the new end
+    *p++ = 'H';
+    *p++ = 'T';
+    *p++ = 'T';
+    *p++ = 'P';
+    *p++ = '/';
+    *p++ = '1';
+    *p++ = '.';
+    *p++ = '0' + (msg.version & 1); // low bit of version picks '0' or '1'
+    *p++ = ' ';
+    *p++ = '0' + code / 100; // status code as three decimal digits
+    *p++ = '0' + code / 10 % 10;
+    *p++ = '0' + code % 10;
+    *p++ = ' ';
+    if (!reason)
+        reason = GetHttpReason(code);
+    p = stpcpy(p, reason);
+    *p++ = '\r';
+    *p++ = '\n';
+    p = stpcpy(p, STANDARD_RESPONSE_HEADERS);
+    return p;
+}
+
+bool
+Client::send_response(char* p0, char* p, string_view content)
+{ // finishes the headers spanning [p0,p), then sends headers plus content
+    // append date header, using the time the request was parsed
+    tm tm;
+    p = stpcpy(p, "Date: ");
+    gmtime_lockless(message_started.tv_sec, &tm);
+    p = FormatHttpDateTime(p, &tm);
+    *p++ = '\r';
+    *p++ = '\n';
+
+    // inform client of close() intent
+    if (msg.version < 11) // versions below 11 always close
+        close_connection = true;
+    if (HeaderEqualCase(kHttpConnection, "close"))
+        close_connection = true;
+    if (close_connection)
+        p = stpcpy(p, "Connection: close\r\n");
+
+    // append content length
+    p = stpcpy(p, "Content-Length: ");
+    p = FormatInt64(p, content.size());
+    *p++ = '\r';
+    *p++ = '\n';
+
+    // blank line ends the header section
+    *p++ = '\r';
+    *p++ = '\n';
+
+    return send2(string_view(p0, p - p0), content);
+}
+
+bool
+Client::send(const string_view s)
+{
+    // anything short of a complete write counts as failure
+    ssize_t wrote = write(fd, s.data(), s.size());
+    if (wrote == s.size())
+        return true;
+    if (wrote == -1 && errno != EAGAIN && errno != ECONNRESET)
+        LOG("write failed %m");
+    return false;
+}
+
+bool
+Client::send2(const string_view s1, const string_view s2)
+{
+    // both pieces go out through one writev(); a short write is a failure
+    iovec parts[2];
+    parts[0].iov_base = (void*)s1.data();
+    parts[0].iov_len = s1.size();
+    parts[1].iov_base = (void*)s2.data();
+    parts[1].iov_len = s2.size();
+    ssize_t wrote = writev(fd, parts, 2);
+    if (wrote == s1.size() + s2.size())
+        return true;
+    if (wrote == -1 && errno != EAGAIN && errno != ECONNRESET)
+        LOG("writev failed %m");
+    return false;
+}
+
+bool
+Client::has_at_most_this_element(int h, const string_view s)
+{ // true when header h is absent or every occurrence equals s, ignoring case
+    if (!HasHeader(h))
+        return true;
+    if (!SlicesEqualCase(s.data(), s.size(), HeaderData(h), HeaderLength(h)))
+        return false;
+    struct HttpHeader* x;
+    for (unsigned i = 0; i < msg.xheaders.n; ++i) { // scan the xheaders list
+        x = msg.xheaders.p + i;
+        if (GetHttpHeader(ibuf.p + x->k.a, x->k.b - x->k.a) == h &&
+            !SlicesEqualCase(
+              ibuf.p + x->v.a, x->v.b - x->v.a, s.data(), s.size())) {
+            return false; // same header name carrying a different value
+        }
+    }
+    return true;
+}
+
+bool
+Client::read_payload() // reads the `unread` payload bytes and exposes them
+{
+    while (ibuf.n - ibuf.i < unread) { // until the whole payload is buffered
+        ssize_t got;
+        if ((got = read(fd, ibuf.p + ibuf.n, ibuf.c - ibuf.n)) <= 0) {
+            if (!got)
+                LOG("unexpected eof");
+            if (got == -1)
+                LOG("read failed %m");
+            return false;
+        }
+        ibuf.n += got;
+    }
+    payload = string_view(ibuf.p + ibuf.i, unread); // view into ibuf
+    ibuf.i += unread;
+    unread = 0; // a repeated call therefore consumes nothing further
+    if (msg.method == kHttpPost && !params_memory &&
+        HasHeader(kHttpContentType) &&
+        IsMimeType(HeaderData(kHttpContentType),
+                   HeaderLength(kHttpContentType),
+                   "application/x-www-form-urlencoded")) {
+        params_memory = // parsed once per request; clear() frees it
+          ParseParams(payload.data(), payload.size(), &url.params);
+    }
+    // tokenize() and run() both call this, hence the !params_memory guard
+    return true;
+}
+
+bool
+Client::dispatch()
+{
+    if (!(path() == "/tokenize")) // the only route; anything else is 404
+        return send_error(404);
+    return tokenize();
+}
+
+string_view
+Client::path()
+{
+    // an empty request path is reported as the root path
+    return url.path.n ? string_view(url.path.p, url.path.n) //
+                      : string_view("/");
+}
+
+optional<string_view>
+Client::param(string_view key)
+{ // linear scan; the first parameter whose key matches exactly wins
+    for (size_t k = 0; k < url.params.n; ++k) {
+        const auto& kv = url.params.p[k];
+        if (kv.key.n == key.size() && !memcmp(key.data(), kv.key.p, key.size()))
+            return optional(string_view(kv.val.p, kv.val.n));
+    }
+    return {};
+}
+
+static string_view
+or_empty(optional<string_view> x)
+{
+    // an absent value collapses to the empty view
+    return x.has_value() ? x.value() //
+                         : string_view();
+}
+
+bool
+Client::tokenize() // handles /tokenize: replies with the prompt's tokens as json
+{
+    if (msg.method != kHttpGet && msg.method != kHttpPost)
+        return send_error(405);
+
+    if (!read_payload())
+        return false;
+
+    // get prompt
+    //
+    //   1. Allow GET "/tokenize?prompt=foo"
+    //   2. Allow POST "prompt=foo" (application/x-www-form-urlencoded)
+    //   3. Allow POST "foo" (text/plain)
+    //
+    string_view input;
+    optional<string_view> prompt = param("prompt");
+    if (prompt.has_value()) {
+        input = prompt.value();
+    } else if (HasHeader(kHttpContentType)) {
+        if (IsMimeType(HeaderData(kHttpContentType),
+                       HeaderLength(kHttpContentType),
+                       "text/plain")) {
+            input = payload;
+        } else {
+            return send_error(501, "Content Type Not Implemented");
+        }
+    } else { // no content type header: the raw payload is the prompt
+        input = payload;
+    }
+
+    // get optional parameters
+    bool add_special = atob(or_empty(param("add_special")), true);
+    bool parse_special = atob(or_empty(param("parse_special")), false);
+
+    // setup statistics
+    rusage rustart = {};
+    getrusage(RUSAGE_THREAD, &rustart);
+    timespec started = timespec_real();
+
+    // turn text into tokens
+    extern llama_model* g_model;
+    int maxcount = input.size() + 16; // token buffer: input bytes plus 16
+    llama_token* toks = new llama_token[maxcount];
+    int count = llama_tokenize(g_model,
+                               input.data(),
+                               input.size(),
+                               toks,
+                               maxcount,
+                               add_special,
+                               parse_special);
+    if (count < 0) { // a negative count is treated as fatal
+        delete[] toks;
+        __builtin_trap();
+    }
+
+    // serialize tokens to json, starting at the beginning of obuf
+    char* p = obuf.p;
+    p = stpcpy(p, "{\r\n");
+    p = stpcpy(p, "  \"add_special\": ");
+    p = encode_bool(p, add_special);
+    p = stpcpy(p, ",\n");
+    p = stpcpy(p, "  \"parse_special\": ");
+    p = encode_bool(p, parse_special);
+    p = stpcpy(p, ",\n");
+    p = stpcpy(p, "  \"tokens\": [");
+    for (int i = 0; i < count; ++i) {
+        if (i)
+            *p++ = ',';
+        p = stpcpy(p, "\r\n    ");
+        char s[32]; // NOTE(review): presumably n < 0 if a piece exceeds 32 bytes
+        int n = llama_token_to_piece(g_model, toks[i], s, sizeof(s), true);
+        if (n < 0) { // a negative length is treated as fatal
+            delete[] toks;
+            __builtin_trap();
+        }
+        p = encode_json(p, string_view(s, n));
+    }
+    p = stpcpy(p, "\r\n  ]\r\n");
+    p = stpcpy(p, "}\r\n");
+    string_view content(obuf.p, p - obuf.p);
+    delete[] toks;
+
+    // collect statistics
+    rusage ruend = {};
+    getrusage(RUSAGE_THREAD, &ruend);
+    timeval user = timeval_sub(ruend.ru_utime, rustart.ru_utime);
+    timeval system = timeval_sub(ruend.ru_stime, rustart.ru_stime);
+    timespec ended = timespec_real();
+    timespec wall = timespec_sub(ended, started);
+    long wall_us = timespec_tomicros(wall);
+    long user_us = timeval_tomicros(user);
+    long system_us = timeval_tomicros(system);
+
+    // send response; headers are built in obuf right after the json body
+    char* headers = p;
+    p = start_response(p, 200);
+    p = stpcpy(p, "Content-Type: application/json\r\n");
+    p = stpcpy(p, "X-Wall-Micros: ");
+    p = FormatInt64(p, wall_us);
+    p = stpcpy(p, "\r\nX-User-Micros: ");
+    p = FormatInt64(p, user_us);
+    p = stpcpy(p, "\r\nX-System-Micros: ");
+    p = FormatInt64(p, system_us);
+    p = stpcpy(p, "\r\n");
+    return send_response(headers, p, content);
+}
diff --git a/llamafile/server/client.h b/llamafile/server/client.h
new file mode 100644
index 0000000000..2234c87416
--- /dev/null
+++ b/llamafile/server/client.h
@@ -0,0 +1,67 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "buffer.h"
+
+#include <ctl/optional.h>
+#include <ctl/string_view.h>
+#include <net/http/http.h>
+#include <net/http/url.h>
+#include <sys/resource.h>
+#include <time.h>
+
+struct Client
+{
+    int fd = -1; // connection descriptor; -1 when none is held
+    bool close_connection = false; // when set, run() stops after this request
+    size_t unread = 0; // payload bytes that read_payload() still has to consume
+    timespec message_started; // set when a request head parses; feeds Date
+    HttpMessage msg; // parsed request head; slices are offsets into ibuf
+    Url url = {}; // filled by ParseUrl() in transport()
+    char* url_memory = nullptr; // result of ParseUrl(); freed by clear()
+    char* params_memory = nullptr; // result of ParseParams(); freed by clear()
+    ctl::string_view payload; // request payload, as a view into ibuf
+    Buffer ibuf; // bytes received from fd
+    Buffer obuf; // where responses are assembled
+
+    Client();
+
+    void run();
+    int close();
+    void clear();
+    bool transport() __wur;
+    bool synchronize() __wur; // NOTE(review): not defined in client.cpp
+    bool read_payload() __wur;
+    bool read_request() __wur;
+    bool read_content() __wur; // NOTE(review): not defined in client.cpp
+    bool send_continue() __wur; // NOTE(review): not defined in client.cpp
+    bool send(const ctl::string_view) __wur;
+    char* start_response(char*, int, const char* = nullptr);
+    bool send_error(int, const char* = nullptr) __wur;
+    bool send_response(char*, char*, const ctl::string_view) __wur;
+    bool send2(const ctl::string_view, const ctl::string_view) __wur;
+    char* append_header(const ctl::string_view, const ctl::string_view);
+    bool has_at_most_this_element(int, const ctl::string_view);
+
+    ctl::string_view path(); // url path, or "/" when it is empty
+    ctl::optional<ctl::string_view> param(ctl::string_view);
+
+    bool dispatch() __wur; // routes by path()
+    bool tokenize() __wur; // handler for /tokenize
+};
diff --git a/llamafile/server/crash.cpp b/llamafile/server/crash.cpp
new file mode 100644
index 0000000000..3aa9f2c87f
--- /dev/null
+++ b/llamafile/server/crash.cpp
@@ -0,0 +1,128 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "signals.h"
+#include "utils.h"
+
+#include <assert.h>
+#include <cosmo.h>
+#include <ucontext.h>
+
+#ifdef __aarch64__
+#define PC pc // uc_mcontext member read as the program counter
+#define BP regs[29] // uc_mcontext member read as the frame pointer
+#else
+#define PC gregs[REG_RIP] // uc_mcontext member read as the program counter
+#define BP gregs[REG_RBP] // uc_mcontext member read as the frame pointer
+#endif
+
+// returns byte length of the x86 call instruction that appears to end
+// at `p`, or 0 if none is recognized; this is a best-effort heuristic
+int
+is_call(const unsigned char* p)
+{
+    if (p[-5] == 0xe8)
+        return 5; // call Jvds
+    if (p[-2] == 0xff && (p[-1] & 070) == 020)
+        return 2; // call %reg
+    if (p[-4] == 0xff && (p[-3] & 070) == 020)
+        return 4; // call disp8(%reg,%reg)
+    if (p[-3] == 0xff && (p[-2] & 070) == 020)
+        return 3; // call disp8(%reg)
+    if (p[-7] == 0xff && (p[-6] & 070) == 020)
+        return 7; // call disp32(%reg,%reg)
+    if (p[-6] == 0xff && (p[-5] & 070) == 020)
+        return 6; // call disp32(%reg)
+    return 0;
+}
+
+void
+describe_crash(char* buf, size_t len, int sig, siginfo_t* si, void* arg)
+{ // writes "<signal> [at <addr>] bt <addresses...>" into buf
+    unassert(len >= 64);
+
+    // describe crash
+    char* p = buf;
+    char* pe = p + len;
+    char signame[21];
+    p = stpcpy(p, strsignal_r(sig, signame));
+    if (si && //
+        (sig == SIGFPE || //
+         sig == SIGILL || //
+         sig == SIGBUS || //
+         sig == SIGSEGV || //
+         sig == SIGTRAP)) {
+        p = stpcpy(p, " at ");
+        p = hexcpy(p, (long)si->si_addr);
+    }
+
+    // get stack frame daisy chain
+    long pc;
+    ucontext_t* ctx;
+    struct StackFrame* sf;
+    if ((ctx = (ucontext_t*)arg)) { // start from the context that was passed in
+        pc = ctx->uc_mcontext.PC;
+        sf = (struct StackFrame*)ctx->uc_mcontext.BP;
+    } else { // no context: start from this function's own frame
+        pc = 0;
+        sf = (struct StackFrame*)__builtin_frame_address(0);
+    }
+
+    // describe backtrace
+    p = stpcpy(p, " bt ");
+    if (pc) {
+        p = hexcpy(p, pc);
+        *p++ = ' ';
+    }
+    bool gotsome = false;
+    while (sf) {
+        if (kisdangerous(sf)) { // stop walking, noting the frame if room
+            if (p + 1 + 9 + 1 < pe) {
+                if (gotsome)
+                    *p++ = ' ';
+                p = stpcpy(p, "DANGEROUS");
+                if (p + 16 + 1 < pe) {
+                    *p++ = ' ';
+                    p = hexcpy(p, (long)sf);
+                }
+            }
+            break;
+        }
+        if (p + 16 + 1 < pe) { // room for 16 hex digits plus the terminator
+            unsigned char* ip = (unsigned char*)sf->addr;
+#ifdef __x86_64__
+            // x86 advances the program counter before an instruction
+            // begins executing. return addresses in backtraces shall
+            // point to code after the call, which means addr2line is
+            // going to print unrelated code unless we fixup the addr
+            if (!kisdangerous(ip))
+                ip -= is_call(ip);
+#endif
+            if (gotsome)
+                *p++ = ' ';
+            else
+                gotsome = true;
+            p = hexcpy(p, (long)ip);
+        } else { // out of space: truncate the backtrace
+            break;
+        }
+        sf = sf->next;
+    }
+
+    // terminate string
+    *p = '\0';
+}
diff --git a/llamafile/server/hexcpy.cpp b/llamafile/server/hexcpy.cpp
new file mode 100644
index 0000000000..bc53d50788
--- /dev/null
+++ b/llamafile/server/hexcpy.cpp
@@ -0,0 +1,29 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "utils.h"
+
+char*
+hexcpy(char* p, unsigned long x)
+{
+    // bit length of x (zero counts as one bit), rounded up to whole nibbles
+    int bits = x ? 64 - __builtin_clzl(x) : 1;
+    for (int shift = ((bits + 3) & -4) - 4; shift >= 0; shift -= 4)
+        *p++ = "0123456789abcdef"[(x >> shift) & 15];
+    *p = '\0';
+    return p;
+}
diff --git a/llamafile/server/hextoint.cpp b/llamafile/server/hextoint.cpp
new file mode 100644
index 0000000000..f6bec4f79a
--- /dev/null
+++ b/llamafile/server/hextoint.cpp
@@ -0,0 +1,37 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "utils.h"
+
+alignas(signed char) const signed char kHexToInt[256] = { // -1 = not hex
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x00
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x10
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x20
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  -1, -1, -1, -1, -1, -1, // 0x30
+    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x40
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x50
+    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x60
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x70
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x80
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x90
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0xa0
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0xb0
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0xc0
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0xd0
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0xe0
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0xf0
+};
diff --git a/llamafile/server/json.cpp b/llamafile/server/json.cpp
new file mode 100644
index 0000000000..8f03c57bc6
--- /dev/null
+++ b/llamafile/server/json.cpp
@@ -0,0 +1,172 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "json.h"
+
+#include <cosmo.h>
+#include <ctl/string.h>
+#include <net/http/escape.h>
+
+#include "double-conversion/double-to-string.h"
+#include "double-conversion/utils.h"
+
+static const char kEscapeLiteral[128] = { // 0=as-is, 1-7=short escape, 9=\u
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 2, 9, 4, 3, 9, 9, // 0x00
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 0x10
+    0, 0, 7, 0, 0, 0, 9, 9, 0, 0, 0, 0, 0, 0, 0, 6, // 0x20
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 0, // 0x30
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, // 0x50
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, // 0x70
+};
+
+static const double_conversion::DoubleToStringConverter kDoubleToJson(
+  double_conversion::DoubleToStringConverter::UNIQUE_ZERO |
+    double_conversion::DoubleToStringConverter::EMIT_POSITIVE_EXPONENT_SIGN,
+  "1e5000", // text emitted for infinity (json_test expects "1e5000")
+  "null", // text emitted for NaN (json_test expects "null")
+  'e',
+  -6,
+  21,
+  6,
+  0);
+
+char*
+encode_bool(char* p, bool x) noexcept // writes "true" or "false" at p
+{
+    return stpcpy(p, x ? "true" : "false");
+}
+
+char*
+encode_json(char* p, int x) noexcept // forwards to FormatInt32()
+{
+    return FormatInt32(p, x);
+}
+
+char*
+encode_json(char* p, long x) noexcept // forwards to FormatInt64()
+{
+    return FormatInt64(p, x);
+}
+
+char*
+encode_json(char* p, unsigned x) noexcept // forwards to FormatUint32()
+{
+    return FormatUint32(p, x);
+}
+
+char*
+encode_json(char* p, unsigned long x) noexcept // forwards to FormatUint64()
+{
+    return FormatUint64(p, x);
+}
+
+char*
+encode_json(char* p, double x) noexcept // shortest form via kDoubleToJson
+{
+    double_conversion::StringBuilder b(p, 128); // builder may use 128 bytes at p
+    kDoubleToJson.ToShortest(x, &b);
+    b.Finalize();
+    return p + strlen(p); // end of the text that was written
+}
+
+char*
+encode_json(char* p, const ctl::string_view s) noexcept
+{ // writes s as a double-quoted json string; returns pointer to the NUL
+    *p++ = '"';
+    p = encode_js_string_literal(p, s);
+    *p++ = '"';
+    *p = 0;
+    return p;
+}
+
+char*
+encode_js_string_literal(char* p, const ctl::string_view s) noexcept
+{ // writes s at p with escaping but no quotes; returns pointer to the NUL
+    uint64_t w;
+    size_t j, m;
+    wint_t x, a, b;
+    for (size_t i = 0; i < s.size();) {
+        x = s[i++] & 255;
+        if (x >= 0300) { // lead byte: try to decode a multibyte sequence
+            a = ThomPikeByte(x);
+            m = ThomPikeLen(x) - 1;
+            if (i + m <= s.size()) {
+                for (j = 0;;) {
+                    b = s[i + j] & 0xff;
+                    if (!ThomPikeCont(b)) // malformed: x stays the lead byte
+                        break;
+                    a = ThomPikeMerge(a, b);
+                    if (++j == m) { // complete: x becomes the code point
+                        x = a;
+                        i += j;
+                        break;
+                    }
+                }
+            }
+        }
+        switch (0 <= x && x <= 127 ? kEscapeLiteral[x] : 9) {
+            case 0: // emitted as-is
+                *p++ = x;
+                break;
+            case 1:
+                *p++ = '\\';
+                *p++ = 't';
+                break;
+            case 2:
+                *p++ = '\\';
+                *p++ = 'n';
+                break;
+            case 3:
+                *p++ = '\\';
+                *p++ = 'r';
+                break;
+            case 4:
+                *p++ = '\\';
+                *p++ = 'f';
+                break;
+            case 5:
+                *p++ = '\\';
+                *p++ = '\\';
+                break;
+            case 6:
+                *p++ = '\\';
+                *p++ = '/';
+                break;
+            case 7:
+                *p++ = '\\';
+                *p++ = '"';
+                break;
+            case 9: // one \uXXXX escape per 16-bit unit of EncodeUtf16(x)
+                w = EncodeUtf16(x);
+                do {
+                    *p++ = '\\';
+                    *p++ = 'u';
+                    *p++ = "0123456789abcdef"[(w & 0xF000) >> 014];
+                    *p++ = "0123456789abcdef"[(w & 0x0F00) >> 010];
+                    *p++ = "0123456789abcdef"[(w & 0x00F0) >> 004];
+                    *p++ = "0123456789abcdef"[(w & 0x000F) >> 000];
+                } while ((w >>= 16));
+                break;
+            default:
+                __builtin_unreachable();
+        }
+    }
+    *p = 0;
+    return p;
+}
diff --git a/llamafile/server/json.h b/llamafile/server/json.h
new file mode 100644
index 0000000000..99ab5e0d35
--- /dev/null
+++ b/llamafile/server/json.h
@@ -0,0 +1,43 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <ctl/string_view.h>
+
+char*
+encode_bool(char*, bool) noexcept;
+
+char*
+encode_json(char*, int) noexcept;
+
+char*
+encode_json(char*, long) noexcept;
+
+char*
+encode_json(char*, double) noexcept;
+
+char*
+encode_json(char*, unsigned) noexcept;
+
+char*
+encode_json(char*, unsigned long) noexcept;
+
+char*
+encode_json(char*, const ctl::string_view) noexcept;
+
+char*
+encode_js_string_literal(char*, const ctl::string_view) noexcept;
diff --git a/llamafile/server/json_test.cpp b/llamafile/server/json_test.cpp
new file mode 100644
index 0000000000..cf9d664702
--- /dev/null
+++ b/llamafile/server/json_test.cpp
@@ -0,0 +1,128 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "json.h"
+
+#include <ctl/string.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+using namespace ctl;
+
+string
+encode_json(int x) noexcept // test helper: result as a ctl::string
+{
+    char buf[12]; // "-2147483648" is 11 characters
+    return { buf, (size_t)(encode_json(buf, x) - buf) };
+}
+
+string
+encode_json(long x) noexcept // test helper: result as a ctl::string
+{
+    char buf[21]; // "-9223372036854775808" is 20 characters
+    return { buf, (size_t)(encode_json(buf, x) - buf) };
+}
+
+string
+encode_json(double x) noexcept // test helper: result as a ctl::string
+{
+    char buf[128]; // encode_json(char*, double) hands 128 bytes to its builder
+    return { buf, (size_t)(encode_json(buf, x) - buf) };
+}
+
+string
+encode_json(unsigned x) noexcept // test helper: result as a ctl::string
+{
+    char buf[128];
+    return { buf, (size_t)(encode_json(buf, x) - buf) };
+}
+
+string
+encode_json(unsigned long x) noexcept // test helper: result as a ctl::string
+{
+    char buf[128];
+    return { buf, (size_t)(encode_json(buf, x) - buf) };
+}
+
+string
+encode_json(const string_view x) noexcept // test helper: quoted json string
+{
+    char buf[128]; // unchecked: callers must pass short inputs
+    return { buf, (size_t)(encode_json(buf, x) - buf) };
+}
+
+string
+encode_js_string_literal(const string_view x) noexcept // escaped, unquoted
+{
+    char buf[256]; // this isn't secure (no guard page)
+    return { buf, (size_t)(encode_js_string_literal(buf, x) - buf) };
+}
+
+int
+main(int argc, char* argv[])
+{
+    // every check has its own exit code so a failure names the case
+    if (encode_json(0) != "0")
+        return 1;
+    if (encode_json(INT_MAX) != "2147483647")
+        return 2;
+    if (encode_json(INT_MIN) != "-2147483648")
+        return 3;
+    if (encode_json(UINT_MAX) != "4294967295")
+        return 4;
+    if (encode_json(LONG_MAX) != "9223372036854775807")
+        return 5;
+    if (encode_json(LONG_MIN) != "-9223372036854775808")
+        return 6;
+    if (encode_json(ULONG_MAX) != "18446744073709551615")
+        return 7;
+
+    if (encode_json("") != "\"\"")
+        return 8;
+    if (encode_json(string_view("\0\1", 2)) != "\"\\u0000\\u0001\"")
+        return 9;
+    if (encode_json("\n\"\\\t") != "\"\\n\\\"\\\\\\t\"")
+        return 10;
+    if (encode_json("'") != "\"\\u0027\"")
+        return 11;
+    if (encode_json("µ") != "\"\\u00b5\"")
+        return 12;
+    if (encode_json("𐌰") != "\"\\ud800\\udf30\"")
+        return 13;
+
+    if (encode_json(3.) != "3")
+        return 14;
+    if (encode_json(3.14) != "3.14")
+        return 15;
+    if (encode_json(1e+100) != "1e+100")
+        return 16;
+    if (encode_json(1e-100) != "1e-100")
+        return 17;
+    if (encode_json(+INFINITY) != "1e5000")
+        return 18;
+    if (encode_json(-INFINITY) != "-1e5000")
+        return 19;
+    if (encode_json(+NAN) != "null")
+        return 20;
+    if (encode_json(-NAN) != "null")
+        return 21;
+    if (encode_json(1e-300) != "1e-300")
+        return 22;
+
+    return 0;
+}
diff --git a/llamafile/server/listen.cpp b/llamafile/server/listen.cpp
new file mode 100644
index 0000000000..608589da18
--- /dev/null
+++ b/llamafile/server/listen.cpp
@@ -0,0 +1,114 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "server.h"
+
+#include <netdb.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+
+#include "log.h"
+
+// Logs one "listen http://a.b.c.d:port" line; a is ip's most significant byte.
+void print_listening_url(unsigned ip, int port)
+{
+    // the shifted values are passed whole; this relies on %hhu printing only
+    // the low byte of each argument
+    unsigned a = ip >> 24;
+    unsigned b = ip >> 16;
+    unsigned c = ip >> 8;
+    LOG("listen http://%hhu.%hhu.%hhu.%hhu:%hu", a, b, c, ip, port);
+}
+
+// Creates, binds, and listens on the IPv4 TCP socket named by hostport
+// ("host:port" or just "port"), logs its URL(s), and returns the descriptor.
+// Any failure prints a diagnostic to stderr and exits the process.
+int create_listening_socket(const char* hostport)
+{
+    // parse hostname:port
+    char* p;
+    char addr[128];
+    char* host = NULL;
+    char* port = addr;
+    strlcpy(addr, hostport, sizeof(addr));
+    if ((p = strrchr(addr, ':'))) {
+        *p = '\0';
+        host = addr;
+        port = p + 1;
+    }
+
+    // turn listen address names into numbers
+    int status;
+    struct addrinfo* ai;
+    struct addrinfo hints = {
+        .ai_family = AF_INET,
+        .ai_socktype = SOCK_STREAM,
+        .ai_protocol = IPPROTO_TCP,
+    };
+    if ((status = getaddrinfo(host, port, &hints, &ai))) {
+        fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(status));
+        exit(1);
+    }
+
+    // create socket
+    int fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
+    if (fd == -1) {
+        perror(hostport);
+        exit(1);
+    }
+
+    // these fail on some platforms but it's harmless
+    int yes = 1;
+    int qlen = 5;
+    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes));
+    setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
+
+    // bind the socket
+    if (bind(fd, (struct sockaddr*)ai->ai_addr, ai->ai_addrlen) == -1) {
+        perror(hostport);
+        exit(1);
+    }
+
+    // listen for connections
+    if (listen(fd, SOMAXCONN)) {
+        perror(hostport);
+        exit(1);
+    }
+
+    // print listening urls
+    if (getsockname(fd, (struct sockaddr*)ai->ai_addr, &ai->ai_addrlen)) {
+        perror(hostport);
+        exit(1);
+    }
+    struct sockaddr_in* in = (struct sockaddr_in*)ai->ai_addr;
+    if (ntohl(in->sin_addr.s_addr) == INADDR_ANY) {
+        // wildcard bind: list every host ip; the array may be null and is
+        // ours to free
+        uint32_t* hostips = GetHostIps();
+        for (int i = 0; hostips && hostips[i]; ++i)
+            print_listening_url(hostips[i], ntohs(in->sin_port));
+        free(hostips);
+    } else
+        print_listening_url(ntohl(in->sin_addr.s_addr), ntohs(in->sin_port));
+
+    freeaddrinfo(ai);
+    return fd;
+}
diff --git a/llamafile/server/log.cpp b/llamafile/server/log.cpp
new file mode 100644
index 0000000000..b78345407f
--- /dev/null
+++ b/llamafile/server/log.cpp
@@ -0,0 +1,90 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "log.h"
+
+#include <pthread.h>
+#include <string.h>
+
+#include "time.h"
+
+static thread_local char g_thread_name[128]; // per-thread name for get_thread_name()
+
+// Returns the calling thread's name buffer (empty until set_thread_name()).
+const char* get_thread_name(void)
+{
+    return &g_thread_name[0];
+}
+
+// Names the calling thread for pthreads (cut to 15 chars) and for log lines.
+void set_thread_name(const char* name)
+{
+    char os_name[16]; // 15 characters plus the terminating NUL
+    strlcpy(os_name, name, sizeof(os_name));
+    pthread_setname_np(pthread_self(), os_name);
+    strlcpy(g_thread_name, name, sizeof(g_thread_name));
+}
+
+// Returns "YYYY-MM-DDThh:mm:ss.uuuuuu" for the current clock_gettime(0) time
+// in a per-thread buffer. Digits up to the seconds are re-rendered only when
+// the second changes; the microseconds are rewritten on every call.
+char* get_log_timestamp(void)
+{
+    thread_local static long last;
+    thread_local static char s[27];
+    timespec ts;
+    clock_gettime(0, &ts);
+    if (ts.tv_sec != last) {
+        tm tm;
+        localtime_lockless(ts.tv_sec, &tm);
+        // date
+        int year = tm.tm_year + 1900;
+        s[0] = '0' + year / 1000;
+        s[1] = '0' + year / 100 % 10;
+        s[2] = '0' + year / 10 % 10;
+        s[3] = '0' + year % 10;
+        s[4] = '-';
+        int month = tm.tm_mon + 1; // tm_mon counts from zero
+        s[5] = '0' + month / 10;
+        s[6] = '0' + month % 10;
+        s[7] = '-';
+        s[8] = '0' + tm.tm_mday / 10;
+        s[9] = '0' + tm.tm_mday % 10;
+        // time of day
+        s[10] = 'T';
+        s[11] = '0' + tm.tm_hour / 10;
+        s[12] = '0' + tm.tm_hour % 10;
+        s[13] = ':';
+        s[14] = '0' + tm.tm_min / 10;
+        s[15] = '0' + tm.tm_min % 10;
+        s[16] = ':';
+        s[17] = '0' + tm.tm_sec / 10;
+        s[18] = '0' + tm.tm_sec % 10;
+        s[19] = '.';
+        s[26] = 0;
+        last = ts.tv_sec;
+    }
+    // fractional part: six digits of microseconds
+    int nanos = ts.tv_nsec;
+    s[20] = '0' + nanos / 100000000;
+    s[21] = '0' + nanos / 10000000 % 10;
+    s[22] = '0' + nanos / 1000000 % 10;
+    s[23] = '0' + nanos / 100000 % 10;
+    s[24] = '0' + nanos / 10000 % 10;
+    s[25] = '0' + nanos / 1000 % 10;
+    return s;
+}
diff --git a/llamafile/server/log.h b/llamafile/server/log.h
new file mode 100644
index 0000000000..1ddb2bdadf
--- /dev/null
+++ b/llamafile/server/log.h
@@ -0,0 +1,36 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <libc/intrin/kprintf.h>
+// LOG(FMT, ...): kprintf "<timestamp> <file>:<line> <thread name> <message>\n"
+#define LOG(FMT, ...) \
+    kprintf("%s %s:%d %s " FMT "\n", \
+            get_log_timestamp(), \
+            __FILE__, \
+            __LINE__, \
+            get_thread_name(), \
+            ##__VA_ARGS__)
+// name of the calling thread, as stored by set_thread_name()
+const char*
+get_thread_name(void);
+// timestamp text that LOG() puts at the start of each line
+char*
+get_log_timestamp(void);
+// sets the name that get_thread_name() reports for the calling thread
+void
+set_thread_name(const char*);
diff --git a/llamafile/server/main.cpp b/llamafile/server/main.cpp
new file mode 100644
index 0000000000..2e48aac1cb
--- /dev/null
+++ b/llamafile/server/main.cpp
@@ -0,0 +1,90 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <assert.h>
+#include <third_party/dlmalloc/dlmalloc.h>
+#include <tool/args/args.h>
+
+#include "llama.cpp/llama.h"
+#include "llamafile/llamafile.h"
+#include "llamafile/version.h"
+
+#include "json.h"
+#include "log.h"
+#include "server.h"
+#include "signals.h"
+#include "time.h"
+
+Server* g_server; // the one server instance; created and deleted by main()
+llama_model* g_model; // model that main() loads from FLAG_model
+
+// Loads flags and the model, runs the server until terminated, then cleans up.
+int main(int argc, char* argv[])
+{
+    llamafile_check_cpu();
+    if (llamafile_has(argv, "--version")) {
+        puts("llamafile-server v" LLAMAFILE_VERSION_STRING);
+        exit(0);
+    }
+    // get config
+    LoadZipArgs(&argc, &argv);
+    llamafile_get_flags(argc, argv);
+    time_init();
+
+    // load model
+    llama_model_params mparams = {
+        .n_gpu_layers = FLAG_n_gpu_layers,
+        .split_mode = (enum llama_split_mode)FLAG_split_mode,
+        .main_gpu = FLAG_main_gpu,
+        .tensor_split = nullptr,
+        .rpc_servers = nullptr,
+        .progress_callback = nullptr,
+        .progress_callback_user_data = nullptr,
+        .kv_overrides = nullptr,
+        .vocab_only = false,
+        .use_mmap = true,
+        .use_mlock = false,
+        .check_tensors = false,
+    };
+    g_model = llama_load_model_from_file(FLAG_model, mparams);
+    if (!g_model) { // refuse to serve requests without a model
+        fprintf(stderr, "%s: failed to load model\n", FLAG_model);
+        exit(1);
+    }
+
+    // create server
+    if (FLAG_workers <= 0)
+        FLAG_workers = __get_cpu_count();
+    if (FLAG_workers <= 0)
+        FLAG_workers = 16;
+    set_thread_name("server");
+    g_server = new Server(create_listening_socket(FLAG_listen));
+    for (int i = 0; i < FLAG_workers; ++i)
+        unassert(!g_server->spawn());
+    // run server
+    setup_signals();
+    g_server->run();
+    restore_signals();
+    // shutdown server
+    LOG("shutdown");
+    g_server->shutdown();
+    g_server->close();
+    delete g_server;
+    LOG("exit");
+    // quality assurance
+    CheckForMemoryLeaks();
+}
diff --git a/llamafile/server/path.cpp b/llamafile/server/path.cpp
new file mode 100644
index 0000000000..c3aee5a841
--- /dev/null
+++ b/llamafile/server/path.cpp
@@ -0,0 +1,67 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "path.h"
+
+using namespace ctl;
+
+// Returns the last component of path, ignoring trailing slashes. An empty
+// path yields "." and a path consisting only of slashes yields "/".
+string basename(const string_view path) noexcept
+{
+    size_t end = path.size();
+    if (!end)
+        return ".";
+    while (end > 1 && path[end - 1] == '/') // drop trailing slashes
+        --end;
+    size_t start = end - 1;
+    while (start && path[start - 1] != '/') // walk back to the previous slash
+        --start;
+    // [start, end) now spans the component
+    return path.substr(start, end - start);
+}
+
+// Returns path without its last component and the slashes around it: "."
+// when there is no directory part, "/" when only slashes remain.
+string dirname(const string_view path) noexcept
+{
+    if (path.empty())
+        return ".";
+    size_t i = path.size() - 1;
+    while (path[i] == '/') // trailing slashes
+        if (!i--)
+            return "/";
+    while (path[i] != '/') // the last component
+        if (!i--)
+            return ".";
+    while (path[i] == '/') // slashes in front of that component
+        if (!i--)
+            return "/";
+    return path.substr(0, i + 1);
+}
+
+// Joins rhs onto lhs, inserting '/' unless lhs already ends with one. rhs is
+// returned as-is when lhs is empty or rhs begins with '/'.
+string resolve(const string_view lhs, const string_view rhs) noexcept
+{
+    bool rhs_is_rooted = !rhs.empty() && rhs[0] == '/';
+    if (lhs.empty() || rhs_is_rooted)
+        return rhs;
+    if (lhs[lhs.size() - 1] == '/')
+        return string(lhs) + rhs;
+    return string(lhs) + "/" + rhs;
+}
diff --git a/llamafile/server/path.h b/llamafile/server/path.h
new file mode 100644
index 0000000000..c24822f594
--- /dev/null
+++ b/llamafile/server/path.h
@@ -0,0 +1,28 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <ctl/string.h>
+// directory part of a path ("." or "/" when nothing else remains)
+ctl::string
+dirname(const ctl::string_view) noexcept;
+// last component of a path, ignoring trailing slashes
+ctl::string
+basename(const ctl::string_view) noexcept;
+// joins two paths; a right-hand side starting with '/' is returned unchanged
+ctl::string
+resolve(const ctl::string_view, const ctl::string_view) noexcept;
diff --git a/llamafile/server/server.cpp b/llamafile/server/server.cpp
new file mode 100644
index 0000000000..3d3e8d98ba
--- /dev/null
+++ b/llamafile/server/server.cpp
@@ -0,0 +1,193 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "server.h"
+
+#include <assert.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <stdio.h>
+#include <sys/auxv.h>
+#include <sys/socket.h>
+#include <time.h>
+
+#include "llamafile/llamafile.h"
+#include "log.h"
+#include "server.h"
+#include "worker.h"
+
+// Stores the socket that accept() later waits on; other members keep defaults.
+Server::Server(int fd) : fd(fd)
+{}
+
+Server::~Server()
+{
+    unassert(fd == -1); // close() must already have run
+    unassert(!worker_count.load(std::memory_order_relaxed)); // no live workers
+    unassert(dll_is_empty(active_workers));
+    unassert(dll_is_empty(idle_workers));
+    pthread_mutex_destroy(&lock_);
+    pthread_cond_destroy(&cond_);
+}
+
+// Acquires lock_, the mutex held around the worker list updates.
+void Server::lock()
+{
+    pthread_mutex_lock(&lock_);
+}
+
+// Wakes one thread blocked in wait().
+void Server::signal()
+{
+    pthread_cond_signal(&cond_);
+}
+
+// Waits on cond_ with lock_ for at most one second from the current real time.
+void Server::wait()
+{
+    struct timespec second = timespec_fromseconds(1);
+    struct timespec deadline = timespec_add(timespec_real(), second);
+    pthread_cond_timedwait(&cond_, &lock_, &deadline);
+}
+
+// Releases lock_.
+void Server::unlock()
+{
+    pthread_mutex_unlock(&lock_);
+}
+
+// Sets the terminated flag and signals cond_ so a waiting run() wakes up.
+void Server::terminate()
+{
+    terminated.store(true, std::memory_order_release);
+    pthread_cond_signal(&cond_);
+}
+
+// Closes the listening socket at most once. Returns ::close()'s result, or 0
+// when the descriptor had already been closed.
+int Server::close()
+{
+    if (fd == -1)
+        return 0;
+    int rc = ::close(fd);
+    fd = -1;
+    return rc;
+}
+
+// pthread entry point: arg is the Worker to run; the result is always null.
+void* worker_thread(void* arg)
+{
+    Worker* self = static_cast<Worker*>(arg);
+    self->run();
+    return nullptr;
+}
+
+// Starts one detached worker thread with a 64 KiB stack and one guard page.
+// Returns 0, or pthread_create()'s error after deleting the unused Worker.
+errno_t Server::spawn()
+{
+    Worker* w = new Worker(this);
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setstacksize(&attr, 65536);
+    pthread_attr_setguardsize(&attr, getauxval(AT_PAGESZ));
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+    errno_t err = pthread_create(&w->th, &attr, worker_thread, w);
+    pthread_attr_destroy(&attr);
+    if (err)
+        delete w; // no thread was started for it
+    return err;
+}
+
+// Waits for one client on the listening socket. The calling thread is named
+// "listen" while it waits and is renamed to the client's dotted-quad IPv4
+// address afterwards. Returns the client descriptor, or -1 on failure.
+int Server::accept()
+{
+    // accept connection
+    set_thread_name("listen");
+    sockaddr_in peer;
+    uint32_t peerlen = sizeof(peer);
+    int clifd = ::accept(fd, (sockaddr*)&peer, &peerlen);
+    if (clifd == -1)
+        return -1;
+
+    // set name
+    char name[17];
+    unsigned ip = ntohl(peer.sin_addr.s_addr);
+    unsigned a = ip >> 24;
+    unsigned b = ip >> 16;
+    unsigned c = ip >> 8;
+    snprintf(name, sizeof(name), "%hhu.%hhu.%hhu.%hhu", a, b, c, ip);
+    set_thread_name(name);
+
+    // keep sockets open
+    if (FLAG_keepalive > 0) {
+        // the results of these calls are not checked
+        int yes = 1;
+        int secs = FLAG_keepalive;
+        setsockopt(clifd, SOL_SOCKET, SO_KEEPALIVE, &yes, sizeof(yes));
+        setsockopt(clifd, IPPROTO_TCP, TCP_NODELAY, &yes, sizeof(yes));
+        setsockopt(clifd, IPPROTO_TCP, TCP_KEEPIDLE, &secs, sizeof(secs));
+        setsockopt(clifd, IPPROTO_TCP, TCP_KEEPINTVL, &secs, sizeof(secs));
+    }
+
+    if (FLAG_verbose >= 2)
+        LOG("accept");
+    return clifd;
+}
+
+// Supervisor loop: respawns missing workers until terminate() is called.
+void Server::run()
+{
+    while (!terminated.load(std::memory_order_acquire)) {
+        lock();
+        if (!terminated.load(std::memory_order_acquire))
+            wait(); // returns when signalled or after about a second
+        unlock();
+        if (terminated.load(std::memory_order_acquire))
+            break;
+        // refill to FLAG_workers, the pool size main() started with
+        int alive = worker_count.load(std::memory_order_acquire);
+        for (int i = alive; i < FLAG_workers; ++i)
+            spawn();
+    }
+}
+
+// Cancels every worker thread, then blocks until worker_count reaches zero.
+void Server::shutdown()
+{
+    // on windows this is the only way accept() can be canceled
+    if (IsWindows())
+        close();
+
+    // kill workers
+    lock();
+    for (Dll* w = dll_first(idle_workers); w; w = dll_next(idle_workers, w))
+        WORKER(w)->kill();
+    for (Dll* w = dll_first(active_workers); w; w = dll_next(active_workers, w))
+        WORKER(w)->kill();
+    unlock();
+
+    // wait for workers to die
+    while (worker_count.load(std::memory_order_acquire) > 0) {
+        lock();
+        if (worker_count.load(std::memory_order_acquire) > 0)
+            wait();
+        unlock();
+    }
+}
diff --git a/llamafile/server/server.h b/llamafile/server/server.h
new file mode 100644
index 0000000000..967909fde8
--- /dev/null
+++ b/llamafile/server/server.h
@@ -0,0 +1,51 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <atomic>
+#include <cosmo.h>
+#include <pthread.h>
+
+struct Server
+{
+    Server(int);
+    ~Server();
+
+    int accept(); // waits for one client; returns its descriptor or -1
+    errno_t spawn(); // starts one detached worker thread
+    void terminate(); // sets `terminated` and signals cond_
+    void shutdown(); // cancels all workers and waits for worker_count == 0
+    int close(); // closes fd if open and sets it to -1
+    void run(); // supervisor loop; returns after terminate()
+    void lock(); // locks lock_
+    void unlock(); // unlocks lock_
+    void signal(); // signals cond_
+    void wait(); // waits on cond_ for up to one second
+
+    int fd; // listening socket; -1 after close()
+    Dll* idle_workers = nullptr; // list that Worker::end() pushes onto
+    Dll* active_workers = nullptr; // list that Worker::begin() pushes onto
+    pthread_cond_t cond_ = PTHREAD_COND_INITIALIZER;
+    pthread_mutex_t lock_ = PTHREAD_MUTEX_INITIALIZER;
+    std::atomic_int worker_count = ATOMIC_VAR_INIT(0); // read by run()/shutdown()
+    std::atomic_bool terminated = ATOMIC_VAR_INIT(false); // set by terminate()
+};
+
+extern Server* g_server; // defined in main.cpp
+// binds and listens on "host:port" and returns the socket (see listen.cpp)
+int
+create_listening_socket(const char*);
diff --git a/llamafile/server/signals.cpp b/llamafile/server/signals.cpp
new file mode 100644
index 0000000000..f363ce5b31
--- /dev/null
+++ b/llamafile/server/signals.cpp
@@ -0,0 +1,94 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "signals.h"
+
+#include <ucontext.h>
+
+#include "log.h"
+#include "server.h"
+
+// Termination handler: logs the signal and asks the server to stop.
+void on_termination_signal(int sig)
+{
+    LOG("%G", sig);
+    g_server->terminate();
+}
+
+// Crash handler: logs the fault, then exits only the calling thread.
+void on_crash_signal(int sig, siginfo_t* si, void* arg)
+{
+    LOG("crashed %G", sig);
+    char details[256];
+    describe_crash(details, sizeof(details), sig, si, arg);
+    LOG("crashed %s", details);
+    pthread_exit(PTHREAD_CANCELED);
+}
+
+// Installs the termination and crash signal handlers for the process.
+void setup_signals(void)
+{
+    struct sigaction sa;
+    sa.sa_flags = 0; // one-argument handler, so SA_SIGINFO must stay clear
+    sigemptyset(&sa.sa_mask);
+    sa.sa_handler = on_termination_signal;
+    sigaction(SIGINT, &sa, 0); // ctrl-c
+    sigaction(SIGHUP, &sa, 0); // terminal close
+    sigaction(SIGTERM, &sa, 0); // kill
+
+    sa.sa_flags = SA_SIGINFO; // three-argument handler from here on
+    sa.sa_sigaction = on_crash_signal;
+    sigaddset(&sa.sa_mask, SIGABRT); // abort()
+    sigaddset(&sa.sa_mask, SIGTRAP); // breakpoint
+    sigaddset(&sa.sa_mask, SIGFPE); // illegal math
+    sigaddset(&sa.sa_mask, SIGBUS); // illegal memory
+    sigaddset(&sa.sa_mask, SIGSEGV); // illegal memory
+    sigaddset(&sa.sa_mask, SIGILL); // illegal instruction
+    sigaddset(&sa.sa_mask, SIGXCPU); // out of cpu quota
+    sigaddset(&sa.sa_mask, SIGXFSZ); // file too large
+
+    sigaction(SIGABRT, &sa, 0); // abort()
+    sigaction(SIGTRAP, &sa, 0); // breakpoint
+    sigaction(SIGFPE, &sa, 0); // illegal math
+    sigaction(SIGBUS, &sa, 0); // illegal memory
+    sigaction(SIGSEGV, &sa, 0); // illegal memory
+    sigaction(SIGILL, &sa, 0); // illegal instruction
+    sigaction(SIGXCPU, &sa, 0); // out of cpu quota
+    sigaction(SIGXFSZ, &sa, 0); // file too large
+}
+
+// Puts every signal hooked by setup_signals() back to its default action.
+void restore_signals(void)
+{
+    struct sigaction sa;
+    sa.sa_flags = 0;
+    sa.sa_handler = SIG_DFL;
+    sigemptyset(&sa.sa_mask);
+    const int sigs[] = {
+        SIGINT, // ctrl-c
+        SIGHUP, // terminal close
+        SIGTERM, // kill
+        SIGABRT, // abort()
+        SIGTRAP, // breakpoint
+        SIGFPE, // illegal math
+        SIGBUS, SIGSEGV, // illegal memory
+        SIGILL, // illegal instruction
+        SIGXCPU, SIGXFSZ, // out of cpu quota, file too large
+    };
+    for (int sig : sigs)
+        sigaction(sig, &sa, 0);
+}
diff --git a/llamafile/server/signals.h b/llamafile/server/signals.h
new file mode 100644
index 0000000000..ab9104fe33
--- /dev/null
+++ b/llamafile/server/signals.h
@@ -0,0 +1,28 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <signal.h>
+// installs the server's termination and crash signal handlers
+void
+setup_signals(void);
+// resets those same signals to SIG_DFL
+void
+restore_signals(void);
+// fills the caller's buffer with text about a crash — presumably implemented in crash.cpp; not visible here
+void
+describe_crash(char*, size_t, int, siginfo_t*, void*);
diff --git a/llamafile/server/time.cpp b/llamafile/server/time.cpp
new file mode 100644
index 0000000000..c733f80ffc
--- /dev/null
+++ b/llamafile/server/time.cpp
@@ -0,0 +1,203 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "time.h"
+
+#include <atomic>
+#include <pthread.h>
+#include <signal.h>
+#include <sys/auxv.h>
+#include <unistd.h>
+
+#include "log.h"
+
+//
+// lockless implementation of gmtime_r() and localtime_r()
+//
+
+struct Clock
+{
+    std::atomic_uint roll; // counter bumped by every set_clck()
+    std::atomic_ulong time; // low 48 bits: seconds value; bits 48+: roll tag
+    std::atomic_ulong date; // low 48 bits: encode_date() value; bits 48+: tag
+};
+
+static Clock g_clck[2]; // [0] from gmtime_r(), [1] from localtime_r()
+
+static void
+set_clck(Clock* clck, long time, long date) // stores a tagged (time, date) pair
+{
+    unsigned long roll;
+    roll = clck->roll.fetch_add(1, std::memory_order_relaxed); // tag for this update
+    time &= 0xffffffffffff; // keep only the low 48 bits of each value
+    date &= 0xffffffffffff;
+    time |= roll << 48; // both words get the same tag in their upper bits
+    date |= roll << 48;
+    clck->time.store(time, std::memory_order_relaxed);
+    clck->date.store(date, std::memory_order_relaxed);
+}
+
+static void
+get_clck(Clock* clck, long* out_time, long* out_date) // loads a pair whose tags match
+{
+    long time, date;
+    do {
+        time = clck->time.load(std::memory_order_relaxed);
+        date = clck->date.load(std::memory_order_relaxed);
+    } while ((time >> 48) != (date >> 48)); // retry while the two tags differ
+    *out_date = date & 0xffffffffffff; // strip the tag, keep the low 48 bits
+    *out_time = time & 0xffffffffffff;
+}
+
+// Packs *tm as year|isdst:1|mon:4|mday:5|wday:3|hour:5|min:6|sec:6 (high to
+// low bits), which is the layout decode_date() unpacks.
+static long encode_date(const tm* tm)
+{
+    long date = tm->tm_year;
+    date <<= 1; // one bit for the DST flag
+    date |= tm->tm_isdst == 1;
+    date <<= 4; // four bits for the month (0..11)
+    date |= tm->tm_mon;
+    date <<= 5;
+    date |= tm->tm_mday;
+    date <<= 3;
+    date |= tm->tm_wday;
+    date <<= 5;
+    date |= tm->tm_hour;
+    date <<= 6;
+    date |= tm->tm_min;
+    date <<= 6;
+    date |= tm->tm_sec;
+    return date;
+}
+
+// Unpacks a packed date word into *tm, reading fields from the low bits up.
+// tm_gmtoff, tm_zone and tm_yday are not stored and are set to zero.
+static void decode_date(long date, tm* tm)
+{
+    // returns the low `bits` bits of date and shifts them out
+    auto take = [&date](int bits) {
+        int field = date & ((1 << bits) - 1);
+        date >>= bits;
+        return field;
+    };
+    tm->tm_sec = take(6);
+    tm->tm_min = take(6);
+    tm->tm_hour = take(5);
+    tm->tm_wday = take(3);
+    tm->tm_mday = take(5);
+    tm->tm_mon = take(4);
+    tm->tm_isdst = take(1);
+    tm->tm_year = date;
+    tm->tm_gmtoff = 0; // unsupported
+    tm->tm_zone = 0; // unsupported
+    tm->tm_yday = 0; // unsupported
+}
+
+// Takes one clock reading and republishes it as both UTC and local time.
+static void update_time()
+{
+    timespec now;
+    clock_gettime(0, &now);
+    tm utc, local;
+    gmtime_r(&now.tv_sec, &utc);
+    set_clck(&g_clck[0], now.tv_sec, encode_date(&utc));
+    localtime_r(&now.tv_sec, &local);
+    set_clck(&g_clck[1], now.tv_sec, encode_date(&local));
+}
+
+// Body of the clock refresher thread. It blocks the signals listed below for
+// itself, names itself "localtime", and then republishes the clock snapshots
+// every ten seconds without ever returning.
+static void* time_worker(void* arg)
+{
+    sigset_t blocked;
+    sigemptyset(&blocked);
+    const int sigs[] = { SIGHUP, SIGINT, SIGQUIT, SIGTERM, SIGUSR1, SIGALRM };
+    for (int sig : sigs)
+        sigaddset(&blocked, sig);
+    pthread_sigmask(SIG_BLOCK, &blocked, 0);
+    set_thread_name("localtime");
+    for (;;) {
+        sleep(10);
+        update_time();
+    }
+    // not reached; present because the function returns void*
+    return nullptr;
+}
+
+// Publishes the first snapshot and starts the detached refresher, or traps.
+void time_init()
+{
+    update_time();
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setstacksize(&attr, 65536);
+    pthread_attr_setguardsize(&attr, getauxval(AT_PAGESZ));
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+    pthread_t th;
+    if (pthread_create(&th, &attr, time_worker, 0))
+        __builtin_trap();
+    pthread_attr_destroy(&attr);
+}
+
+static const char kMonDays[2][12] = { // days per month, indexed [row][tm_mon]
+    { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, // February has 28
+    { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, // February has 29
+};
+
+// Fills *tm for `now` by ticking the stored snapshot forward, 60 s at most.
+static void time_lockless(Clock* clck, long now, tm* tm)
+{
+    long time, date;
+    get_clck(clck, &time, &date);
+    decode_date(date, tm);
+    long since = now - time < 60 ? now - time : 60; // bounds the loop below
+    for (; since > 0; --since) {
+        if (++tm->tm_sec < 60)
+            continue;
+        tm->tm_sec = 0;
+        if (++tm->tm_min < 60)
+            continue;
+        tm->tm_min = 0;
+        if (++tm->tm_hour < 24)
+            continue;
+        tm->tm_hour = 0;
+        tm->tm_wday = (tm->tm_wday + 1) % 7; // weekday, not day of month
+        int y = tm->tm_year + 1900; // leap rule picks the kMonDays row
+        bool leap = (y % 4 == 0 && y % 100 != 0) || y % 400 == 0;
+        if (++tm->tm_mday <= kMonDays[leap][tm->tm_mon])
+            continue;
+        tm->tm_mday = 1;
+        if (++tm->tm_mon >= 12) {
+            tm->tm_mon = 0;
+            ++tm->tm_year;
+        }
+    }
+}
+
+// Lockless gmtime_r() stand-in; reads the gmtime_r() snapshot in g_clck[0].
+void gmtime_lockless(long now, tm* tm)
+{
+    time_lockless(g_clck + 0, now, tm);
+}
+
+// Lockless localtime_r() stand-in; reads the localtime_r() snapshot, g_clck[1].
+void localtime_lockless(long now, tm* tm)
+{
+    time_lockless(g_clck + 1, now, tm);
+}
diff --git a/llamafile/security.c b/llamafile/server/time.h
similarity index 71%
rename from llamafile/security.c
rename to llamafile/server/time.h
index 84781bff4b..3ebe77e747 100644
--- a/llamafile/security.c
+++ b/llamafile/server/time.h
@@ -1,5 +1,5 @@
-// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
-// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 //
 // Copyright 2024 Mozilla Foundation
 //
@@ -15,6 +15,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "llamafile.h"
+#pragma once
+#include <time.h>
 
-bool FLAG_unsecure;
+void
+time_init(); // publishes the first snapshot and starts the refresher thread
+// lockless stand-ins for gmtime_r()/localtime_r(); the long is a seconds value
+void
+gmtime_lockless(long, tm*);
+
+void
+localtime_lockless(long, tm*);
diff --git a/llamafile/server/utils.h b/llamafile/server/utils.h
new file mode 100644
index 0000000000..0a5c39407b
--- /dev/null
+++ b/llamafile/server/utils.h
@@ -0,0 +1,27 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <ctl/string_view.h>
+
+extern const signed char kHexToInt[256]; // NOTE(review): presumably hex digit -> value table; confirm in hextoint.cpp
+
+bool
+atob(ctl::string_view, bool); // NOTE(review): looks like text-to-bool with a fallback value; confirm in atob.cpp
+
+char*
+hexcpy(char*, unsigned long); // NOTE(review): presumably writes hex digits into the buffer; confirm in hexcpy.cpp
diff --git a/llamafile/server/worker.cpp b/llamafile/server/worker.cpp
new file mode 100644
index 0000000000..e974650196
--- /dev/null
+++ b/llamafile/server/worker.cpp
@@ -0,0 +1,140 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "worker.h"
+
+#include <assert.h>
+#include <exception>
+#include <pthread.h>
+
+#include "client.h"
+#include "llamafile/llamafile.h"
+#include "log.h"
+#include "signals.h"
+
+Worker::Worker(Server* server) : server(server) // stores the server pointer
+{
+    dll_init(&elem); // ready the list link; run() is what adds it to a server list
+}
+
+void
+Worker::kill() // requests cancellation of this worker's thread
+{
+    pthread_cancel(th); // th is the pthread_t member declared in worker.h
+}
+
+// Marks this worker busy: moves it from the idle list to the active list.
+void
+Worker::begin()
+{
+    unassert(!working);
+    server->lock();
+    dll_remove(&server->idle_workers, &elem);
+    if (dll_is_empty(server->idle_workers)) { // that was the last idle worker
+        if (Dll* oldest = dll_last(server->active_workers)) {
+            LOG("all threads active! dropping oldest client");
+            WORKER(oldest)->kill(); // tail of the list = active the longest
+        }
+    }
+    working = true;
+    dll_make_first(&server->active_workers, &elem); // newest goes to the head
+    server->unlock();
+}
+
+void
+Worker::end() // active -> idle; undoes begin()
+{
+    unassert(working); // must be paired with an earlier begin()
+    server->lock();
+    dll_remove(&server->active_workers, &elem);
+    working = false;
+    dll_make_first(&server->idle_workers, &elem); // head of the idle list
+    server->unlock();
+}
+
+// Final teardown: unlink from whichever server list holds this worker, drop
+// the server's worker count, call server->signal(), then free the object.
+void
+Worker::retire()
+{
+    server->lock();
+    Dll** owner = working ? &server->active_workers : &server->idle_workers;
+    dll_remove(owner, &elem);
+    server->worker_count.fetch_sub(1, std::memory_order_acq_rel);
+    server->signal();
+    server->unlock();
+    delete this; // no member may be touched after this line
+}
+
+void
+Worker::handle(void) // one accept() followed by one client.run()
+{
+    if ((client.fd = server->accept()) == -1) {
+        LOG("accept returned %m");
+        return; // begin() was never called, so there is nothing to undo
+    }
+
+    begin(); // idle -> active (may cancel the longest-active worker)
+    pthread_cleanup_push(
+      [](void* arg) { // runs at the pop below, or if this thread is cancelled
+          Worker* worker = (Worker*)arg;
+          worker->client.close();
+          worker->end(); // active -> idle
+      },
+      this);
+
+    try {
+        client.run();
+    } catch (const std::exception& e) { // log and fall through to the cleanup
+        LOG("caught %s", e.what());
+    } catch (...) {
+        LOG("caught unknown exception");
+    }
+
+    pthread_cleanup_pop(true); // nonzero: pop the handler and execute it now
+}
+
+void
+Worker::run() // register as idle, loop on handle() until terminated, retire
+{
+    server->lock();
+    dll_make_first(&server->idle_workers, &elem); // a new worker starts out idle
+    server->worker_count.fetch_add(1, std::memory_order_acq_rel);
+    server->unlock();
+
+    pthread_cleanup_push(
+      [](void* arg) { // runs at the pop below, or if this thread is cancelled
+          Worker* worker = (Worker*)arg;
+          worker->retire(); // unlinks the worker and ends with `delete this`
+      },
+      this);
+
+    while (!server->terminated.load(std::memory_order_acquire)) {
+        sigset_t mask; // rebuilt and re-applied before every handle() call
+        sigemptyset(&mask);
+        sigaddset(&mask, SIGHUP);
+        sigaddset(&mask, SIGINT);
+        sigaddset(&mask, SIGQUIT);
+        sigaddset(&mask, SIGTERM);
+        sigaddset(&mask, SIGUSR1);
+        sigaddset(&mask, SIGALRM);
+        pthread_sigmask(SIG_BLOCK, &mask, 0); // block these six in the calling thread
+        handle();
+    }
+
+    pthread_cleanup_pop(true); // nonzero: run retire() on the normal exit path too
+}
diff --git a/llamafile/server/worker.h b/llamafile/server/worker.h
new file mode 100644
index 0000000000..1b65bff4e6
--- /dev/null
+++ b/llamafile/server/worker.h
@@ -0,0 +1,41 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "client.h"
+#include "server.h"
+#include <cosmo.h>
+#include <pthread.h>
+
+#define WORKER(e) DLL_CONTAINER(Worker, elem, e) // list element e -> the Worker embedding it
+
+struct Worker // state for one server worker; methods are defined in worker.cpp
+{
+    Server* server; // set by the constructor
+    Dll elem; // link for the server's idle_workers / active_workers lists
+    pthread_t th = 0; // thread handle that kill() passes to pthread_cancel
+    bool working = false; // true between begin() and end()
+    Client client; // handle() stores the accepted fd here, then calls client.run()
+
+    Worker(Server*);
+    void run(); // registers as idle, loops on handle(), retires on exit
+    void begin(); // idle list -> active list
+    void handle(void); // one accept() plus one client.run()
+    void end(); // active list -> idle list
+    void retire(); // unlinks from the server and deletes this object
+    void kill(); // pthread_cancel(th)
+};
diff --git a/llamafile/sgemm.h b/llamafile/sgemm.h
index 33fcd60332..ca0606f9a9 100644
--- a/llamafile/sgemm.h
+++ b/llamafile/sgemm.h
@@ -11,14 +11,14 @@ bool iqk_mul_mat(long, long, long, int, const void *, const void *, float *, lon
 bool iqk_mul_mat_zen4(long, long, long, int, const void *, const void *, float *, long, int, int);
 bool iqk_mul_mat_arm82(long, long, long, int, const void *, const void *, float *, long, int, int);
 
-bool iqk_mul_mat_moe(long, long, long, int, int, const void *, const void *,
-        float *, long, long, const void *, int, int);
-bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void *, const void *,
-        float *, long, long, const void *, int, int);
-bool iqk_mul_mat_moe_arm82(long, long, long, int, int, const void *, const void *,
-        float *, long, long, const void *, int, int);
-bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void *, const void *,
-        float *, long, long, const void *, int, int);
+bool iqk_mul_mat_moe(long, long, long, int, int, const void *, const void *, float *, long, long,
+                     const void *, int, int);
+bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void *, const void *, float *, long,
+                          long, const void *, int, int);
+bool iqk_mul_mat_moe_arm82(long, long, long, int, int, const void *, const void *, float *, long,
+                           long, const void *, int, int);
+bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void *, const void *, float *,
+                                 long, long, const void *, int, int);
 
 bool llamafile_sgemm(long, long, long, const void *, long, const void *, long, void *, long, int,
                      int, int, int, int, int, int);
@@ -73,8 +73,8 @@ bool llamafile_mixmul_arm80(const struct ggml_compute_params *, const struct ggm
 bool llamafile_mixmul_arm82(const struct ggml_compute_params *, const struct ggml_tensor *,
                             const struct ggml_tensor *, const struct ggml_tensor *,
                             struct ggml_tensor *);
-bool llamafile_mixmul_iqk(long, long, long, int, int, const void *, const void *,
-        float *, long, long, const void *, int, int);
+bool llamafile_mixmul_iqk(long, long, long, int, int, const void *, const void *, float *, long,
+                          long, const void *, int, int);
 
 #ifdef __cplusplus
 }
diff --git a/llamafile/tinyblas_cpu_unsupported.cpp b/llamafile/tinyblas_cpu_unsupported.cpp
index 166494a402..8347ef3b2f 100644
--- a/llamafile/tinyblas_cpu_unsupported.cpp
+++ b/llamafile/tinyblas_cpu_unsupported.cpp
@@ -17,9 +17,9 @@
 
 #include "sgemm.h"
 
-bool llamafile_sgemm_unsupported(int m, int n, int k, const void *A, int lda, const void *B,
-                                 int ldb, void *C, int ldc, int ith, int nth, int task, int Atype,
-                                 int Btype, int Ctype) {
+bool llamafile_sgemm_unsupported(long m, long n, long k, const void *A, long lda, const void *B,
+                                 long ldb, void *C, long ldc, int ith, int nth, int task, int Atype,
+                                 int Btype, int Ctype, int precision) {
     return false;
 }
 
@@ -30,7 +30,7 @@ bool llamafile_mixmul_unsupported(const struct ggml_compute_params *params,
     return false;
 }
 
-bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void *, const void *,
-        float *, long, long, const void *, int, int) {
+bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void *, const void *, float *,
+                                 long, long, const void *, int, int) {
     return false;
 }
diff --git a/llamafile/tokenize.cpp b/llamafile/tokenize.cpp
index ab135fc8ee..facc42c6d9 100644
--- a/llamafile/tokenize.cpp
+++ b/llamafile/tokenize.cpp
@@ -15,68 +15,82 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <cerrno>
-#include <cmath>
-#include <cosmo.h>
-#include <cstdio>
-#include <cstring>
-#include <vector>
+#include "llamafile.h"
+#include "version.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tool/args/args.h>
 
-#include "llama.cpp/common.h"
 #include "llama.cpp/llama.h"
-#include "llamafile/llamafile.h"
 
 int main(int argc, char **argv) {
+    llamafile_check_cpu();
 
     if (llamafile_has(argv, "--version")) {
         puts("llamafile-tokenize v" LLAMAFILE_VERSION_STRING);
         return 0;
     }
 
-    llamafile_check_cpu();
-    log_disable();
-
-    gpt_params params;
-    params.n_ctx = 0;
+    FLAG_log_disable = true;
 
-    if (!gpt_params_parse(argc, argv, params))
-        return 1;
+    LoadZipArgs(&argc, &argv);
+    llamafile_get_flags(argc, argv);
 
-    llama_model_params model_params = llama_model_default_params();
-    llama_model *model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model_params mparams = {
+        .n_gpu_layers = 0,
+        .split_mode = (enum llama_split_mode)FLAG_split_mode,
+        .main_gpu = 0,
+        .tensor_split = nullptr,
+        .rpc_servers = nullptr,
+        .progress_callback = nullptr,
+        .progress_callback_user_data = nullptr,
+        .kv_overrides = nullptr,
+        .vocab_only = true,
+        .use_mmap = true,
+        .use_mlock = false,
+        .check_tensors = false,
+    };
+    llama_model *model = llama_load_model_from_file(FLAG_model, mparams);
     if (model == NULL)
         return 3;
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
-    llama_context *ctx = llama_new_context_with_model(model, ctx_params);
-    if (ctx == NULL)
-        return 4;
-
-    bool should_read_stdin = params.prompt.empty();
+    FILE *input;
+    if (FLAG_prompt) {
+        input = fmemopen((void *)FLAG_prompt, strlen(FLAG_prompt), "rb");
+    } else if (FLAG_file) {
+        if (!(input = fopen(FLAG_file, "rb"))) {
+            perror(FLAG_file);
+            exit(1);
+        }
+    } else {
+        input = stdin;
+    }
 
     for (;;) {
-        ssize_t n;
-        char buf[4097];
-        const char *input;
-        if (should_read_stdin) {
-            n = read(0, buf, 4096);
-            if (n == -1) {
-                fprintf(stderr, "/dev/stdin: %s\n", strerror(errno));
+        char *text;
+        size_t textlen;
+        if (!(text = fgetln(input, &textlen)))
+            break;
+
+        static llama_token toks[4096];
+        int count = llama_tokenize(model, text, textlen, toks, 4096, false, false);
+        if (count < 0) {
+            fprintf(stderr, "%s: failed to tokenize line\n", argv[0]);
+            exit(1);
+        }
+
+        for (int i = 0; i < count; ++i) {
+
+            char s[256];
+            int n = llama_token_to_piece(model, toks[i], s, sizeof(s), false);
+            if (n < 0) {
+                fprintf(stderr, "%s: failed to convert token %d to string\n", argv[0], toks[i]);
                 exit(1);
             }
-            if (!n)
-                break;
-            buf[n] = 0;
-            input = buf;
-        } else {
-            input = params.prompt.c_str();
-        }
 
-        std::vector<llama_token> toks = ::llama_tokenize(ctx, input, false);
-        for (llama_token tok : toks) {
-            std::string str = llama_token_to_piece(ctx, tok, true);
-            const char *s = str.c_str();
-            for (int i = 0; s[i]; ++i) {
+            for (int i = 0; i < n; ++i) {
                 int c = s[i] & 255;
                 switch (c) {
                 case '\\':
@@ -123,6 +137,5 @@ int main(int argc, char **argv) {
         }
     }
 
-    llama_free(ctx);
     llama_free_model(model);
 }
diff --git a/llamafile/zipalign.c b/llamafile/zipalign.c
index c43a24caaf..5fdf3304ab 100644
--- a/llamafile/zipalign.c
+++ b/llamafile/zipalign.c
@@ -17,10 +17,10 @@
 
 #include "llamafile.h"
 #include "zip.h"
+
 #include <assert.h>
 #include <cosmo.h>
 #include <fcntl.h>
-#include <getopt.h>
 #include <libgen.h>
 #include <limits.h>
 #include <stdbool.h>
@@ -29,9 +29,13 @@
 #include <stdlib.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
+#include <third_party/getopt/getopt.internal.h>
 #include <third_party/zlib/zlib.h>
 #include <time.h>
 
+#define TINYMALLOC_MAX_BYTES (64 * 1024 * 1024)
+#include <libc/mem/tinymalloc.inc>
+
 #define CHUNK 2097152
 
 #define Min(a, b) ((a) < (b) ? (a) : (b))
@@ -39,11 +43,11 @@
 #define DOS_TIME(HOUR, MINUTE, SECOND) ((HOUR) << 11 | (MINUTE) << 5 | (SECOND) >> 1)
 
 static const char *prog;
-static int FLAG_junk;
-static int FLAG_level;
-static int FLAG_verbose;
-static int FLAG_alignment = 65536;
-static bool FLAG_nondeterministic;
+static int flag_junk;
+static int flag_level;
+static int flag_verbose;
+static int flag_alignment = 65536;
+static bool flag_nondeterministic;
 
 static wontreturn void Die(const char *thing, const char *reason) {
     tinyprint(2, thing, ": fatal error: ", reason, "\n", NULL);
@@ -113,23 +117,23 @@ int main(int argc, char *argv[]) {
         case '7':
         case '8':
         case '9':
-            FLAG_level = opt - '0';
+            flag_level = opt - '0';
             break;
         case 'v':
-            ++FLAG_verbose;
+            ++flag_verbose;
             break;
         case 'j':
-            FLAG_junk = true;
+            flag_junk = true;
             break;
         case 'N':
-            FLAG_nondeterministic = true;
+            flag_nondeterministic = true;
             break;
         case 'a':
-            FLAG_alignment = atoi(optarg);
-            if (FLAG_alignment < 1)
-                Die(prog, "FLAG_alignment must be at least 1");
-            if (FLAG_alignment & (FLAG_alignment - 1))
-                Die(prog, "FLAG_alignment must be two power");
+            flag_alignment = atoi(optarg);
+            if (flag_alignment < 1)
+                Die(prog, "flag_alignment must be at least 1");
+            if (flag_alignment & (flag_alignment - 1))
+                Die(prog, "flag_alignment must be two power");
             break;
         default:
             return 1;
@@ -201,7 +205,7 @@ int main(int argc, char *argv[]) {
     char **names = Malloc(sizeof(char *) * argc);
     for (int i = optind; i < argc; ++i) {
         names[i] = StrDup(argv[i]);
-        if (FLAG_junk)
+        if (flag_junk)
             names[i] = basename(names[i]);
         else
             while (*names[i] == '/')
@@ -266,7 +270,7 @@ int main(int argc, char *argv[]) {
         // get time
         int64_t ts;
         uint16_t mtime, mdate;
-        if (FLAG_nondeterministic)
+        if (flag_nondeterministic)
             ts = st.st_mtime;
         else
             ts = 1700000000;
@@ -277,20 +281,20 @@ int main(int argc, char *argv[]) {
         size_t namlen = strlen(name);
         size_t extlen = (2 + 2 + 8 + 8);
         size_t hdrlen = kZipLfileHdrMinSize + namlen + extlen;
-        while ((zsize + hdrlen) & (FLAG_alignment - 1))
+        while ((zsize + hdrlen) & (flag_alignment - 1))
             ++zsize;
 
         // initialize zlib in raw deflate mode
         z_stream zs;
         int compression;
-        if (!FLAG_level) {
+        if (!flag_level) {
             compression = kZipCompressionNone;
         } else {
             compression = kZipCompressionDeflate;
             zs.zalloc = 0;
             zs.zfree = 0;
             zs.opaque = 0;
-            switch (deflateInit2(&zs, FLAG_level, Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+            switch (deflateInit2(&zs, flag_level, Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
                                  Z_DEFAULT_STRATEGY)) {
             case Z_OK:
                 break;
@@ -312,7 +316,7 @@ int main(int argc, char *argv[]) {
             if ((rc = pread(fd, iobuf, Min(size, CHUNK), i)) <= 0)
                 DieSys(path);
             crc = crc32(crc, iobuf, rc);
-            if (!FLAG_level) {
+            if (!flag_level) {
                 // write uncompressed chunk to output
                 if (pwrite(zfd, iobuf, rc, zsize + hdrlen + compsize) != rc)
                     DieSys(zpath);
@@ -340,7 +344,7 @@ int main(int argc, char *argv[]) {
                 } while (!zs.avail_out);
             }
         }
-        if (FLAG_level)
+        if (flag_level)
             unassert(deflateEnd(&zs) == Z_OK);
 
         // write local file header
@@ -411,7 +415,7 @@ int main(int argc, char *argv[]) {
             DieSys(path);
 
         // log asset creation
-        if (FLAG_verbose)
+        if (flag_verbose)
             tinyprint(2, path, " -> ", name, "\n", NULL);
     }