forked from Mozilla-Ocho/llamafile
Commit
You can now build and run `o//llamafile/server/main`, which launches an HTTP server that currently supports a single endpoint at /tokenize. When wrk sends it requests to tokenize a string of 51 tokens, it serves two million requests per second on my workstation, with a 99th-percentile latency of 179 µs. This server is designed to be crash-proof, reliable, and preemptive: workers can be asynchronously canceled so the supervisor thread can respawn them. Cosmo's new memory allocator also helps this server stay fast for llama.cpp's STL-heavy use case.
Showing 39 changed files with 2,902 additions and 121 deletions.