Add /upstream endpoint (#30)
* remove catch-all route to upstream proxy (it was broken anyway)
* add /upstream/:model_id to swap and route to upstream path
* add /upstream HTML endpoint and unlisted option
* add /upstream endpoint to show a list of available models
* add `unlisted` configuration option to omit a model from /v1/models and /upstream lists
* add favicon.ico
mostlygeek authored Dec 17, 2024
1 parent 7183f6b commit 891f6a5
Showing 7 changed files with 78 additions and 22 deletions.
11 changes: 9 additions & 2 deletions README.md
@@ -8,15 +8,16 @@ llama-swap is an OpenAI API compatible server that gives you complete control ov
Features:

- ✅ Easy to deploy: single binary with no dependencies
- ✅ Easy to config: single yaml file
- ✅ On-demand model switching
- ✅ Full control over server settings per model
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
- ✅ Multiple GPU support
- ✅ Run multiple models at once with `profiles`
- ✅ Remote log monitoring at `/log`
- ✅ Automatic unloading of models from GPUs after timeout
- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
- ✅ Direct access to proxied upstream HTTP server via `/upstream/:model_id`

## Releases

@@ -73,6 +74,12 @@ models:
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
proxy: http://127.0.0.1:8999

# unlisted models do not show up in /v1/models or /upstream lists
# but they can still be requested as normal
"qwen-unlisted":
cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
unlisted: true

# profiles make it easy to manage multi-model (and GPU) configurations.
#
# Tips:
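
The `unlisted` option only affects discovery; a hidden model can still be swapped in by name. Below is a minimal sketch of requesting the `qwen-unlisted` model above, assuming llama-swap is listening on `localhost:8080` (the address and port are illustrative, not taken from this commit):

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// "qwen-unlisted" is hidden from /v1/models and /upstream,
	// but a completion request for it still triggers a swap.
	body := `{"model": "qwen-unlisted", "messages": [{"role": "user", "content": "hello"}]}`
	resp, err := http.Post("http://localhost:8080/v1/chat/completions",
		"application/json", strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```
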
3 changes: 3 additions & 0 deletions config.example.yaml
@@ -33,6 +33,7 @@ models:
- env1=hello
cmd: build/simple-responder --port 8999
proxy: http://127.0.0.1:8999
unlisted: true

# use "none" to skip check. Caution this may cause some requests to fail
# until the upstream server is ready for traffic
@@ -42,9 +43,11 @@ models:
"broken":
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
proxy: http://127.0.0.1:8999
unlisted: true
"broken_timeout":
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
proxy: http://127.0.0.1:9000
unlisted: true

# creating a coding profile with models for code generation and general questions
profiles:
Binary file added misc/assets/favicon-raw.png
Binary file not shown.
1 change: 1 addition & 0 deletions proxy/config.go
@@ -16,6 +16,7 @@ type ModelConfig struct {
Env []string `yaml:"env"`
CheckEndpoint string `yaml:"checkEndpoint"`
UnloadAfter int `yaml:"ttl"`
Unlisted bool `yaml:"unlisted"`
}

func (m *ModelConfig) SanitizedCommand() ([]string, error) {
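
A small sketch of how the new field round-trips from YAML, using a trimmed mirror of the struct above; `gopkg.in/yaml.v3` is assumed here, since the project's actual YAML dependency isn't visible in this diff:

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3" // assumed; the project's YAML library isn't shown in this diff
)

// trimmed mirror of ModelConfig, limited to the fields visible in the hunk above
type modelConfig struct {
	Env           []string `yaml:"env"`
	CheckEndpoint string   `yaml:"checkEndpoint"`
	UnloadAfter   int      `yaml:"ttl"`
	Unlisted      bool     `yaml:"unlisted"`
}

func main() {
	raw := []byte("env:\n  - env1=hello\nttl: 60\nunlisted: true\n")

	var mc modelConfig
	if err := yaml.Unmarshal(raw, &mc); err != nil {
		panic(err)
	}

	// Unlisted defaults to false when omitted, so existing configs
	// keep appearing in /v1/models and /upstream unchanged.
	fmt.Printf("%+v\n", mc) // {Env:[env1=hello] CheckEndpoint: UnloadAfter:60 Unlisted:true}
}
```
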
Binary file added proxy/html/favicon.ico
Binary file not shown.
78 changes: 65 additions & 13 deletions proxy/proxymanager.go
@@ -2,10 +2,12 @@ package proxy

import (
"bytes"
"embed"
"encoding/json"
"fmt"
"io"
"net/http"
"sort"
"strconv"
"strings"
"sync"
@@ -18,6 +20,15 @@ const (
PROFILE_SPLIT_CHAR = ":"
)

//go:embed html/favicon.ico
var faviconData []byte

//go:embed html/logs.html
var logsHTML []byte

// make sure the embed import is not removed by the IDE's auto-package importer
var _ = embed.FS{}

type ProxyManager struct {
sync.Mutex

@@ -48,7 +59,12 @@ func New(config *Config) *ProxyManager {
pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)

pm.ginEngine.GET("/upstream", pm.upstreamIndex)
pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)

pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
c.Data(http.StatusOK, "image/x-icon", faviconData)
})

// Disable console color for testing
gin.DisableConsoleColor()
@@ -86,7 +102,11 @@ func (pm *ProxyManager) stopProcesses() {

func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
data := []interface{}{}
for id, modelConfig := range pm.config.Models {
if modelConfig.Unlisted {
continue
}

data = append(data, map[string]interface{}{
"id": id,
"object": "model",
@@ -113,7 +133,7 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
pm.Lock()
defer pm.Unlock()

// Check if requestedModel contains a PROFILE_SPLIT_CHAR
profileName, modelName := "", requestedModel
if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
profileName = requestedModel[:idx]
@@ -170,6 +190,48 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
return pm.currentProcesses[requestedProcessKey], nil
}

func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
requestedModel := c.Param("model_id")

if requestedModel == "" {
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
return
}

if process, err := pm.swapModel(requestedModel); err != nil {
c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
} else {
// rewrite the path
c.Request.URL.Path = c.Param("upstreamPath")
process.ProxyRequest(c.Writer, c.Request)
}
}
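
The wildcard `*upstreamPath` means everything after the model id is forwarded verbatim once the swap completes. A hypothetical direct call to a llama.cpp-native endpoint, assuming a configured model id of `llama` and llama-swap on `localhost:8080` (both illustrative):

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// /upstream/llama/completion swaps in "llama" (if needed) and then
	// proxies the request to the upstream server as POST /completion.
	resp, err := http.Post(
		"http://localhost:8080/upstream/llama/completion",
		"application/json",
		strings.NewReader(`{"prompt": "Hello", "n_predict": 16}`),
	)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}
```
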

func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
var html strings.Builder

html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")

// Extract keys and sort them
var modelIDs []string
for modelID, modelConfig := range pm.config.Models {
if modelConfig.Unlisted {
continue
}

modelIDs = append(modelIDs, modelID)
}
sort.Strings(modelIDs)

// Iterate over sorted keys
for _, modelID := range modelIDs {
html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
}
html.WriteString("</ul></body></html>")
c.Header("Content-Type", "text/html")
c.String(http.StatusOK, html.String())
}

func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
bodyBytes, err := io.ReadAll(c.Request.Body)
if err != nil {
@@ -201,16 +263,6 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
}
}

func (pm *ProxyManager) proxyNoRouteHandler(c *gin.Context) {
// since maps are unordered, just use the first available process if one exists
for _, process := range pm.currentProcesses {
process.ProxyRequest(c.Writer, c.Request)
return
}

c.AbortWithError(http.StatusBadRequest, fmt.Errorf("no strategy to handle request"))
}

func ProcessKeyName(groupName, modelName string) string {
return groupName + PROFILE_SPLIT_CHAR + modelName
}
7 changes: 0 additions & 7 deletions proxy/proxymanager_loghandlers.go
@@ -1,20 +1,13 @@
package proxy

import (
"embed"
"fmt"
"net/http"
"strings"

"github.com/gin-gonic/gin"
)

//go:embed html/logs.html
var logsHTML []byte

// make sure embed is kept there by the IDE auto-package importer
var _ = embed.FS{}

func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {

accept := c.GetHeader("Accept")
