Patch local LLMs with context_window (#416)
* patch

* patch ollama

* patch lmstudio

* patch kobold
cpacker authored Nov 10, 2023
1 parent 873c044 commit 20c08b0
Showing 8 changed files with 8 additions and 4 deletions.
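
The same one-line pattern repeats across every backend touched below: the caller-supplied context_window is written into the request payload at call time, and the matching hardcoded LLM_MAX_TOKENS entry in each static settings dict is commented out, so the per-call value becomes the single source of truth. A minimal sketch of that pattern in Python (the key names come from the hunks below; the helper itself, the deepcopy, and the tuple handling for Ollama's nested key are illustrative assumptions, not code from this commit):

# Hypothetical helper illustrating the pattern applied across the patched backends.
import copy

# Context-window key per backend, as set in the api.py hunks of this commit.
CONTEXT_KEY = {
    "koboldcpp": "max_context_length",
    "lmstudio": "max_tokens",
    "ollama": ("options", "num_ctx"),  # Ollama nests the value under "options"
    "webui": "truncation_length",
}

def apply_context_window(backend: str, settings: dict, context_window: int) -> dict:
    """Return a copy of `settings` with the backend-specific context-window key set."""
    request = copy.deepcopy(settings)  # the real wrappers mutate `settings` in place
    key = CONTEXT_KEY[backend]
    if isinstance(key, tuple):
        outer, inner = key
        request.setdefault(outer, {})[inner] = context_window
    else:
        request[key] = context_window
    return request

# e.g. apply_context_window("ollama", {"options": {}}, 8192) -> {"options": {"num_ctx": 8192}}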
1 change: 1 addition & 0 deletions memgpt/local_llm/koboldcpp/api.py
@@ -21,6 +21,7 @@ def get_koboldcpp_completion(prompt, context_window, grammar=None, settings=SIMP
     # Settings for the generation, includes the prompt + stop tokens, max length, etc
     request = settings
     request["prompt"] = prompt
+    request["max_context_length"] = context_window

     # Set grammar
     if grammar is not None:
2 changes: 1 addition & 1 deletion memgpt/local_llm/koboldcpp/settings.py
@@ -20,6 +20,6 @@
         # '\n#',
         # '\n\n\n',
     ],
-    "max_context_length": LLM_MAX_TOKENS,
+    # "max_context_length": LLM_MAX_TOKENS,
     "max_length": 512,
 }
1 change: 1 addition & 0 deletions memgpt/local_llm/lmstudio/api.py
@@ -20,6 +20,7 @@ def get_lmstudio_completion(prompt, context_window, settings=SIMPLE, api="chat")

     # Settings for the generation, includes the prompt + stop tokens, max length, etc
     request = settings
+    request["max_tokens"] = context_window

     if api == "chat":
         # Uses the ChatCompletions API style
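
With this hunk, an LM Studio request built from the default settings carries the context window in its top-level "max_tokens" field (which the settings file, below, documents as capped at the model context length). A rough sketch of the resulting chat-style payload; the stop sequence, message shape, and local endpoint are assumptions for illustration, not taken from this diff:

import json

context_window = 8192  # example value supplied by the caller
request = {
    "stop": ["\nUSER:"],           # assumed stop sequence from the settings dict
    "max_tokens": context_window,  # set by the patched get_lmstudio_completion
    "messages": [{"role": "user", "content": "..."}],  # chat-style body (assumed shape)
}
print(json.dumps(request, indent=2))
# Typically POSTed to LM Studio's local server, e.g. http://localhost:1234/v1/chat/completions (assumption).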
2 changes: 1 addition & 1 deletion memgpt/local_llm/lmstudio/settings.py
@@ -22,7 +22,7 @@
     # This controls the maximum number of tokens that the model can generate
     # Cap this at the model context length (assuming 8k for Mistral 7B)
     # "max_tokens": 8000,
-    "max_tokens": LLM_MAX_TOKENS,
+    # "max_tokens": LLM_MAX_TOKENS,
     # This controls how LM studio handles context overflow
     # In MemGPT we handle this ourselves, so this should be commented out
     # "lmstudio": {"context_overflow_policy": 2},
1 change: 1 addition & 0 deletions memgpt/local_llm/ollama/api.py
@@ -26,6 +26,7 @@ def get_ollama_completion(prompt, context_window, settings=SIMPLE, grammar=None)
     request = settings
     request["prompt"] = prompt
     request["model"] = MODEL_NAME
+    request["options"]["num_ctx"] = context_window

     # Set grammar
     if grammar is not None:
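
Ollama is the one backend where the value is nested: num_ctx goes inside the request's "options" dict rather than at the top level, so the settings dict must already define an "options" key for this line to work (it does, per the settings hunk below). A small sketch of the resulting payload; the model name and stop list are placeholders, not values from this diff:

context_window = 8192  # example value supplied by the caller
request = {
    "model": "mistral",               # placeholder; the real value comes from MODEL_NAME
    "prompt": "...",
    "options": {
        "stop": ["\nUSER:"],          # assumed contents of the settings "options" dict
        "num_ctx": context_window,    # set by the patched get_ollama_completion
    },
    "stream": False,
}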
2 changes: 1 addition & 1 deletion memgpt/local_llm/ollama/settings.py
@@ -22,7 +22,7 @@
             # '\n#',
             # '\n\n\n',
         ],
-        "num_ctx": LLM_MAX_TOKENS,
+        # "num_ctx": LLM_MAX_TOKENS,
     },
     "stream": False,
     # turn off Ollama's own prompt formatting
1 change: 1 addition & 0 deletions memgpt/local_llm/webui/api.py
@@ -20,6 +20,7 @@ def get_webui_completion(prompt, context_window, settings=SIMPLE, grammar=None):
     # Settings for the generation, includes the prompt + stop tokens, max length, etc
     request = settings
     request["prompt"] = prompt
+    request["truncation_length"] = context_window  # assuming mistral 7b

     # Set grammar
     if grammar is not None:
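
For the text-generation-webui backend the context window maps to "truncation_length", while the generation cap ("max_new_tokens") stays in settings.py. A quick sketch of a payload after this patch; the prompt is a placeholder, and 3072 is the cap shown in the settings hunk below:

context_window = 8192  # example value supplied by the caller
request = {
    "prompt": "...",
    "max_new_tokens": 3072,               # generation cap, still configured in settings.py
    "truncation_length": context_window,  # set by the patched get_webui_completion
}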
2 changes: 1 addition & 1 deletion memgpt/local_llm/webui/settings.py
@@ -21,5 +21,5 @@
     ],
     "max_new_tokens": 3072,
     # "truncation_length": 4096, # assuming llama2 models
-    "truncation_length": LLM_MAX_TOKENS, # assuming mistral 7b
+    # "truncation_length": LLM_MAX_TOKENS, # assuming mistral 7b
 }
