diff --git a/memgpt/local_llm/koboldcpp/api.py b/memgpt/local_llm/koboldcpp/api.py
index 41e5484d00..1ca93392b8 100644
--- a/memgpt/local_llm/koboldcpp/api.py
+++ b/memgpt/local_llm/koboldcpp/api.py
@@ -21,6 +21,7 @@ def get_koboldcpp_completion(prompt, context_window, grammar=None, settings=SIMP
     # Settings for the generation, includes the prompt + stop tokens, max length, etc
     request = settings
     request["prompt"] = prompt
+    request["max_context_length"] = context_window

     # Set grammar
     if grammar is not None:
diff --git a/memgpt/local_llm/koboldcpp/settings.py b/memgpt/local_llm/koboldcpp/settings.py
index ec2bb19514..c3a47dfabd 100644
--- a/memgpt/local_llm/koboldcpp/settings.py
+++ b/memgpt/local_llm/koboldcpp/settings.py
@@ -20,6 +20,6 @@
         # '\n#',
         # '\n\n\n',
     ],
-    "max_context_length": LLM_MAX_TOKENS,
+    # "max_context_length": LLM_MAX_TOKENS,
     "max_length": 512,
 }
diff --git a/memgpt/local_llm/lmstudio/api.py b/memgpt/local_llm/lmstudio/api.py
index e5440799f9..79c5f93f28 100644
--- a/memgpt/local_llm/lmstudio/api.py
+++ b/memgpt/local_llm/lmstudio/api.py
@@ -20,6 +20,7 @@ def get_lmstudio_completion(prompt, context_window, settings=SIMPLE, api="chat")

     # Settings for the generation, includes the prompt + stop tokens, max length, etc
     request = settings
+    request["max_tokens"] = context_window

     if api == "chat":
         # Uses the ChatCompletions API style
diff --git a/memgpt/local_llm/lmstudio/settings.py b/memgpt/local_llm/lmstudio/settings.py
index bc8b941660..e102577ddf 100644
--- a/memgpt/local_llm/lmstudio/settings.py
+++ b/memgpt/local_llm/lmstudio/settings.py
@@ -22,7 +22,7 @@
     # This controls the maximum number of tokens that the model can generate
     # Cap this at the model context length (assuming 8k for Mistral 7B)
     # "max_tokens": 8000,
-    "max_tokens": LLM_MAX_TOKENS,
+    # "max_tokens": LLM_MAX_TOKENS,
     # This controls how LM studio handles context overflow
     # In MemGPT we handle this ourselves, so this should be commented out
     # "lmstudio": {"context_overflow_policy": 2},
diff --git a/memgpt/local_llm/ollama/api.py b/memgpt/local_llm/ollama/api.py
index 934ba1bf38..6f1fed4d39 100644
--- a/memgpt/local_llm/ollama/api.py
+++ b/memgpt/local_llm/ollama/api.py
@@ -26,6 +26,7 @@ def get_ollama_completion(prompt, context_window, settings=SIMPLE, grammar=None)
     request = settings
     request["prompt"] = prompt
     request["model"] = MODEL_NAME
+    request["options"]["num_ctx"] = context_window

     # Set grammar
     if grammar is not None:
diff --git a/memgpt/local_llm/ollama/settings.py b/memgpt/local_llm/ollama/settings.py
index f412361ca3..47f3403385 100644
--- a/memgpt/local_llm/ollama/settings.py
+++ b/memgpt/local_llm/ollama/settings.py
@@ -22,7 +22,7 @@
             # '\n#',
             # '\n\n\n',
         ],
-        "num_ctx": LLM_MAX_TOKENS,
+        # "num_ctx": LLM_MAX_TOKENS,
     },
     "stream": False,
     # turn off Ollama's own prompt formatting
diff --git a/memgpt/local_llm/webui/api.py b/memgpt/local_llm/webui/api.py
index 211100a376..e9373e20ea 100644
--- a/memgpt/local_llm/webui/api.py
+++ b/memgpt/local_llm/webui/api.py
@@ -20,6 +20,7 @@ def get_webui_completion(prompt, context_window, settings=SIMPLE, grammar=None):
     # Settings for the generation, includes the prompt + stop tokens, max length, etc
     request = settings
     request["prompt"] = prompt
+    request["truncation_length"] = context_window  # assuming mistral 7b

     # Set grammar
     if grammar is not None:
diff --git a/memgpt/local_llm/webui/settings.py b/memgpt/local_llm/webui/settings.py
index 5d1e915763..cdccc67bef 100644
--- a/memgpt/local_llm/webui/settings.py
+++ b/memgpt/local_llm/webui/settings.py
@@ -21,5 +21,5 @@
     ],
     "max_new_tokens": 3072,
     # "truncation_length": 4096,  # assuming llama2 models
-    "truncation_length": LLM_MAX_TOKENS,  # assuming mistral 7b
+    # "truncation_length": LLM_MAX_TOKENS,  # assuming mistral 7b
 }
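
Not part of the diff: a minimal illustrative sketch of the pattern the changes above share. Each backend now overrides its context-length setting with the context_window passed in per call, instead of relying on the hardcoded LLM_MAX_TOKENS in its settings file. Only the key names max_context_length, max_tokens, num_ctx, and truncation_length come from the diff; the build_request helper and the backend-to-key mapping are hypothetical.

# Hypothetical helper for illustration only -- not code from this diff.
CONTEXT_LENGTH_KEYS = {
    "koboldcpp": "max_context_length",
    "lmstudio": "max_tokens",
    "webui": "truncation_length",
}

def build_request(backend: str, settings: dict, prompt: str, context_window: int) -> dict:
    """Mirror the pattern above: start from the backend's static settings dict,
    then set the context-length field from the caller-supplied context_window."""
    request = dict(settings)  # shallow copy for illustration; the diff mutates `settings` in place
    request["prompt"] = prompt
    if backend == "ollama":
        # Ollama nests generation options under an "options" sub-dict
        request.setdefault("options", {})["num_ctx"] = context_window
    else:
        request[CONTEXT_LENGTH_KEYS[backend]] = context_window
    return request

# e.g. payload = build_request("koboldcpp", SIMPLE, prompt, context_window)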