diff --git a/memgpt/local_llm/koboldcpp/api.py b/memgpt/local_llm/koboldcpp/api.py
index 41e5484d00..1ca93392b8 100644
--- a/memgpt/local_llm/koboldcpp/api.py
+++ b/memgpt/local_llm/koboldcpp/api.py
@@ -21,6 +21,7 @@ def get_koboldcpp_completion(prompt, context_window, grammar=None, settings=SIMP
     # Settings for the generation, includes the prompt + stop tokens, max length, etc
     request = settings
     request["prompt"] = prompt
+    request["max_context_length"] = context_window

     # Set grammar
     if grammar is not None:
diff --git a/memgpt/local_llm/koboldcpp/settings.py b/memgpt/local_llm/koboldcpp/settings.py
index ec2bb19514..c3a47dfabd 100644
--- a/memgpt/local_llm/koboldcpp/settings.py
+++ b/memgpt/local_llm/koboldcpp/settings.py
@@ -20,6 +20,6 @@
         # '\n#',
         # '\n\n\n',
     ],
-    "max_context_length": LLM_MAX_TOKENS,
+    # "max_context_length": LLM_MAX_TOKENS,
     "max_length": 512,
 }
diff --git a/memgpt/local_llm/lmstudio/api.py b/memgpt/local_llm/lmstudio/api.py
index e5440799f9..79c5f93f28 100644
--- a/memgpt/local_llm/lmstudio/api.py
+++ b/memgpt/local_llm/lmstudio/api.py
@@ -20,6 +20,7 @@ def get_lmstudio_completion(prompt, context_window, settings=SIMPLE, api="chat")

     # Settings for the generation, includes the prompt + stop tokens, max length, etc
     request = settings
+    request["max_tokens"] = context_window

     if api == "chat":
         # Uses the ChatCompletions API style
diff --git a/memgpt/local_llm/lmstudio/settings.py b/memgpt/local_llm/lmstudio/settings.py
index bc8b941660..e102577ddf 100644
--- a/memgpt/local_llm/lmstudio/settings.py
+++ b/memgpt/local_llm/lmstudio/settings.py
@@ -22,7 +22,7 @@
     # This controls the maximum number of tokens that the model can generate
     # Cap this at the model context length (assuming 8k for Mistral 7B)
     # "max_tokens": 8000,
-    "max_tokens": LLM_MAX_TOKENS,
+    # "max_tokens": LLM_MAX_TOKENS,
     # This controls how LM studio handles context overflow
     # In MemGPT we handle this ourselves, so this should be commented out
     # "lmstudio": {"context_overflow_policy": 2},
diff --git a/memgpt/local_llm/ollama/api.py b/memgpt/local_llm/ollama/api.py
index 934ba1bf38..6f1fed4d39 100644
--- a/memgpt/local_llm/ollama/api.py
+++ b/memgpt/local_llm/ollama/api.py
@@ -26,6 +26,7 @@ def get_ollama_completion(prompt, context_window, settings=SIMPLE, grammar=None)
     request = settings
     request["prompt"] = prompt
     request["model"] = MODEL_NAME
+    request["options"]["num_ctx"] = context_window

     # Set grammar
     if grammar is not None:
diff --git a/memgpt/local_llm/ollama/settings.py b/memgpt/local_llm/ollama/settings.py
index f412361ca3..47f3403385 100644
--- a/memgpt/local_llm/ollama/settings.py
+++ b/memgpt/local_llm/ollama/settings.py
@@ -22,7 +22,7 @@
             # '\n#',
             # '\n\n\n',
         ],
-        "num_ctx": LLM_MAX_TOKENS,
+        # "num_ctx": LLM_MAX_TOKENS,
     },
     "stream": False,
     # turn off Ollama's own prompt formatting
diff --git a/memgpt/local_llm/webui/api.py b/memgpt/local_llm/webui/api.py
index 211100a376..e9373e20ea 100644
--- a/memgpt/local_llm/webui/api.py
+++ b/memgpt/local_llm/webui/api.py
@@ -20,6 +20,7 @@ def get_webui_completion(prompt, context_window, settings=SIMPLE, grammar=None):
     # Settings for the generation, includes the prompt + stop tokens, max length, etc
     request = settings
     request["prompt"] = prompt
+    request["truncation_length"] = context_window  # assuming mistral 7b

     # Set grammar
     if grammar is not None:
diff --git a/memgpt/local_llm/webui/settings.py b/memgpt/local_llm/webui/settings.py
index 5d1e915763..cdccc67bef 100644
--- a/memgpt/local_llm/webui/settings.py
+++ b/memgpt/local_llm/webui/settings.py
@@ -21,5 +21,5 @@
     ],
     "max_new_tokens": 3072,
     # "truncation_length": 4096,  # assuming llama2 models
-    "truncation_length": LLM_MAX_TOKENS,  # assuming mistral 7b
+    # "truncation_length": LLM_MAX_TOKENS,  # assuming mistral 7b
 }
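
Not part of the diff: a minimal illustrative sketch of the pattern the changes above share. Each backend now overrides its context-length setting with the context_window passed in per call, instead of relying on the hardcoded LLM_MAX_TOKENS in its settings file. Only the key names max_context_length, max_tokens, num_ctx, and truncation_length come from the diff; the build_request helper and the backend-to-key mapping are hypothetical.

# Hypothetical helper for illustration only -- not code from this diff.
CONTEXT_LENGTH_KEYS = {
    "koboldcpp": "max_context_length",
    "lmstudio": "max_tokens",
    "webui": "truncation_length",
}

def build_request(backend: str, settings: dict, prompt: str, context_window: int) -> dict:
    """Mirror the pattern above: start from the backend's static settings dict,
    then set the context-length field from the caller-supplied context_window."""
    request = dict(settings)  # shallow copy for illustration; the diff mutates `settings` in place
    request["prompt"] = prompt
    if backend == "ollama":
        # Ollama nests generation options under an "options" sub-dict
        request.setdefault("options", {})["num_ctx"] = context_window
    else:
        request[CONTEXT_LENGTH_KEYS[backend]] = context_window
    return request

# e.g. payload = build_request("koboldcpp", SIMPLE, prompt, context_window)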