diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst
index 40fbf2a823..c6562746c3 100644
--- a/doc/source/getting_started/installation.rst
+++ b/doc/source/getting_started/installation.rst
@@ -42,7 +42,7 @@ Currently, supported models include:
 - ``llama-2``, ``llama-3``, ``llama-2-chat``, ``llama-3-instruct``
 - ``baichuan``, ``baichuan-chat``, ``baichuan-2-chat``
 - ``internlm-16k``, ``internlm-chat-7b``, ``internlm-chat-8k``, ``internlm-chat-20b``
-- ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2``, ``mistral-instruct-v0.3``
+- ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2``, ``mistral-instruct-v0.3``, ``mistral-nemo-instruct``
 - ``codestral-v0.1``
 - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k``
 - ``code-llama``, ``code-llama-python``, ``code-llama-instruct``
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 2c25cd243b..f3673f7172 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -326,6 +326,11 @@ The following is a list of built-in LLM in Xinference:
     - 32768
     - The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.
 
+  * - :ref:`mistral-nemo-instruct <models_llm_mistral-nemo-instruct>`
+    - chat
+    - 1024000
+    - The Mistral-Nemo-Instruct-2407 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-Nemo-Base-2407
+
   * - :ref:`mistral-v0.1 <models_llm_mistral-v0.1>`
     - generate
     - 8192
@@ -695,6 +700,8 @@ The following is a list of built-in LLM in Xinference:
 
    mistral-instruct-v0.3
 
+   mistral-nemo-instruct
+
    mistral-v0.1
 
    mixtral-8x22b-instruct-v0.1
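The index entry above records the family's ability (``chat``) and its 1024000-token context length. Beyond the ``xinference launch`` commands documented in the new model page below, a launched model can also be driven from the Python client. A minimal sketch, assuming a local Xinference server at the default endpoint and the client API of this release line (newer clients take ``messages=`` instead of ``prompt=``); adjust engine, format, and quantization to your setup::

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")

    # Launch the built-in family added by this patch; the keyword arguments
    # mirror the `xinference launch` flags shown in the model page below.
    uid = client.launch_model(
        model_name="mistral-nemo-instruct",
        model_engine="transformers",
        model_format="pytorch",
        model_size_in_billions=12,
        quantization="none",
    )

    model = client.get_model(uid)
    print(model.chat(prompt="Summarize Mistral Nemo in one sentence."))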
diff --git a/doc/source/models/builtin/llm/mistral-nemo-instruct.rst b/doc/source/models/builtin/llm/mistral-nemo-instruct.rst
new file mode 100644
index 0000000000..d863e09304
--- /dev/null
+++ b/doc/source/models/builtin/llm/mistral-nemo-instruct.rst
@@ -0,0 +1,159 @@
+.. _models_llm_mistral-nemo-instruct:
+
+========================================
+mistral-nemo-instruct
+========================================
+
+- **Context Length:** 1024000
+- **Model Name:** mistral-nemo-instruct
+- **Languages:** en, fr, de, es, it, pt, zh, ru, ja
+- **Abilities:** chat
+- **Description:** The Mistral-Nemo-Instruct-2407 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-Nemo-Base-2407
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 12 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 12
+- **Quantizations:** none
+- **Engines**: vLLM, Transformers
+- **Model ID:** mistralai/Mistral-Nemo-Instruct-2407
+- **Model Hubs**: `Hugging Face <https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407>`__, `ModelScope <https://modelscope.cn/models/AI-ModelScope/Mistral-Nemo-Instruct-2407>`__
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above, and ``${engine}`` with one of the engines listed for this spec::
+
+   xinference launch --model-engine ${engine} --model-name mistral-nemo-instruct --size-in-billions 12 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (pytorch, 12 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 12
+- **Quantizations:** 4-bit
+- **Engines**: Transformers
+- **Model ID:** unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit
+- **Model Hubs**: `Hugging Face <https://huggingface.co/unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit>`__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above, and ``${engine}`` with one of the engines listed for this spec::
+
+   xinference launch --model-engine ${engine} --model-name mistral-nemo-instruct --size-in-billions 12 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 3 (pytorch, 12 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 12
+- **Quantizations:** 8-bit
+- **Engines**: Transformers
+- **Model ID:** afrizalha/Mistral-Nemo-Instruct-2407-bnb-8bit
+- **Model Hubs**: `Hugging Face <https://huggingface.co/afrizalha/Mistral-Nemo-Instruct-2407-bnb-8bit>`__, `ModelScope `__
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above, and ``${engine}`` with one of the engines listed for this spec::
+
+   xinference launch --model-engine ${engine} --model-name mistral-nemo-instruct --size-in-billions 12 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 4 (gptq, 12 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** gptq
+- **Model Size (in billions):** 12
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** ModelCloud/Mistral-Nemo-Instruct-2407-gptq-4bit
+- **Model Hubs**: `Hugging Face <https://huggingface.co/ModelCloud/Mistral-Nemo-Instruct-2407-gptq-4bit>`__, `ModelScope <https://modelscope.cn/models/LLM-Research/Mistral-Nemo-Instruct-2407-gptq-4bit>`__
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above, and ``${engine}`` with one of the engines listed for this spec::
+
+   xinference launch --model-engine ${engine} --model-name mistral-nemo-instruct --size-in-billions 12 --model-format gptq --quantization ${quantization}
+
+
+Model Spec 5 (awq, 12 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 12
+- **Quantizations:** Int4
+- **Engines**: vLLM, Transformers
+- **Model ID:** casperhansen/mistral-nemo-instruct-2407-awq
+- **Model Hubs**: `Hugging Face <https://huggingface.co/casperhansen/mistral-nemo-instruct-2407-awq>`__
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above, and ``${engine}`` with one of the engines listed for this spec::
+
+   xinference launch --model-engine ${engine} --model-name mistral-nemo-instruct --size-in-billions 12 --model-format awq --quantization ${quantization}
+
+
+Model Spec 6 (ggufv2, 12 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 12
+- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_K_S, Q4_K_M, Q5_K_S, Q5_K_M, Q6_K, Q8_0, fp16
+- **Engines**: llama.cpp
+- **Model ID:** MaziyarPanahi/Mistral-Nemo-Instruct-2407-GGUF
+- **Model Hubs**: `Hugging Face <https://huggingface.co/MaziyarPanahi/Mistral-Nemo-Instruct-2407-GGUF>`__
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above, and ``${engine}`` with one of the engines listed for this spec::
+
+   xinference launch --model-engine ${engine} --model-name mistral-nemo-instruct --size-in-billions 12 --model-format ggufv2 --quantization ${quantization}
+
+
+Model Spec 7 (mlx, 12 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 12
+- **Quantizations:** none
+- **Engines**: MLX
+- **Model ID:** mlx-community/Mistral-Nemo-Instruct-2407-bf16
+- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Mistral-Nemo-Instruct-2407-bf16>`__
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above, and ``${engine}`` with one of the engines listed for this spec::
+
+   xinference launch --model-engine ${engine} --model-name mistral-nemo-instruct --size-in-billions 12 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 8 (mlx, 12 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 12
+- **Quantizations:** 4-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Mistral-Nemo-Instruct-2407-4bit
+- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Mistral-Nemo-Instruct-2407-4bit>`__
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above, and ``${engine}`` with one of the engines listed for this spec::
+
+   xinference launch --model-engine ${engine} --model-name mistral-nemo-instruct --size-in-billions 12 --model-format mlx --quantization ${quantization}
+
+
+Model Spec 9 (mlx, 12 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** mlx
+- **Model Size (in billions):** 12
+- **Quantizations:** 8-bit
+- **Engines**: MLX
+- **Model ID:** mlx-community/Mistral-Nemo-Instruct-2407-8bit
+- **Model Hubs**: `Hugging Face <https://huggingface.co/mlx-community/Mistral-Nemo-Instruct-2407-8bit>`__
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above, and ``${engine}`` with one of the engines listed for this spec::
+
+   xinference launch --model-engine ${engine} --model-name mistral-nemo-instruct --size-in-billions 12 --model-format mlx --quantization ${quantization}
+
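Every spec above shares the same launch template. As a concrete instance, filling the placeholders for Model Spec 5 with that spec's listed options (engine ``vLLM``, quantization ``Int4``) gives::

    xinference launch --model-engine vllm --model-name mistral-nemo-instruct --size-in-billions 12 --model-format awq --quantization Int4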
diff --git a/doc/source/user_guide/backends.rst b/doc/source/user_guide/backends.rst
index a2202ba162..7d00b22e18 100644
--- a/doc/source/user_guide/backends.rst
+++ b/doc/source/user_guide/backends.rst
@@ -49,7 +49,7 @@ Currently, supported model includes:
 - ``llama-2``, ``llama-3``, ``llama-2-chat``, ``llama-3-instruct``
 - ``baichuan``, ``baichuan-chat``, ``baichuan-2-chat``
 - ``internlm-16k``, ``internlm-chat-7b``, ``internlm-chat-8k``, ``internlm-chat-20b``
-- ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2``, ``mistral-instruct-v0.3``
+- ``mistral-v0.1``, ``mistral-instruct-v0.1``, ``mistral-instruct-v0.2``, ``mistral-instruct-v0.3``, ``mistral-nemo-instruct``
 - ``codestral-v0.1``
 - ``Yi``, ``Yi-1.5``, ``Yi-chat``, ``Yi-1.5-chat``, ``Yi-1.5-chat-16k``
 - ``code-llama``, ``code-llama-python``, ``code-llama-instruct``
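The two registry records that follow (the Hugging Face hub file first, then the ModelScope one) use the schema shared by all built-in families. User-defined models follow the same shape and can be registered at runtime. A minimal sketch, assuming a running server and the standard client; the record here is abbreviated and purely illustrative, since a real registration needs complete ``model_specs`` and ``prompt_style`` sections like the ones below::

    import json

    from xinference.client import Client

    # Hypothetical family record following the llm_family.json schema below.
    custom_family = {
        "version": 1,
        "context_length": 1024000,
        "model_name": "my-mistral-nemo-variant",  # hypothetical name
        "model_lang": ["en"],
        "model_ability": ["chat"],
        "model_specs": [],  # fill in as in the built-in records below
    }

    client = Client("http://127.0.0.1:9997")
    client.register_model(
        model_type="LLM",
        model=json.dumps(custom_family),
        persist=False,
    )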
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 5e456bdfb9..3b35fa333b 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -3936,6 +3936,130 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 1024000,
+    "model_name": "mistral-nemo-instruct",
+    "model_lang": [
+      "en",
+      "fr",
+      "de",
+      "es",
+      "it",
+      "pt",
+      "zh",
+      "ru",
+      "ja"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Mistral-Nemo-Instruct-2407 Large Language Model (LLM) is an instruct fine-tuned version of the Mistral-Nemo-Base-2407",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "mistralai/Mistral-Nemo-Instruct-2407",
+        "model_revision": "05b1e4f3e189ec1b5189fb3c973d4cf3369c27af"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
+        "model_revision": "1d85adc9e0fff0b8e4479a037bd75fe1346333ca"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "afrizalha/Mistral-Nemo-Instruct-2407-bnb-8bit",
+        "model_revision": "1d2dacf18a486c745219317d1507441406bc7e25"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "ModelCloud/Mistral-Nemo-Instruct-2407-gptq-4bit"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "casperhansen/mistral-nemo-instruct-2407-awq"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0",
+          "fp16"
+        ],
+        "model_id": "MaziyarPanahi/Mistral-Nemo-Instruct-2407-GGUF",
+        "model_file_name_template": "Mistral-Nemo-Instruct-2407.{quantization}.gguf"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "mlx-community/Mistral-Nemo-Instruct-2407-bf16"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "4-bit"
+        ],
+        "model_id": "mlx-community/Mistral-Nemo-Instruct-2407-4bit"
+      },
+      {
+        "model_format": "mlx",
+        "model_size_in_billions": 12,
+        "quantizations": [
+          "8-bit"
+        ],
+        "model_id": "mlx-community/Mistral-Nemo-Instruct-2407-8bit"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "mistral-nemo",
+      "system_prompt": "",
+      "roles": [
+        "[INST]",
+        "[/INST]"
+      ],
+      "intra_message_sep": "",
+      "inter_message_sep": "",
+      "stop_token_ids": [
+        2
+      ],
+      "stop": [
+        "</s>"
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 32768,
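The ``prompt_style`` block above is consumed by the ``mistral-nemo`` branch added to ``utils.py`` later in this diff. To make the resulting wire format concrete, here is a standalone re-run of that loop on a toy history (plain Python with the roles and separators copied from the JSON above, not the actual Xinference classes)::

    # Mirrors the "mistral-nemo" branch added to utils.py in this diff.
    roles = {"user": "[INST]", "assistant": "[/INST]"}
    seps = ["", ""]  # intra_message_sep, inter_message_sep
    system_prompt = ""  # empty by default, so nothing is injected below

    chat_history = [
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi, how can I help?"},
        {"role": "user", "content": "Name the base model."},
        {"role": "assistant", "content": ""},  # empty turn to be generated
    ]

    ret = ""
    for i, message in enumerate(chat_history):
        role = roles[message["role"]]
        content = message["content"]
        if content:
            # The system prompt, when set, is prepended to the last user turn.
            if i == len(chat_history) - 2 and system_prompt:
                ret += role + " " + system_prompt + "\n\n" + content + seps[i % 2]
            else:
                ret += role + " " + content + seps[i % 2]
        else:
            ret += role

    print(ret)
    # [INST] Hello![/INST] Hi, how can I help?[INST] Name the base model.[/INST]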
"model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 12, + "quantizations": [ + "Int4" + ], + "model_id": "LLM-Research/Mistral-Nemo-Instruct-2407-gptq-4bit", + "model_hub": "modelscope" + } + ], + "prompt_style": { + "style_name": "mistral-nemo", + "system_prompt": "", + "roles": [ + "[INST]", + "[/INST]" + ], + "intra_message_sep": "", + "inter_message_sep": "", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] + } + }, { "version": 1, "context_length": 2048, diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index 9ba61ea78b..aadcbf9471 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -483,6 +483,27 @@ def get_role(role_name: str): else: ret += role return ret + elif prompt_style.style_name == "mistral-nemo": + seps = [prompt_style.intra_message_sep, prompt_style.inter_message_sep] + ret = "" + for i, message in enumerate(chat_history): + role = get_role(message["role"]) + content = message["content"] + if content: + if i == len(chat_history) - 2 and prompt_style.system_prompt: + ret += ( + role + + " " + + prompt_style.system_prompt + + "\n\n" + + content + + seps[i % 2] + ) + else: + ret += role + " " + content + seps[i % 2] + else: + ret += role + return ret else: raise ValueError(f"Invalid prompt style: {prompt_style.style_name}") diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 77b0b55b28..bffdf22ecc 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -151,6 +151,9 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_CHAT_MODELS.append("qwen2-moe-instruct") VLLM_SUPPORTED_CHAT_MODELS.append("c4ai-command-r-v01") +if VLLM_INSTALLED and vllm.__version__ >= "0.5.3": + VLLM_SUPPORTED_CHAT_MODELS.append("mistral-nemo-instruct") + class VLLMModel(LLM): def __init__(