From 2b2923e40c66ade23554cd5650dff58fd703771a Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 22 Feb 2024 18:11:15 +0800 Subject: [PATCH 1/6] add iq2 examples --- .../Advanced-Quantizations/2Bit/README.md | 81 ++++++++++++++++++ .../Advanced-Quantizations/2Bit/generate.py | 84 +++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md create mode 100644 python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md new file mode 100644 index 00000000000..c57eb0168d7 --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md @@ -0,0 +1,81 @@ +# 2Bit + +This example shows how to directly run 2-bit(iq2) models using BigDL-LLM on Intel GPU. + +## Verified Models + +- [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf), using [llama-v2-7b.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/llama-v2-7b.imatrix) +- [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), using [llama-v2-7b.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/llama-v2-7b.imatrix) +- [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2), using [mistral-7b-instruct-v0.2.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/mistral-7b-instruct-v0.2.imatrix) +- [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1), using [mixtral-8x7b.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/mixtral-8x7b.imatrix) +- [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), using [mixtral-8x7b.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/mixtral-8x7b-instruct-v0.1.imatrix) + +## Requirements + +To run these examples with BigDL-LLM, we have some recommended requirements for your machine, please refer to [here](../../../README.md#requirements) for more information. + +## Example: Predict Tokens using `generate()` API + +In the example [generate.py](./generate.py), we show a basic use case for a 2 bit model to predict the next N tokens using `generate()` API, with BigDL-LLM 2 bit optimizations. + +### 1. Install + +We suggest using conda to manage environment: + +```bash +conda create -n llm python=3.9 +conda activate llm +# below command will install intel_extension_for_pytorch==2.1.10+xpu as default +pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu +pip install transformers==4.35.0 +``` +**Note: For Mixtral model, please use transformers 4.36.0:** +```bash +pip install transformers==4.36.0 +``` + +### 2. Configures OneAPI environment variables + +```bash +source /opt/intel/oneapi/setvars.sh +``` + +### 3. Run + +For optimal performance on Arc, it is recommended to set several environment variables. 
+ +```bash +export USE_XETLA=OFF +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +``` + +``` +python ./generate.py --repo-id-or-model-path REPO_ID_OR_MODEL_PATH --prompt PROMPT --n-predict N_PREDICT +``` + +Arguments info: + +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'meta-llama/Llama-2-7b-chat-hf'`. +- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is AI?'`. +- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. + +#### 2.3 Sample Output + +#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) + +```log +Inference time: xxxx s +-------------------- Prompt -------------------- +### HUMAN: +What is AI? + +### RESPONSE: + +-------------------- Output -------------------- +### HUMAN: +What is AI? + +### RESPONSE: + +Artificial intelligence (AI) refers to the ability of machines to perform tasks that would typically require human intelligence, such as learning, problem-solving +``` diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py new file mode 100644 index 00000000000..70564022eaf --- /dev/null +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py @@ -0,0 +1,84 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +import time +import argparse +from bigdl.llm.transformers import AutoModelForCausalLM +from transformers import AutoTokenizer +import warnings + +# you could tune the prompt based on your own model, +# here the prompt tuning refers to https://huggingface.co/georgesung/llama2_7b_chat_uncensored#prompt-style +PROMPT_FORMAT = """### HUMAN: +{prompt} + +### RESPONSE: +""" + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for LLM model') + parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf", + help='The huggingface repo id' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + warnings.warn("iq2 quantization may need several minutes, please wait a moment, " + "or have a cup of coffee now : )") + + # Load model in 2 bit, + # which convert the relevant layers in the model into iq2_xxs format. 
+ # 2 bit quantization needs weight(imatrix) file to assist in quantization + # and improve generation quality, and different model may need different + # imtraix file, you can find and download imatrix file from + # https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/tree/main. + model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_low_bit='iq2_xxs', + trust_remote_code=True, + # imatrix='llama-v2-7b.imatrix').to("xpu") + imatrix='/home/arda/ruonan/debug/imatrix/llama-v2-7b.imatrix').to("xpu") + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + # Generate predicted tokens + with torch.inference_mode(): + prompt = PROMPT_FORMAT.format(prompt=args.prompt) + input_ids = tokenizer.encode(prompt, return_tensors="pt").to("xpu") + # ipex model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + st = time.time() + # if your selected model is capable of utilizing previous key/value attentions + # to enhance decoding speed, but has `"use_cache": false` in its model config, + # it is important to set `use_cache=True` explicitly in the `generate` function + # to obtain optimal performance with BigDL-LLM Low Bit optimizations + output = model.generate(input_ids, + max_new_tokens=args.n_predict, + repetition_penalty=1.1) + end = time.time() + output_str = tokenizer.decode(output[0], skip_special_tokens=True) + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(output_str) From e8f836e7f88b18461e8d7627513f2916e7b9fa35 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 22 Feb 2024 18:13:05 +0800 Subject: [PATCH 2/6] small fix --- .../Advanced-Quantizations/2Bit/README.md | 2 +- .../Advanced-Quantizations/2Bit/generate.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md index c57eb0168d7..e6724eae775 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md @@ -1,6 +1,6 @@ # 2Bit -This example shows how to directly run 2-bit(iq2) models using BigDL-LLM on Intel GPU. +This example shows how to directly run 2 bit (iq2) models using BigDL-LLM on Intel GPU. 
## Verified Models diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py index 70564022eaf..1fc598283a2 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py @@ -54,8 +54,7 @@ model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='iq2_xxs', trust_remote_code=True, - # imatrix='llama-v2-7b.imatrix').to("xpu") - imatrix='/home/arda/ruonan/debug/imatrix/llama-v2-7b.imatrix').to("xpu") + imatrix='llama-v2-7b.imatrix').to("xpu") # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) From 322b7e76170c55b514f836abffccdc93c0b85fab Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 22 Feb 2024 19:05:34 +0800 Subject: [PATCH 3/6] meet code review --- .../Advanced-Quantizations/{2Bit => GGUF-IQ2}/README.md | 6 +++--- .../{2Bit => GGUF-IQ2}/generate.py | 9 +++++---- python/llm/src/bigdl/llm/ggml/quantize.py | 4 ++-- python/llm/src/bigdl/llm/transformers/low_bit_linear.py | 4 ++-- python/llm/src/bigdl/llm/transformers/model.py | 9 +++++---- python/llm/src/bigdl/llm/transformers/utils.py | 2 +- 6 files changed, 18 insertions(+), 16 deletions(-) rename python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/{2Bit => GGUF-IQ2}/README.md (94%) rename python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/{2Bit => GGUF-IQ2}/generate.py (90%) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md similarity index 94% rename from python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md rename to python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md index e6724eae775..52becec2c29 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/README.md +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md @@ -1,6 +1,6 @@ -# 2Bit +# GGUF-IQ2 -This example shows how to directly run 2 bit (iq2) models using BigDL-LLM on Intel GPU. +This example shows how to directly run GGUF-IQ2 models using BigDL-LLM on Intel GPU. ## Verified Models @@ -16,7 +16,7 @@ To run these examples with BigDL-LLM, we have some recommended requirements for ## Example: Predict Tokens using `generate()` API -In the example [generate.py](./generate.py), we show a basic use case for a 2 bit model to predict the next N tokens using `generate()` API, with BigDL-LLM 2 bit optimizations. +In the example [generate.py](./generate.py), we show a basic use case for a GGUF-IQ2 model to predict the next N tokens using `generate()` API, with BigDL-LLM optimizations. ### 1. 
Install diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py similarity index 90% rename from python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py rename to python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py index 1fc598283a2..338782489f9 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/2Bit/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py @@ -46,15 +46,16 @@ "or have a cup of coffee now : )") # Load model in 2 bit, - # which convert the relevant layers in the model into iq2_xxs format. - # 2 bit quantization needs weight(imatrix) file to assist in quantization + # which convert the relevant layers in the model into gguf_iq2_xxs format. + # GGUF-IQ2 quantization needs imatrix file to assist in quantization # and improve generation quality, and different model may need different # imtraix file, you can find and download imatrix file from # https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/tree/main. model = AutoModelForCausalLM.from_pretrained(model_path, - load_in_low_bit='iq2_xxs', + load_in_low_bit='gguf_iq2_xxs', trust_remote_code=True, - imatrix='llama-v2-7b.imatrix').to("xpu") + # imatrix='llama-v2-7b.imatrix').to("xpu") + imatrix='/home/arda/ruonan/debug/imatrix/llama-v2-7b.imatrix').to("xpu") # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/src/bigdl/llm/ggml/quantize.py b/python/llm/src/bigdl/llm/ggml/quantize.py index b25dd7ce3a1..382b15eb433 100644 --- a/python/llm/src/bigdl/llm/ggml/quantize.py +++ b/python/llm/src/bigdl/llm/ggml/quantize.py @@ -40,8 +40,8 @@ "fp8_e5m2": 19, # fp8 in e5m2 format "fp8": 19, # fp8 in e5m2 format "bf16": 20, - "iq2_xxs": 21, - "iq2_xs": 22, + "gguf_iq2_xxs": 21, + "gguf_iq2_xs": 22, "q2_k": 23} _llama_quantize_type = {"q4_0": 2, diff --git a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py index 9676de62c93..3a147e0f8a1 100644 --- a/python/llm/src/bigdl/llm/transformers/low_bit_linear.py +++ b/python/llm/src/bigdl/llm/transformers/low_bit_linear.py @@ -70,8 +70,8 @@ MOFQ4 = ggml_tensor_qtype["mixed_fp4"] MOFQ8 = ggml_tensor_qtype["mixed_fp8"] FP8E5 = ggml_tensor_qtype["fp8_e5m2"] -IQ2_XXS = ggml_tensor_qtype["iq2_xxs"] -IQ2_XS = ggml_tensor_qtype["iq2_xs"] +IQ2_XXS = ggml_tensor_qtype["gguf_iq2_xxs"] +IQ2_XS = ggml_tensor_qtype["gguf_iq2_xs"] Q2_K = ggml_tensor_qtype["q2_k"] diff --git a/python/llm/src/bigdl/llm/transformers/model.py b/python/llm/src/bigdl/llm/transformers/model.py index 89dcee8da9c..043c4638bf9 100644 --- a/python/llm/src/bigdl/llm/transformers/model.py +++ b/python/llm/src/bigdl/llm/transformers/model.py @@ -110,7 +110,7 @@ def from_pretrained(cls, :param load_in_low_bit: str value, options are ``'sym_int4'``, ``'asym_int4'``, ``'sym_int5'``, ``'asym_int5'``, ``'sym_int8'``, ``'nf3'``, ``'nf4'``, ``'fp4'``, ``'fp8'``, ``'fp8_e4m3'``, ``'fp8_e5m2'``, - ``'iq2_xxs'``, ``'iq2_xs'``, ``'fp16'`` or ``'bf16'``, + ``'gguf_iq2_xxs'``, ``'gguf_iq2_xs'``, ``'fp16'`` or ``'bf16'``, ``'sym_int4'`` means symmetric int 4, ``'asym_int4'`` means asymmetric int 4, ``'nf4'`` means 4-bit NormalFloat, etc. Relevant low bit optimizations will be applied to the model. 
@@ -278,12 +278,13 @@ def from_pretrained(cls, kwargs["pretraining_tp"] = 1 q_k = load_in_low_bit if load_in_low_bit else "sym_int4" imatrix_file = kwargs.pop("imatrix", None) - if q_k in ["iq2_xxs", "iq2_xs"]: + if q_k in ["gguf_iq2_xxs", "gguf_iq2_xs"]: invalidInputError(imatrix_file is not None, - "For iq2_xxs and iq2_xs quantization, imatrix is needed.") + "For gguf_iq2_xxs and gguf_iq2_xs quantization," + "imatrix is needed.") cpu_embedding = kwargs.get("cpu_embedding", False) # for 2bit, default use embedding_quantization - if q_k in ["iq2_xxs", "iq2_xs", "q2_k"] and not cpu_embedding and \ + if q_k in ["gguf_iq2_xxs", "gguf_iq2_xs", "q2_k"] and not cpu_embedding and \ embedding_qtype is None: embedding_qtype = "q2_k" if imatrix_file is not None: diff --git a/python/llm/src/bigdl/llm/transformers/utils.py b/python/llm/src/bigdl/llm/transformers/utils.py index 467bcaf68f2..ba6fba9b5bb 100644 --- a/python/llm/src/bigdl/llm/transformers/utils.py +++ b/python/llm/src/bigdl/llm/transformers/utils.py @@ -269,7 +269,7 @@ def module_name_process(full_module_name): def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_type=None): cur_qtype = qtype - if qtype in [ggml_tensor_qtype["iq2_xxs"], ggml_tensor_qtype["iq2_xs"]]: + if qtype in [ggml_tensor_qtype["gguf_iq2_xxs"], ggml_tensor_qtype["gguf_iq2_xs"]]: # For quantization which needs importance matrix new_module_name, layer, cur_module = module_name_process(full_module_name) # custom mixed quantization strategy From cf9424614768a31bae66602eabb2f2ac6831faf8 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 22 Feb 2024 19:10:04 +0800 Subject: [PATCH 4/6] fix --- .../Advanced-Quantizations/GGUF-IQ2/generate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py index 338782489f9..c85a8473c49 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/generate.py @@ -54,8 +54,7 @@ model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit='gguf_iq2_xxs', trust_remote_code=True, - # imatrix='llama-v2-7b.imatrix').to("xpu") - imatrix='/home/arda/ruonan/debug/imatrix/llama-v2-7b.imatrix').to("xpu") + imatrix='llama-v2-7b.imatrix').to("xpu") # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) From 1e2fca0244a19985f3ef4e879e26dbe8161dae51 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 22 Feb 2024 20:01:33 +0800 Subject: [PATCH 5/6] meet review --- .../Advanced-Quantizations/GGUF-IQ2/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md index 52becec2c29..dbe2b14ed14 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md @@ -1,6 +1,6 @@ # GGUF-IQ2 -This example shows how to directly run GGUF-IQ2 models using BigDL-LLM on Intel GPU. +This example shows how to run INT2 models using the IQ2 mechanism (first implemented by llama.cpp) in BigDL-LLM on Intel GPU. 
## Verified Models From 95a22d03fa2c130dc801de8e3c47a93b8b2821e5 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 22 Feb 2024 20:04:44 +0800 Subject: [PATCH 6/6] small fix --- .../Advanced-Quantizations/GGUF-IQ2/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md index dbe2b14ed14..17c1cb50bfe 100644 --- a/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md +++ b/python/llm/example/GPU/HF-Transformers-AutoModels/Advanced-Quantizations/GGUF-IQ2/README.md @@ -8,7 +8,7 @@ This example shows how to run INT2 models using the IQ2 mechanism (first impleme - [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), using [llama-v2-7b.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/llama-v2-7b.imatrix) - [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2), using [mistral-7b-instruct-v0.2.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/mistral-7b-instruct-v0.2.imatrix) - [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1), using [mixtral-8x7b.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/mixtral-8x7b.imatrix) -- [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), using [mixtral-8x7b.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/mixtral-8x7b-instruct-v0.1.imatrix) +- [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), using [mixtral-8x7b-instruct-v0.1.imatrix](https://huggingface.co/datasets/ikawrakow/imatrix-from-wiki-train/resolve/main/mixtral-8x7b-instruct-v0.1.imatrix) ## Requirements
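
---

For reference, the sketch below shows how the example introduced by this series is intended to be used once all six patches are applied (final option name `gguf_iq2_xxs`, imatrix passed by filename). This is an illustrative sketch, not part of the patches: it assumes `bigdl-llm[xpu]` and a compatible `transformers` are installed, the OneAPI environment has been sourced, and the `.imatrix` file has already been downloaded locally from the imatrix-from-wiki-train dataset linked in the README; the model id and imatrix filename are simply the ones used in the example and can be substituted.

```python
# Minimal sketch of the GGUF-IQ2 workflow described by this patch series.
# Assumption: 'llama-v2-7b.imatrix' has been downloaded into the working
# directory from the imatrix-from-wiki-train dataset referenced in the README.
import torch
from transformers import AutoTokenizer
from bigdl.llm.transformers import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"   # a verified model from the README
imatrix_path = "llama-v2-7b.imatrix"           # downloaded separately (see README links)

# 'gguf_iq2_xxs' is the option name after PATCH 3/6 renamed 'iq2_xxs';
# an imatrix file is required for gguf_iq2_xxs / gguf_iq2_xs quantization.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit='gguf_iq2_xxs',
                                             trust_remote_code=True,
                                             imatrix=imatrix_path).to("xpu")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Prompt format follows the example's PROMPT_FORMAT template.
prompt = "### HUMAN:\nWhat is AI?\n\n### RESPONSE:\n"
with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to("xpu")
    # First generate() call serves as a warm-up on XPU, as noted in generate.py.
    output = model.generate(input_ids, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```

As in the example's `generate.py`, any timing measurement should be taken on a second `generate()` call, since the first call on XPU acts as a warm-up.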