From 381d448ee291f3e619575d73bbd3db2202c63e83 Mon Sep 17 00:00:00 2001 From: Yuwen Hu <54161268+Oscilloscope98@users.noreply.github.com> Date: Tue, 7 Jan 2025 13:52:41 +0800 Subject: [PATCH] [NPU] Example & Quickstart updates (#12650) * Remove model with optimize_model=False in NPU verified models tables, and remove related example * Remove experimental in run optimized model section title * Unify model table order & example cmd * Move embedding example to separate folder & update quickstart example link * Add Quickstart reference in main NPU readme * Small fix * Small fix * Move save/load examples under NPU/HF-Transformers-AutoModels * Add low-bit and polish arguments for LLM Python examples * Small fix * Add low-bit and polish arguments for Multi-Model examples * Polish argument for Embedding models * Polish argument for LLM CPP examples * Add low-bit and polish argument for Save-Load examples * Add accuracy tuning tips for examples * Update NPU qucikstart accuracy tuning with low-bit optimizations * Add save/load section to qucikstart * Update CPP example sample output to EN * Add installation regarding cmake for CPP examples * Small fix * Small fix * Small fix * Small fix * Small fix * Small fix * Unify max prompt length to 512 * Change recommended low-bit for Qwen2.5-3B-Instruct to asym_int4 * Update based on comments * Small fix --- README.md | 2 +- README.zh-CN.md | 2 +- docs/mddocs/Quickstart/npu_quickstart.md | 53 +++++--- .../Embedding/README.md | 59 +++++++++ .../bce-embedding.py | 4 +- .../LLM/CPP_Examples/README.md | 101 +++++++++------- .../LLM/CPP_Examples/convert.py | 16 ++- .../HF-Transformers-AutoModels/LLM/README.md | 100 ++++++---------- .../LLM/baichuan2.py | 11 +- .../LLM/generate.py | 83 ------------- .../NPU/HF-Transformers-AutoModels/LLM/glm.py | 11 +- .../HF-Transformers-AutoModels/LLM/llama2.py | 13 +- .../HF-Transformers-AutoModels/LLM/llama3.py | 13 +- .../HF-Transformers-AutoModels/LLM/minicpm.py | 15 ++- .../HF-Transformers-AutoModels/LLM/qwen.py | 15 +-- .../Multimodal/README.md | 107 +++++------------ .../Multimodal/generate.py | 113 ------------------ .../Multimodal/minicpm-llama3-v2.5.py | 12 +- .../Multimodal/minicpm_v_2_6.py | 14 +-- .../Multimodal/speech_paraformer-large.py | 6 +- .../NPU/HF-Transformers-AutoModels/README.md | 44 ++++--- .../{LLM => }/Save-Load/README.md | 11 +- .../{LLM => }/Save-Load/generate.py | 4 +- 23 files changed, 314 insertions(+), 495 deletions(-) create mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/README.md rename python/llm/example/NPU/HF-Transformers-AutoModels/{Multimodal => Embedding}/bce-embedding.py (91%) delete mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py delete mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/generate.py rename python/llm/example/NPU/HF-Transformers-AutoModels/{LLM => }/Save-Load/README.md (69%) rename python/llm/example/NPU/HF-Transformers-AutoModels/{LLM => }/Save-Load/generate.py (95%) diff --git a/README.md b/README.md index f46fed04c8c..6732178ee3b 100644 --- a/README.md +++ b/README.md @@ -337,7 +337,7 @@ Over 70 models have been optimized/verified on `ipex-llm`, including *LLaMA/LLaM | MiniCPM-Llama3-V-2_5 | | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5) | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal) | | MiniCPM-V-2_6 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6) | 
[link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6) | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal) | | StableDiffusion | | [link](python/llm/example/GPU/HuggingFace/Multimodal/StableDiffusion) | -| Bce-Embedding-Base-V1 | | | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal) | +| Bce-Embedding-Base-V1 | | | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Embedding) | | Speech_Paraformer-Large | | | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal) | ## Get Support diff --git a/README.zh-CN.md b/README.zh-CN.md index 0567189d76b..fbccb135121 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -337,7 +337,7 @@ See the demo of running [*Text-Generation-WebUI*](https://ipex-llm.readthedocs.i | MiniCPM-Llama3-V-2_5 | | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-Llama3-V-2_5) | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal) | | MiniCPM-V-2_6 | [link](python/llm/example/CPU/HF-Transformers-AutoModels/Model/minicpm-v-2_6) | [link](python/llm/example/GPU/HuggingFace/Multimodal/MiniCPM-V-2_6) | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal) | | StableDiffusion | | [link](python/llm/example/GPU/HuggingFace/Multimodal/StableDiffusion) | -| Bce-Embedding-Base-V1 | | | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal) | +| Bce-Embedding-Base-V1 | | | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Embedding) | | Speech_Paraformer-Large | | | [Python link](python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal) | ## 官方支持 diff --git a/docs/mddocs/Quickstart/npu_quickstart.md b/docs/mddocs/Quickstart/npu_quickstart.md index b78b681b8c2..d80d1c742cb 100644 --- a/docs/mddocs/Quickstart/npu_quickstart.md +++ b/docs/mddocs/Quickstart/npu_quickstart.md @@ -71,6 +71,19 @@ conda activate llm-npu > [!TIP] > `ipex-llm` for NPU supports Python 3.10 and 3.11. +### (Optional) Install CMake + +> [!NOTE] +> Cmake installation is for IPEX-LLM **C++ API** on Intel NPU. If you plan to use the **Python API**, skip this step. 
+ +With the `llm-npu` environment active, install CMake: + +```cmd +conda activate llm-npu + +pip install cmake +``` + ## Install `ipex-llm` with NPU Support With the `llm-npu` environment active, use `pip` to install `ipex-llm` for NPU: @@ -115,24 +128,28 @@ Refer to the following table for examples of verified models: [](../../../python/llm/) | Model | Model link | Example link | Verified Platforms | |:--|:--|:--|:--| -| LLaMA 2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md#2-run-optimized-models-experimental) | Meteor Lake, Lunar Lake, Arrow Lake | -| LLaMA 3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md#2-run-optimized-models-experimental) | Meteor Lake, Lunar Lake, Arrow Lake | -| LLaMA 3.2 | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md#2-run-optimized-models-experimental) | Meteor Lake, Lunar Lake, Arrow Lake | -| Qwen 2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct), [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md#2-run-optimized-models-experimental) | Meteor Lake, Lunar Lake, Arrow Lake | -| Qwen 2.5 | [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md#2-run-optimized-models-experimental) | Lunar Lake | -| | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md#2-run-optimized-models-experimental) | Meteor Lake, Lunar Lake, Arrow Lake | -| GLM-Edge | [THUDM/glm-edge-1.5b-chat](https://huggingface.co/THUDM/glm-edge-1.5b-chat), [THUDM/glm-edge-4b-chat](https://huggingface.co/THUDM/glm-edge-4b-chat) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md#2-run-optimized-models-experimental) | Meteor Lake, Lunar Lake, Arrow Lake | -| MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md#2-run-optimized-models-experimental) | Meteor Lake, Lunar Lake, Arrow Lake | -| Baichuan 2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md#2-run-optimized-models-experimental) | Lunar Lake | -| MiniCPM-Llama3-V-2_5 | [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md#2-run-optimized-models-experimental) | Lunar Lake | -| MiniCPM-V-2_6 | [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md#2-run-optimized-models-experimental) | Lunar Lake | -| Bce-Embedding-Base-V1 | 
[maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md#2-run-optimized-models-experimental) | Lunar Lake | -| Speech_Paraformer-Large | [iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md#2-run-optimized-models-experimental) | Lunar Lake | +| LLaMA 2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/) | Meteor Lake, Lunar Lake, Arrow Lake | +| LLaMA 3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/) | Meteor Lake, Lunar Lake, Arrow Lake | +| LLaMA 3.2 | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/) | Meteor Lake, Lunar Lake, Arrow Lake | +| GLM-Edge | [THUDM/glm-edge-1.5b-chat](https://huggingface.co/THUDM/glm-edge-1.5b-chat), [THUDM/glm-edge-4b-chat](https://huggingface.co/THUDM/glm-edge-4b-chat) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/) | Meteor Lake, Lunar Lake, Arrow Lake | +| Qwen 2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct), [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/) | Meteor Lake, Lunar Lake, Arrow Lake | +| Qwen 2.5 | [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/) | Lunar Lake | +| | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/) | Meteor Lake, Lunar Lake, Arrow Lake | +| MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/) | Meteor Lake, Lunar Lake, Arrow Lake | +| Baichuan 2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/LLM/) | Lunar Lake | +| MiniCPM-Llama3-V-2_5 | [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/) | Lunar Lake | +| MiniCPM-V-2_6 | [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/) | Lunar Lake | +| Speech_Paraformer-Large | [iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/) | Lunar Lake | +| Bce-Embedding-Base-V1 | 
[maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) | [link](../../../python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/) | Lunar Lake |

> [!TIP]
> You could refer to [here](../../../python/llm/example/NPU/HF-Transformers-AutoModels) for full IPEX-LLM examples on Intel NPU.

+### Save & Load Low-Bit Models
+
+IPEX-LLM also provides Python API for saving/loading models with low-bit optimizations on Intel NPU, to avoid repeated loading & optimizing of the original models. Refer to the [Save-Load example](../../../python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load) for detailed usage.
+
## C++ API

IPEX-LLM also provides C++ API for running Hugging Face `transformers` models.
@@ -160,11 +177,17 @@ IPEX-LLM provides several optimization methods for enhancing the accuracy of mod

You could set environment variable `IPEX_LLM_NPU_QUANTIZATION_OPT=1` before loading & optimizing the model with `from_pretrained` function from `ipex_llm.transformers.npu_model` Auto Model class to further enhance model accuracy of low-bit models.

-### 2. Mixed Precision
+### 2. Low-Bit Optimizations
+
+IPEX-LLM on Intel NPU currently supports `sym_int4`/`asym_int4`/`sym_int8` low-bit optimizations. You could adjust the low-bit value to tune the accuracy.
+
+For example, you could try to set `load_in_low_bit='asym_int4'` instead of `load_in_low_bit='sym_int4'` when loading & optimizing the model with `from_pretrained` function from `ipex_llm.transformers.npu_model` Auto Model class, to switch from `sym_int4` low-bit optimizations to `asym_int4`.
+
+### 3. Mixed Precision

When loading & optimizing the model with `from_pretrained` function of `ipex_llm.transformers.npu_model` Auto Model class, you could try to set parameter `mixed_precision=True` to enable mixed precision optimization when encountering output problems.

-### 3. Group Size
+### 4. Group Size

IPEX-LLM low-bit optimizations support both channel-wise and group-wise quantization on Intel NPU. When loading & optimizing the model with `from_pretrained` function of Auto Model class from `ipex_llm.transformers.npu_model`, parameter `quantization_group_size` will control whether to use channel-wise or group-wise quantization.

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/README.md
new file mode 100644
index 00000000000..1edab9433b8
--- /dev/null
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/README.md
@@ -0,0 +1,59 @@
+# Run Embedding Model on Intel NPU
+In this directory, you will find examples on how you could apply IPEX-LLM low-bit optimizations on embedding models on [Intel NPUs](../../../README.md). See the table below for verified models.
+
+## Verified Models
+
+| Model | Model Link |
+|------------|----------------------------------------------------------------|
+| Bce-Embedding-Base-V1 | [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) |
+
+Please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#python-api) for details about verified platforms.
+
+## 0. Prerequisites
+For `ipex-llm` NPU support, please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations.
+
+## 1. 
Install +### 1.1 Installation on Windows +We suggest using conda to manage environment: +```bash +conda create -n llm python=3.11 +conda activate llm + +# install ipex-llm with 'npu' option +pip install --pre --upgrade ipex-llm[npu] + +# [optional] for Bce-Embedding-Base-V1 +pip install BCEmbedding==0.1.5 transformers==4.40.0 +``` +Please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-ipex-llm-with-npu-support) for more details about `ipex-llm` installation on Intel NPU. + +### 1.2 Runtime Configurations +Please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device. + +## 2. Run Optimized Models +The examples below show how to run the **_optimized HuggingFace model implementations_** on Intel NPU, including +- [Bce-Embedding-Base-V1 ](./bce-embedding.py) + +### 2.1 Run Bce-Embedding-Base-V1 +```bash +# to run Bce-Embedding-Base-V1 +python bce-embedding.py --save-directory +``` + +Arguments info: +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (i.e. `maidalun1020/bce-embedding-base_v1`) to be downloaded, or the path to the huggingface checkpoint folder. +- `--prompt PROMPT`: argument defining the sentences to encode. +- `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`. +- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`. +- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded. + +#### Sample Output +##### [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) + +```log +Inference time: xxxx s +[[-0.00674987 -0.01700369 -0.0028928 ... -0.05296675 -0.00352772 + 0.00827096] + [-0.04398304 0.00023038 0.00643183 ... 
-0.02717186 0.00483789 + 0.02298774]] +``` diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/bce-embedding.py similarity index 91% rename from python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py rename to python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/bce-embedding.py index 760a5e5f28b..e36cf7eee1f 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/bce-embedding.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/bce-embedding.py @@ -33,13 +33,12 @@ type=str, default="maidalun1020/bce-embedding-base_v1", help="The huggingface repo id for the bce-embedding model to be downloaded" - ", or the path to the huggingface checkpoint folder", + ", or the path to the huggingface checkpoint folder.", ) parser.add_argument('--prompt', type=str, default="'sentence_0', 'sentence_1'", help='Prompt to infer') parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) - parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--save-directory", type=str, required=True, help="The path of folder to save converted model, " @@ -58,7 +57,6 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, save_directory=args.save_directory ) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md index 9be75d11608..86a81b5e6dd 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md @@ -5,17 +5,17 @@ In this directory, you will find a C++ example on how to run LLM models on Intel | Model | Model Link | |------------|----------------------------------------------------------------| -| Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) | -| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | | Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | -| MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) | | Llama3.2 | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | +| Qwen2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct), [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | +| Qwen2.5 | [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct), [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | +| MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) | -Please 
refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#c-api) for details about verified platforms. +Please refer to [Quickstart](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#c-api) for details about verified platforms. ## 0. Prerequisites -For `ipex-llm` NPU support, please refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations. +For `ipex-llm` NPU support, please refer to [Quickstart](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations. ## 1. Install & Runtime Configurations ### 1.1 Installation on Windows @@ -24,6 +24,9 @@ We suggest using conda to manage environment: conda create -n llm python=3.11 conda activate llm +:: for building the example +pip install cmake + :: install ipex-llm with 'npu' option pip install --pre --upgrade ipex-llm[npu] @@ -31,50 +34,52 @@ pip install --pre --upgrade ipex-llm[npu] pip install transformers==4.45.0 accelerate==0.33.0 ``` -Please refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for more details about `ipex-llm` installation on Intel NPU. +Please refer to [Quickstart](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for more details about `ipex-llm` installation on Intel NPU. ### 1.2 Runtime Configurations -Please refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device. +Please refer to [Quickstart](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device. ## 2. Convert Model We provide a [convert script](convert.py) under current directory, by running it, you can obtain the whole weights and configuration files which are required to run C++ example. 
```cmd -:: to convert Qwen2.5-7B-Instruct -python convert.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory +:: to convert Llama-2-7b-chat-hf +python convert.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory + +:: to convert Meta-Llama-3-8B-Instruct +python convert.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory + +:: to convert Llama-3.2-1B-Instruct +python convert.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct --save-directory + +:: to convert Llama-3.2-3B-Instruct +python convert.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct --save-directory :: to convert Qwen2-1.5B-Instruct python convert.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --save-directory -:: to convert Qwen2.5-3B-Instruct -python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --save-directory --low_bit "sym_int8" +:: to convert Qwen2-7B-Instruct +python convert.py --repo-id-or-model-path Qwen/Qwen2-7B-Instruct --save-directory -:: to convert Llama-2-7b-chat-hf -python convert.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory +:: to convert Qwen2.5-3B-Instruct +python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --save-directory --low-bit "asym_int4" -:: to convert Meta-Llama-3-8B-Instruct -python convert.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory +:: to convert Qwen2.5-7B-Instruct +python convert.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory :: to convert MiniCPM-1B-sft-bf16 python convert.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory :: to convert MiniCPM-2B-sft-bf16 python convert.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory - -:: to convert Llama-3.2-1B-Instruct -python convert.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct --save-directory - -:: to convert Llama-3.2-3B-Instruct -python convert.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct --save-directory ``` Arguments info: -- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `Qwen/Qwen2.5-7B-Instruct`) to be downloaded, or the path to the huggingface checkpoint folder. +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g.`Meta-llama/Llama-2-7b-chat-hf` for Llama2-7B) to be downloaded, or the path to the huggingface checkpoint folder. - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted model will be saved into `SAVE_DIRECTORY`. -- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`. -- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `960`. -- `--low_bit LOW_BIT`: Defines the low bit precision to quantize the model. It is default to be `sym_int4`. -- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache. +- `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`. +- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`. 
+- `--low-bit LOW_BIT`: argument defining the low bit optimizations that will be applied to the model. Currently available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default.

## 3. Build C++ Example `llm-npu-cli`

@@ -110,34 +115,31 @@ Arguments info:
- Last argument is your input prompt.

### 5. Sample Output
-#### [`Qwen/Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)
+#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)

##### Text Completion
```cmd
Input:
-<|im_start|>system
-You are a helpful assistant.<|im_end|>
-<|im_start|>user
-AI是什么?<|im_end|>
-<|im_start|>assistant
+[INST] <<SYS>>
+<</SYS>>

-Prefill 22 tokens cost xxxx ms.
+What is AI? [/INST]

-Decode 63 tokens cost xxxx ms (avg xx.xx ms each token).
-Output:
-AI是"人工智能"的缩写,它是一门研究计算机如何能够完成与人类智能相关任务的学科,包括学习、推理、自我修正等能力。简而言之,人工智能就是让计算机模拟或执行人类智能行为的理论、技术和方法。
+Prefill 26 tokens cost xxxx ms.

-它涵盖了机器学习、深度学习、自然
+Decode 63 tokens cost xxxx ms (avg xxxx ms each token).
+Output:
+ AI stands for Artificial Intelligence, which is the field of study focused on creating and developing intelligent machines that can perform tasks that typically require human intelligence, such as visual and auditory recognition, speech recognition, and decision-making. AI is a broad and diverse field that includes a wide range
```

##### Conversation
```cmd
-User:你好
-Assistant:你好!很高兴能为你提供帮助。有什么问题或需要聊天可以找我哦。
-User:AI是什么?
-Assistant: AI代表的是"Artificial Intelligence",中文翻译为人工智能。它是指由计算机或信息技术实现的智能行为。广义的人工智能可以指任何表现出智能行为的计算机或软件系统。狭义的人工智能则指的是模拟、学习、推理、理解自然语言以及自我生成的人工智能系统。
+User:Hi
+Assistant: Hello! It's nice to meet you. How can I help you today?
+User:What is AI in one sentence?
+Assistant:Sure, here's a one-sentence definition of AI:

-简而言之,人工智能是一种利用计算机和机器来模仿、模拟或扩展人类智能的技术或系统。它包括机器学习、深度学习、自然语言处理等多个子领域。
+Artificial Intelligence (AI) refers to the development and use of computer systems and algorithms that can perform tasks that typically require human intelligence, such as visual and speech recognition, decision-making and problem-solving, and natural language processing.

User:exit
```

thread '<unnamed>' panicked at src\lib.rs:151:91:
called `Result::unwrap()` on an `Err` value: Utf8Error { valid_up_to: 77, error_len: Some(1) }
```
-For detailed instructions on how to do this, see [this issue](https://github.com/intel-analytics/ipex-llm/issues/10989#issuecomment-2105598660).
\ No newline at end of file
+For detailed instructions on how to do this, see [this issue](https://github.com/intel-analytics/ipex-llm/issues/10989#issuecomment-2105598660).
+
+#### Accuracy Tuning
+
+If you encounter output issues when running the CPP examples, you could try the following methods [**when converting the model**](#2-convert-model) to tune the accuracy:
+
+1. Before converting the model, consider setting an additional environment variable `IPEX_LLM_NPU_QUANTIZATION_OPT=1` to enhance output quality.
+
+2. If you are using the default `LOW_BIT` value (i.e. `sym_int4` optimizations), you could try to use `--low-bit "asym_int4"` instead to tune the output quality.
+
+3. You could refer to the [Quickstart](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#accuracy-tuning) for more accuracy tuning strategies.
+
+> [!IMPORTANT]
+> Please note that to make the above methods take effect, you must specify a new folder for `SAVE_DIRECTORY`. Reusing the same `SAVE_DIRECTORY` will load the previously saved low-bit model, making the above accuracy tuning strategies ineffective. 
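+
+For instance, a minimal sketch of re-converting Qwen2.5-3B-Instruct with methods 1 and 2 applied could look like the following (the save directory name below is only a placeholder; it must point to a new, unused folder):
+
+```cmd
+:: set the optional accuracy-tuning environment variable before converting
+set IPEX_LLM_NPU_QUANTIZATION_OPT=1
+
+:: re-convert with asym_int4 low-bit optimizations into a new folder (placeholder name; choose any unused directory)
+python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low-bit "asym_int4" --save-directory Qwen2.5-3B-Instruct-npu-tuned
+```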
\ No newline at end of file diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py index e5e28472728..7a22d567958 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py @@ -36,9 +36,9 @@ parser.add_argument( "--repo-id-or-model-path", type=str, - default="Qwen/Qwen2.5-7B-Instruct", # Or Qwen2-7B-Instruct, Qwen2-1.5B-Instruct - help="The huggingface repo id for the Qwen model to be downloaded" - ", or the path to the huggingface checkpoint folder", + default="meta-llama/Llama-2-7b-chat-hf", + help="The huggingface repo id for the model to be downloaded" + ", or the path to the huggingface checkpoint folder.", ) parser.add_argument("--save-directory", type=str, required=True, @@ -47,11 +47,10 @@ "Else, program will raise error.", ) parser.add_argument("--max-context-len", type=int, default=1024) - parser.add_argument("--max-prompt-len", type=int, default=960) - parser.add_argument("--quantization_group_size", type=int, default=0) - parser.add_argument('--low_bit', type=str, default="sym_int4", - help='Low bit precision to quantize the model') - parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument("--quantization-group-size", type=int, default=0) + parser.add_argument('--low-bit', type=str, default="sym_int4", + help='Low bit optimizations that will be applied to the model.') args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -66,7 +65,6 @@ quantization_group_size=args.quantization_group_size, torch_dtype=torch.float16, attn_implementation="eager", - transpose_value_cache=not args.disable_transpose_value_cache, trust_remote_code=True, save_directory=save_dir) t1 = time.perf_counter() diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md index 147b4877604..694042193be 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md @@ -7,24 +7,17 @@ In this directory, you will find examples on how to directly run HuggingFace `tr |------------|----------------------------------------------------------------| | Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | -| Llama3.2-1B | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | -| Llama3.2-3B | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | -| Chatglm3 | [THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b) | -| Chatglm2 | [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) | +| Llama3.2 | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | | GLM-Edge | [THUDM/glm-edge-1.5b-chat](https://huggingface.co/THUDM/glm-edge-1.5b-chat), [THUDM/glm-edge-4b-chat](https://huggingface.co/THUDM/glm-edge-4b-chat) | -| Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), 
[Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) | -| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | +| Qwen2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct), [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) | +| Qwen2.5 | [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct), [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) | -| Phi-3 | [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | -| Stablelm | [stabilityai/stablelm-zephyr-3b](https://huggingface.co/stabilityai/stablelm-zephyr-3b) | | Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) | -| Deepseek | [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct) | -| Mistral | [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | -Please refer to [Quick Start](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#python-api) for details about verified platforms. +Please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#python-api) for details about verified platforms. ## 0. Prerequisites -For `ipex-llm` NPU support, please refer to [Quick Start](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations. +For `ipex-llm` NPU support, please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations. ## 1. Install & Runtime Configurations ### 1.1 Installation on Windows @@ -43,10 +36,10 @@ pip install transformers==4.45.0 accelerate==0.33.0 pip install transformers==4.47.0 accelerate==0.26.0 ``` -Please refer to [Quick Start](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-ipex-llm-with-npu-support) for more details about `ipex-llm` installation on Intel NPU. +Please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-ipex-llm-with-npu-support) for more details about `ipex-llm` installation on Intel NPU. ### 1.2 Runtime Configurations -Please refer to [Quick Start](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device. +Please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device. ## 2. 
Run Optimized Models The examples below show how to run the **_optimized HuggingFace model implementations_** on Intel NPU, including @@ -54,13 +47,14 @@ The examples below show how to run the **_optimized HuggingFace model implementa - [Llama3-8B](./llama3.py) - [Llama3.2-1B](./llama3.py) - [Llama3.2-3B](./llama3.py) +- [GLM-Edge-1.5B](./glm.py) +- [GLM-Edge-4B](./glm.py) - [Qwen2-1.5B](./qwen.py) +- [Qwen2-7B](./qwen.py) - [Qwen2.5-3B](./qwen.py) - [Qwen2.5-7B](./qwen.py) - [MiniCPM-1B](./minicpm.py) - [MiniCPM-2B](./minicpm.py) -- [GLM-Edge-1.5B-Chat](./glm.py) -- [GLM-Edge-4B-Chat](./glm.py) - [Baichuan2-7B](./baichuan2.py) ### Run @@ -77,11 +71,20 @@ python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct" --s :: to run Llama-3.2-3B-Instruct python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct" --save-directory +:: to run glm-edge-1.5b-chat +python glm.py --repo-id-or-model-path "THUDM/glm-edge-1.5b-chat" --save-directory + +:: to run glm-edge-4b-chat +python glm.py --repo-id-or-model-path "THUDM/glm-edge-4b-chat" --save-directory + :: to run Qwen2-1.5B-Instruct -python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low-bit sym_int8 --save-directory +python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --save-directory + +:: to run Qwen2-7B-Instruct +python qwen.py --repo-id-or-model-path "Qwen/Qwen2-7B-Instruct" --save-directory :: to run Qwen2.5-3B-Instruct -python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low-bit sym_int8 --save-directory +python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low-bit asym_int4 --save-directory :: to run Qwen2.5-7B-Instruct python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-7B-Instruct" --save-directory @@ -92,37 +95,33 @@ python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-1B-sft-bf16" --save-d :: to run MiniCPM-2B-sft-bf16 python minicpm.py --repo-id-or-model-path "openbmb/MiniCPM-2B-sft-bf16" --save-directory -:: to run glm-edge-1.5b-chat -python glm.py --repo-id-or-model-path "THUDM/glm-edge-1.5b-chat" --save-directory - -:: to run glm-edge-4b-chat -python glm.py --repo-id-or-model-path "THUDM/glm-edge-4b-chat" --save-directory - :: to run Baichuan2-7B-Chat python baichuan2.py --repo-id-or-model-path "baichuan-inc/Baichuan2-7B-Chat" --save-directory ``` Arguments info: -- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Llama2 model (i.e. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'meta-llama/Llama-2-7b-chat-hf'`. -- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `What is AI?`. +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g.`Meta-llama/Llama-2-7b-chat-hf` for Llama2-7B) to be downloaded, or the path to the huggingface checkpoint folder. +- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `"What is AI?"` or `"AI是什么?"`. - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. -- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`. -- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`. 
-- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
-- `--disable-streaming`: Disable streaming mode of generation.
+- `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`.
+- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`.
+- `--low-bit LOW_BIT`: argument defining the low bit optimizations that will be applied to the model. Currently available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default.
+- `--disable-streaming`: argument defining whether to disable the streaming mode for generation.
- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded.

### Troubleshooting

-#### `TypeError: can't convert meta device type tensor to numpy.` Error
-If you encounter `TypeError: can't convert meta device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.` error when loading lowbit model, please try re-saving the lowbit model with the example script you are currently using. Please note that lowbit models saved by `qwen.py`, `llama.py`, etc. cannot be loaded by `generate.py`.
+#### Accuracy Tuning
+If you encounter output issues when running the examples, you could try the following methods to tune the accuracy:

-#### Output Problem
-If you encounter output problem, please try to disable the optimization of transposing value cache such as the following command:
-```cmd
-:: to run Llama-2-7b-chat-hf
-python llama2.py --save-directory --disable-transpose-value-cache
-```
+1. Before running the example, consider setting an additional environment variable `IPEX_LLM_NPU_QUANTIZATION_OPT=1` to enhance output quality.
+
+2. If you are using the default `LOW_BIT` value (i.e. `sym_int4` optimizations), you could try to use `--low-bit "asym_int4"` instead to tune the output quality.
+
+3. You could refer to the [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#accuracy-tuning) for more accuracy tuning strategies.
+
+> [!IMPORTANT]
+> Please note that to make the above methods take effect, you must specify a new folder for `SAVE_DIRECTORY`. Reusing the same `SAVE_DIRECTORY` will load the previously saved low-bit model, making the above accuracy tuning strategies ineffective.

#### Better Performance with High CPU Utilization
You could enable optimization by setting the environment variable with `set IPEX_LLM_CPU_LM_HEAD=1` for better performance. But this will cause high CPU utilization.
@@ -146,28 +145,3 @@ What is AI? [/INST]

What is AI? [/INST]  AI (Artificial Intelligence) is a field of computer science and engineering that focuses on the development of intelligent machines that can perform tasks
```
-
-## 3. Run Models
-In the example [generate.py](./generate.py), we show a basic use case for a Llama2 model to predict the next N tokens using `generate()` API, with IPEX-LLM INT4 optimizations on Intel NPUs.
-
-```
-python ./generate.py
-```
-
-Arguments info:
-- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Llama2 model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder. 
It is default to be `'meta-llama/Llama-2-7b-chat-hf'`, and more verified models please see the list in [Verified Models](#verified-models). -- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string. -- `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `'Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun'`. -- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. -- `--low_bit`: argument defining the `low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used. - -### Sample Output -#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) - -```log -Inference time: xxxx s --------------------- Output -------------------- - Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun. But her parents were always telling her to stay at home and be careful. They were worried about her safety, and they didn't want her to --------------------------------------------------------------------------------- -done -``` diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py index 92b29506340..a355e6b476e 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py @@ -35,14 +35,15 @@ type=str, default="baichuan-inc/Baichuan2-7B-Chat", help="The huggingface repo id for the Baichuan2 model to be downloaded" - ", or the path to the huggingface checkpoint folder", + ", or the path to the huggingface checkpoint folder.", ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') - parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) - parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument('--low-bit', type=str, default="sym_int4", + help='Low bit optimizations that will be applied to the model.') parser.add_argument("--disable-streaming", action="store_true", default=False) parser.add_argument("--save-directory", type=str, required=True, @@ -60,11 +61,10 @@ torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager", - load_in_low_bit="sym_int4", + load_in_low_bit=args.low_bit, optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) @@ -77,7 +77,6 @@ optimize_model=True, 
max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, trust_remote_code=True, ) tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py deleted file mode 100644 index 6eaee048af7..00000000000 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/generate.py +++ /dev/null @@ -1,83 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import torch -import time -import argparse -import os - -from ipex_llm.transformers.npu_model import AutoModelForCausalLM -from transformers import AutoTokenizer - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for npu model') - parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf", - help='The huggingface repo id for the Llama2 model to be downloaded' - ', or the path to the huggingface checkpoint folder') - parser.add_argument("--lowbit-path", type=str, - default="", - help='The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. \ - Else, lowbit model will be loaded.') - parser.add_argument('--prompt', type=str, default="Once upon a time, there existed a little girl who liked to have adventures. 
She wanted to go to places and meet new people, and have fun", - help='Prompt to infer') - parser.add_argument('--n-predict', type=int, default=32, - help='Max tokens to predict') - parser.add_argument('--load_in_low_bit', type=str, default="sym_int8", - help='Load in low bit to use') - - args = parser.parse_args() - model_path = args.repo_id_or_model_path - - - if not args.lowbit_path or not os.path.exists(args.lowbit_path): - model = AutoModelForCausalLM.from_pretrained( - model_path, - trust_remote_code=True, - load_in_low_bit=args.load_in_low_bit, - attn_implementation="eager" - ) - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - else: - model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, - trust_remote_code=True, - bigdl_transformers_low_bit=args.load_in_low_bit, - attn_implementation="eager" - ) - tokenizer = AutoTokenizer.from_pretrained(args.lowbit_path, trust_remote_code=True) - - print(model) - - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - tokenizer.save_pretrained(args.lowbit_path) - - with torch.inference_mode(): - input_ids = tokenizer.encode(args.prompt, return_tensors="pt") - print("finish to load") - print('input length:', len(input_ids[0])) - st = time.time() - output = model.generate(input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict) - end = time.time() - print(f'Inference time: {end-st} s') - output_str = tokenizer.decode(output[0], skip_special_tokens=False) - print('-'*20, 'Output', '-'*20) - print(output_str) - - print('-'*80) - print('done') diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/glm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/glm.py index 637f612a600..919e5fff3b5 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/glm.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/glm.py @@ -34,18 +34,17 @@ "--repo-id-or-model-path", type=str, default="THUDM/glm-edge-1.5b-chat", - help="The huggingface repo id for the glm-edge model to be downloaded" - ", or the path to the huggingface checkpoint folder", + help="The huggingface repo id for the GLM-Edge model to be downloaded" + ", or the path to the huggingface checkpoint folder.", ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') - parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument('--low-bit', type=str, default="sym_int4", - help='Load in low bit to use') + help='Low bit optimizations that will be applied to the model.') parser.add_argument("--disable-streaming", action="store_true", default=False) - parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--save-directory", type=str, required=True, help="The path of folder to save converted model, " @@ -66,7 +65,6 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) @@ -79,7 +77,6 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - 
transpose_value_cache=not args.disable_transpose_value_cache, ) tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py index a2e0881b190..c3af421eb14 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py @@ -48,14 +48,15 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], type=str, default="meta-llama/Llama-2-7b-chat-hf", help="The huggingface repo id for the Llama2 model to be downloaded" - ", or the path to the huggingface checkpoint folder", + ", or the path to the huggingface checkpoint folder.", ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') - parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) - parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument('--low-bit', type=str, default="sym_int4", + help='Low bit optimizations that will be applied to the model.') parser.add_argument("--disable-streaming", action="store_true", default=False) parser.add_argument("--save-directory", type=str, required=True, @@ -73,11 +74,10 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager", - load_in_low_bit="sym_int4", + load_in_low_bit=args.low_bit, optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) @@ -89,8 +89,7 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], torch_dtype=torch.float16, optimize_model=True, max_context_len=args.max_context_len, - max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, + max_prompt_len=args.max_prompt_len ) tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py index 9bc570411d5..e0906352464 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py @@ -49,14 +49,15 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], type=str, default="meta-llama/Meta-Llama-3-8B-Instruct", help="The huggingface repo id for the Llama3 model to be downloaded" - ", or the path to the huggingface checkpoint folder", + ", or the path to the huggingface checkpoint folder.", ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') - parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) - 
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument('--low-bit', type=str, default="sym_int4", + help='Low bit optimizations that will be applied to the model.') parser.add_argument("--disable-streaming", action="store_true", default=False) parser.add_argument("--save-directory", type=str, required=True, @@ -74,11 +75,10 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager", - load_in_low_bit="sym_int4", + load_in_low_bit=args.low_bit, optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) @@ -90,8 +90,7 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]], torch_dtype=torch.float16, optimize_model=True, max_context_len=args.max_context_len, - max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, + max_prompt_len=args.max_prompt_len ) tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py index eb911ca0b6e..e91c547bdb8 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py @@ -34,15 +34,16 @@ "--repo-id-or-model-path", type=str, default="openbmb/MiniCPM-1B-sft-bf16", - help="The huggingface repo id for the Llama2 model to be downloaded" - ", or the path to the huggingface checkpoint folder", + help="The huggingface repo id for the MiniCPM model to be downloaded" + ", or the path to the huggingface checkpoint folder.", ) parser.add_argument('--prompt', type=str, default="What is AI?", help='Prompt to infer') - parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) - parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument('--low-bit', type=str, default="sym_int4", + help='Low bit optimizations that will be applied to the model.') parser.add_argument("--disable-streaming", action="store_true", default=False) parser.add_argument("--save-directory", type=str, required=True, @@ -59,11 +60,10 @@ torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager", - load_in_low_bit="sym_int4", + load_in_low_bit=args.low_bit, optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) @@ -76,8 +76,7 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, - trust_remote_code=True, + trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py 
b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py index d38509afd78..585256c72fd 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py @@ -35,18 +35,17 @@ type=str, default="Qwen/Qwen2.5-7B-Instruct", help="The huggingface repo id for the Qwen2 or Qwen2.5 model to be downloaded" - ", or the path to the huggingface checkpoint folder", + ", or the path to the huggingface checkpoint folder.", ) parser.add_argument('--prompt', type=str, default="AI是什么?", help='Prompt to infer') - parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.") parser.add_argument("--max-context-len", type=int, default=1024) - parser.add_argument("--max-prompt-len", type=int, default=960) - parser.add_argument("--quantization_group_size", type=int, default=0) + parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument("--quantization-group-size", type=int, default=0) parser.add_argument('--low-bit', type=str, default="sym_int4", - help='Load in low bit to use') + help='Low bit optimizations that will be applied to the model.') parser.add_argument("--disable-streaming", action="store_true", default=False) - parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) parser.add_argument("--save-directory", type=str, required=True, help="The path of folder to save converted model, " @@ -67,7 +66,6 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, quantization_group_size=args.quantization_group_size, save_directory=args.save_directory ) @@ -80,8 +78,7 @@ torch_dtype=torch.float16, optimize_model=True, max_context_len=args.max_context_len, - max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, + max_prompt_len=args.max_prompt_len ) tokenizer = AutoTokenizer.from_pretrained(args.save_directory, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md index 32acff27606..cc5b668a317 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md @@ -5,22 +5,20 @@ In this directory, you will find examples on how you could apply IPEX-LLM INT4 o | Model | Model Link | |------------|----------------------------------------------------------------| -| Phi-3-Vision | [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | | MiniCPM-Llama3-V-2_5 | [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) | | MiniCPM-V-2_6 | [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) | -| Bce-Embedding-Base-V1 | [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) | | Speech_Paraformer-Large | [iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) | -Please refer to [Quick Start](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#python-api) for details about verified platforms. 
+Please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#python-api) for details about verified platforms. ## 0. Prerequisites -For `ipex-llm` NPU support, please refer to [Quick Start](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations. +For `ipex-llm` NPU support, please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations. ## 1. Install ### 1.1 Installation on Windows We suggest using conda to manage environment: ```bash -conda create -n llm python=3.10 libuv +conda create -n llm python=3.11 conda activate llm # install ipex-llm with 'npu' option @@ -30,24 +28,20 @@ pip install torchvision # [optional] for MiniCPM-V-2_6 pip install timm torch==2.1.2 torchvision==0.16.2 -# [optional] for Bce-Embedding-Base-V1 -pip install BCEmbedding==0.1.5 transformers==4.40.0 - # [optional] for Speech_Paraformer-Large pip install funasr==1.1.14 pip install modelscope==1.20.1 torch==2.1.2 torchaudio==2.1.2 ``` -Please refer to [Quick Start](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-ipex-llm-with-npu-support) for more details about `ipex-llm` installation on Intel NPU. +Please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-ipex-llm-with-npu-support) for more details about `ipex-llm` installation on Intel NPU. ### 1.2 Runtime Configurations -Please refer to [Quick Start](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device. +Please refer to [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device. -## 2. Run Optimized Models (Experimental) +## 2. Run Optimized Models The examples below show how to run the **_optimized HuggingFace & FunASR model implementations_** on Intel NPU, including - [MiniCPM-Llama3-V-2_5](./minicpm-llama3-v2.5.py) - [MiniCPM-V-2_6](./minicpm_v_2_6.py) - [Speech_Paraformer-Large](./speech_paraformer-large.py) -- [Bce-Embedding-Base-V1 ](./bce-embedding.py) ### 2.1 Run MiniCPM-Llama3-V-2_5 & MiniCPM-V-2_6 ```bash @@ -59,20 +53,35 @@ python minicpm_v_2_6.py --save-directory ``` Arguments info: -- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (i.e. `openbmb/MiniCPM-Llama3-V-2_5`) to be downloaded, or the path to the huggingface checkpoint folder. +- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `openbmb/MiniCPM-Llama3-V-2_5` for MiniCPM-Llama3-V-2_5) to be downloaded, or the path to the huggingface checkpoint folder. - `image-url-or-path IMAGE_URL_OR_PATH`: argument defining the image to be infered. It is default to be 'http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg'. -- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `What is in the image?`. +- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `"What is in this image?"`. - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. -- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`. 
-- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`.
-- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
+- `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`.
+- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`.
+- `--low-bit` LOW_BIT: argument defining the low bit optimizations that will be applied to the model. Current available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default.
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded.
+#### Troubleshooting
+
+##### Accuracy Tuning
+If you encounter output issues when running the examples, you could try the following methods to tune the accuracy:
+
+1. Before running the example, consider setting an additional environment variable `IPEX_LLM_NPU_QUANTIZATION_OPT=1` to enhance output quality.
+
+2. If you are using the default `LOW_BIT` value (i.e. `sym_int4` optimizations), you could try to use `--low-bit "asym_int4"` instead to tune the output quality.
+
+3. You could refer to the [Quickstart](../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#accuracy-tuning) for more accuracy tuning strategies.
+
+> [!IMPORTANT]
+> Please note that to make the above methods take effect, you must specify a new folder for `SAVE_DIRECTORY`. Reusing the same `SAVE_DIRECTORY` will load the previously saved low-bit model, thus making the above accuracy tuning strategies ineffective.
+
+
 #### Sample Output
 ##### [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)
 ```log
-Inference time: xx.xx s
+Inference time: xxxx s
 -------------------- Input --------------------
 http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg
 -------------------- Prompt --------------------
@@ -81,6 +90,10 @@ What is in this image?
 The image features a young child holding and showing off a white teddy bear wearing a pink dress. The background includes some red flowers and a stone wall, suggesting an outdoor setting.
 ```
 
+The sample input image (which is fetched from the [COCO dataset](https://cocodataset.org/#explore?id=264959)) is:
+
+
+
 ### 2.2 Run Speech_Paraformer-Large
 ```bash
 # to run Speech_Paraformer-Large
 python speech_paraformer-large.py --save-directory 
 ```
 
 Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the asr repo id for the model (i.e. `iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch`) to be downloaded, or the path to the asr checkpoint folder.
-- `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
+- `--low-bit` LOW_BIT: argument defining the low bit optimizations that will be applied to the model. It is default to be `sym_int8`; `sym_int4` can also be used.
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded.
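+
+For instance, a run could pass an explicit low-bit format together with a fresh save folder (the folder name here is just a placeholder); any non-existing `SAVE_DIRECTORY` triggers a new model conversion:
+
+```bash
+# illustrative run: apply the default sym_int8 optimizations and save the converted model to a new folder
+python speech_paraformer-large.py --low-bit sym_int8 --save-directory paraformer_npu_sym_int8
+```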
#### Sample Output @@ -104,59 +117,3 @@ rtf_avg: 0.090: 100%|███████████████████ rtf_avg: 0.232: 100%|███████████████████████████████████| 1/1 [00:01<00:00, 1.29s/it] [{'key': 'asr_example_zh', 'text': '欢 迎 大 家 来 体 验 达 摩 院 推 出 的 语 音 识 别 模 型'}] ``` - -### 2.3 Run Bce-Embedding-Base-V1 -```bash -# to run Bce-Embedding-Base-V1 -python bce-embedding.py --save-directory -``` - -Arguments info: -- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the asr repo id for the model (i.e. `maidalun1020/bce-embedding-base_v1`) to be downloaded, or the path to the asr checkpoint folder. -- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded. - -#### Sample Output -##### [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) | - -```log -Inference time: xxx s -[[-0.00674987 -0.01700369 -0.0028928 ... -0.05296675 -0.00352772 - 0.00827096] - [-0.04398304 0.00023038 0.00643183 ... -0.02717186 0.00483789 - 0.02298774]] -``` - -### 3. Running examples - -``` -python ./generate.py -``` - -Arguments info: -- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the Phi-3-vision model (e.g. `microsoft/Phi-3-vision-128k-instruct`) to be downloaded, or the path to the huggingface checkpoint folder. It is default to be `'microsoft/Phi-3-vision-128k-instruct'`, and more verified models please see the list in [Verified Models](#verified-models). -- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string. -- `--image-url-or-path IMAGE_URL_OR_PATH`: argument defining the image to be infered. It is default to be `'http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg'`. -- `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `'What is in the image?'`. -- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. -- `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used. - - -#### Sample Output -##### [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) - -```log -Inference time: xxxx s --------------------- Prompt -------------------- -Message: [{'role': 'user', 'content': '<|image_1|>\nWhat is in the image?'}] -Image link/path: http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg --------------------- Output -------------------- - - -What is in the image? - The image shows a young girl holding a white teddy bear. She is wearing a pink dress with a heart on it. 
The background includes a stone -``` - -The sample input image is (which is fetched from [COCO dataset](https://cocodataset.org/#explore?id=264959)): - - - diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/generate.py deleted file mode 100644 index 74a8091aa7a..00000000000 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/generate.py +++ /dev/null @@ -1,113 +0,0 @@ -# -# Copyright 2016 The BigDL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import time -import torch -import argparse -import requests - -from PIL import Image -from ipex_llm.transformers.npu_model import AutoModelForCausalLM -from transformers import AutoProcessor - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for phi-3 model') - parser.add_argument('--repo-id-or-model-path', type=str, default="microsoft/Phi-3-vision-128k-instruct", - help='The huggingface repo id for the phi-3-vision model to be downloaded' - ', or the path to the huggingface checkpoint folder') - parser.add_argument("--lowbit-path", type=str, - default="", - help='The path to the lowbit model folder, leave blank if you do not want to save. \ - If path not exists, lowbit model will be saved there. 
\ - Else, lowbit model will be loaded.') - parser.add_argument('--image-url-or-path', type=str, - default="http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg", - help='The URL or path to the image to infer') - parser.add_argument('--prompt', type=str, default="What is in the image?", - help='Prompt to infer') - parser.add_argument('--n-predict', type=int, default=32, - help='Max tokens to predict') - parser.add_argument('--load_in_low_bit', type=str, default="sym_int4", - help='Load in low bit to use') - - - args = parser.parse_args() - model_path = args.repo_id_or_model_path - image_path = args.image_url_or_path - - # Load model in SYM_INT4, - # which convert the relevant layers in the model into SYM_INT4 format - # You could also try `'sym_int8'` for INT8 - # `_attn_implementation="eager"` is required for phi-3-vision - # `modules_to_not_convert=["vision_embed_tokens"]` and `model = model.half()` are for acceleration and are optional - - if not args.lowbit_path or not os.path.exists(args.lowbit_path): - model = AutoModelForCausalLM.from_pretrained( - model_path, - trust_remote_code=True, - load_in_low_bit=args.load_in_low_bit, - _attn_implementation="eager", - modules_to_not_convert=["vision_embed_tokens"] - ) - else: - model = AutoModelForCausalLM.load_low_bit( - args.lowbit_path, - trust_remote_code=True, - bigdl_transformers_low_bit=args.load_in_low_bit, - attn_implementation="eager", - modules_to_not_convert=["vision_embed_tokens"] - ) - - if args.lowbit_path and not os.path.exists(args.lowbit_path): - model.save_low_bit(args.lowbit_path) - - # Load processor - processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - - # here the message formatting refers to https://huggingface.co/microsoft/Phi-3-vision-128k-instruct#sample-inference-code - messages = [ - {"role": "user", "content": "<|image_1|>\n{prompt}".format(prompt=args.prompt)}, - ] - prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - - if os.path.exists(image_path): - image = Image.open(image_path) - else: - image = Image.open(requests.get(image_path, stream=True).raw) - - # Generate predicted tokens - with torch.inference_mode(): - # start inference - st = time.time() - - inputs = processor(prompt, [image], return_tensors="pt") - output = model.generate(**inputs, - eos_token_id=processor.tokenizer.eos_token_id, - num_beams=1, - do_sample=False, - max_new_tokens=args.n_predict, - temperature=0.0) - end = time.time() - print(f'Inference time: {end-st} s') - output_str = processor.decode(output[0], - skip_special_tokens=True, - clean_up_tokenization_spaces=False) - print('-'*20, 'Prompt', '-'*20) - print(f'Message: {messages}') - print(f'Image link/path: {image_path}') - print('-'*20, 'Output', '-'*20) - print(output_str) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py index e4cdef6120a..22ddb786fa4 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py @@ -37,17 +37,18 @@ type=str, default="openbmb/MiniCPM-Llama3-V-2_5", help="The huggingface repo id for the MiniCPM-Llama3-V-2_5 model to be downloaded" - ", or the path to the huggingface checkpoint folder", + ", or the path to the huggingface checkpoint folder.", ) parser.add_argument('--image-url-or-path', type=str, 
default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg', help='The URL or path to the image to infer') - parser.add_argument('--prompt', type=str, default="What is in the image?", + parser.add_argument('--prompt', type=str, default="What is in this image?", help='Prompt to infer') - parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) - parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument('--low-bit', type=str, default="sym_int4", + help='Low bit optimizations that will be applied to the model.') parser.add_argument("--save-directory", type=str, required=True, help="The path of folder to save converted model, " @@ -63,11 +64,10 @@ torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager", - load_in_low_bit="sym_int4", + load_in_low_bit=args.low_bit, optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - transpose_value_cache=not args.disable_transpose_value_cache, save_directory=args.save_directory ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py index ec6b5361aa2..195d82b9fa7 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py @@ -26,19 +26,20 @@ if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for openbmb/MiniCPM-V-2_6 model') + parser = argparse.ArgumentParser(description='Predict Tokens using `chat()` API for npu model') parser.add_argument('--repo-id-or-model-path', type=str, default="openbmb/MiniCPM-V-2_6", - help='The huggingface repo id for the openbmb/MiniCPM-V-2_6 model to be downloaded' - ', or the path to the huggingface checkpoint folder') + help='The huggingface repo id for the MiniCPM-V-2_6 model to be downloaded' + ', or the path to the huggingface checkpoint folder.') parser.add_argument('--image-url-or-path', type=str, default='http://farm6.staticflickr.com/5268/5602445367_3504763978_z.jpg', help='The URL or path to the image to infer') parser.add_argument('--prompt', type=str, default="What is in this image?", help='Prompt to infer') - parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.") parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) - parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument('--low-bit', type=str, default="sym_int4", + help='Low bit optimizations that will be applied to the model.') parser.add_argument("--save-directory", type=str, required=True, help="The path of folder to save converted model, " @@ -54,11 +55,10 @@ torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager", - load_in_low_bit="sym_int4", + load_in_low_bit=args.low_bit, optimize_model=True, max_context_len=args.max_context_len, 
        max_prompt_len=args.max_prompt_len,
-        transpose_value_cache=not args.disable_transpose_value_cache,
         save_directory=args.save_directory
     )
     tokenizer = AutoTokenizer.from_pretrained(model_path,
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py
index 0bf03d411cd..8c6cbd092b0 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/speech_paraformer-large.py
@@ -33,8 +33,8 @@
         type=str,
         default="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
     )
-    parser.add_argument('--load_in_low_bit', type=str, default="sym_int8",
-                        help='Load in low bit to use')
+    parser.add_argument('--low-bit', type=str, default="sym_int8",
+                        help='Low bit optimizations that will be applied to the model.')
     parser.add_argument("--save-directory", type=str,
                         required=True,
                         help="The path of folder to save converted model, "
@@ -48,7 +48,7 @@
     model = AutoModel(
         model=model_path,
         attn_implementation="eager",
-        load_in_low_bit=args.load_in_low_bit,
+        load_in_low_bit=args.low_bit,
         low_cpu_mem_usage=True,
         optimize_model=True,
         save_directory=args.save_directory
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/README.md
index cc5faf779fe..d61644b39cb 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/README.md
@@ -2,29 +2,27 @@
 This folder contains examples of running IPEX-LLM on Intel NPU:
 
-- [LLM](LLM): examples of running large language models using IPEX-LLM optimizations
-- [Multimodal](Multimodal): examples of running large multimodal models using IPEX-LLM optimizations
+- [LLM](./LLM): examples of running large language models using IPEX-LLM optimizations
+  - [CPP](./LLM/CPP_Examples/): examples of running large language models using IPEX-LLM optimizations through C++ API
+- [Multimodal](./Multimodal): examples of running large multimodal models using IPEX-LLM optimizations
+- [Embedding](./Embedding): examples of running embedding models using IPEX-LLM optimizations
+- [Save-Load](./Save-Load): examples of saving and loading low-bit models with IPEX-LLM optimizations
+
+> [!TIP]
+> Please refer to [IPEX-LLM NPU Quickstart](../../../../../docs/mddocs/Quickstart/npu_quickstart.md) for more information about running `ipex-llm` on Intel NPU.
## Verified Models on Intel NPU
-| Model | Model Link |
+| Model | Example Link |
 |------------|----------------------------------------------------------------|
-| Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |
-| Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
-| Llama3.2-1B | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) |
-| Llama3.2-3B | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) |
-| Chatglm3 | [THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b) |
-| Chatglm2 | [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) |
-| GLM-Edge | [THUDM/glm-edge-1.5b-chat](https://huggingface.co/THUDM/glm-edge-1.5b-chat), [THUDM/glm-edge-4b-chat](https://huggingface.co/THUDM/glm-edge-4b-chat) |
-| Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
-| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |
-| MiniCPM | [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
-| Phi-3 | [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |
-| Stablelm | [stabilityai/stablelm-zephyr-3b](https://huggingface.co/stabilityai/stablelm-zephyr-3b) |
-| Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) |
-| Deepseek | [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct) |
-| Mistral | [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) |
-| Phi-3-Vision | [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) |
-| MiniCPM-Llama3-V-2_5 | [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) |
-| MiniCPM-V-2_6 | [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) |
-| Bce-Embedding-Base-V1 | [maidalun1020/bce-embedding-base_v1](https://huggingface.co/maidalun1020/bce-embedding-base_v1) |
-| Speech_Paraformer-Large | [iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) |
+| Llama2 | [Python link](./LLM), [C++ link](./LLM/CPP_Examples/) |
+| Llama3 | [Python link](./LLM), [C++ link](./LLM/CPP_Examples/) |
+| Llama3.2 | [Python link](./LLM), [C++ link](./LLM/CPP_Examples/) |
+| GLM-Edge | [Python link](./LLM) |
+| Qwen2 | [Python link](./LLM), [C++ link](./LLM/CPP_Examples/) |
+| Qwen2.5 | [Python link](./LLM), [C++ link](./LLM/CPP_Examples/) |
+| MiniCPM | [Python link](./LLM), [C++ link](./LLM/CPP_Examples/) |
+| Baichuan2 | [Python link](./LLM) |
+| MiniCPM-Llama3-V-2_5 | [Python link](./Multimodal/) |
+| MiniCPM-V-2_6 | [Python link](./Multimodal/) |
+| Speech_Paraformer-Large | [Python link](./Multimodal/) |
+| Bce-Embedding-Base-V1 | [Python link](./Embedding/) |
\ No newline at end of file
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/README.md
similarity index 69%
rename from python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md
rename to 
python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/README.md index 35102d5d27d..511af9e07c5 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/README.md @@ -6,7 +6,7 @@ In this directory, you will find example on how you could save/load models with In the example [generate.py](./generate.py), we show a basic use case of saving/loading model in low-bit optimizations to predict the next N tokens using `generate()` API. ## 0. Prerequisites -For `ipex-llm` NPU support, please refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations. +For `ipex-llm` NPU support, please refer to [Quickstart](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations. ## 1. Install & Runtime Configurations ### 1.1 Installation on Windows @@ -21,10 +21,10 @@ pip install --pre --upgrade ipex-llm[npu] :: [optional] for Llama-3.2-1B-Instruct & Llama-3.2-3B-Instruct pip install transformers==4.45.0 accelerate==0.33.0 ``` -Please refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for more details about `ipex-llm` installation on Intel NPU. +Please refer to [Quickstart](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for more details about `ipex-llm` installation on Intel NPU. ### 1.2 Runtime Configurations -Please refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device. +Please refer to [Quickstart](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for environment variables setting based on your device. ## 3. Running examples @@ -45,8 +45,9 @@ In the example, several arguments can be passed to satisfy your requirements: - `--load-directory`: argument defining the path to load low-bit model. - `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It is default to be `'What is AI?'`. - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`. -- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`. -- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`. +- `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`. +- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`. +- `--low-bit` LOW_BIT: argument defining the low bit optimizations that will be applied to the model. Current available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default. 
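+
+For instance, assuming a low-bit model has already been saved by a previous run, a follow-up run could skip conversion and load it directly (the folder name here is just a placeholder):
+
+```bash
+# illustrative run: load the previously saved low-bit model and generate 32 new tokens
+python ./generate.py --load-directory ./llama2-npu-low-bit --prompt "What is AI?" --n-predict 32
+```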
### Sample Output #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/generate.py similarity index 95% rename from python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/generate.py rename to python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/generate.py index 4af29e946c9..5cbeed612e0 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/generate.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/generate.py @@ -45,6 +45,8 @@ help='Max tokens to predict') parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument('--low-bit', type=str, default="sym_int4", + help='Low bit optimizations that will be applied to the model.') args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -58,7 +60,7 @@ torch_dtype=torch.float16, trust_remote_code=True, attn_implementation="eager", - load_in_low_bit="sym_int4", + load_in_low_bit=args.low_bit, optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len,