From 534f8d4a91efefc2a15acb6c4eb5f83a33aa7c6c Mon Sep 17 00:00:00 2001
From: plusbang
Date: Wed, 4 Dec 2024 17:47:27 +0800
Subject: [PATCH 1/2] add

---
 .../LLM/Save-Load/README.md   |  73 ++++++++++++
 .../LLM/Save-Load/generate.py | 104 ++++++++++++++++++
 2 files changed, 177 insertions(+)
 create mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md
 create mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/generate.py

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md
new file mode 100644
index 00000000000..fbd9657c943
--- /dev/null
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md
@@ -0,0 +1,73 @@
# Save/Load Low-Bit Models with IPEX-LLM Optimizations

In this directory, you will find an example of how to save/load models with IPEX-LLM optimizations on Intel NPU.

## 0. Requirements
To run this example with IPEX-LLM, we have some recommended requirements for your machine; please refer to [here](../README.md#0-requirements) for more information.

## Example: Save/Load Optimized Models
In the example [generate.py](./generate.py), we show a basic use case of saving/loading a model with low-bit optimizations and then predicting the next N tokens with the `generate()` API. The save and load operations are platform-independent, so you can run them on different platforms.

## 1. Install
### 1.1 Installation on Windows
We suggest using conda to manage the environment:
```cmd
conda create -n llm python=3.10
conda activate llm

:: install ipex-llm with 'npu' option
pip install --pre --upgrade ipex-llm[npu]

:: [optional] for Llama-3.2-1B-Instruct & Llama-3.2-3B-Instruct
pip install transformers==4.45.0 accelerate==0.33.0
```

## 2. Runtime Configurations
**The following environment variables are required**:

```cmd
set BIGDL_USE_NPU=1
```

## 3. Running examples

If you want to save the optimized model, run:
```
python ./generate.py --repo-id-or-model-path "meta-llama/Llama-2-7b-chat-hf" --save-directory path/to/save/model
```

If you want to load the optimized low-bit model, run:
```
python ./generate.py --load-directory path/to/load/model
```

In the example, several arguments can be passed to satisfy your requirements (a condensed sketch of the underlying API calls follows this list):

- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the Hugging Face repo id of the Llama2 model to be downloaded, or the path to the Hugging Face checkpoint folder. It defaults to `'meta-llama/Llama-2-7b-chat-hf'`.
- `--save-directory`: argument defining the path to save the low-bit model; the saved low-bit model can then be loaded directly.
- `--load-directory`: argument defining the path to load the low-bit model from.
- `--prompt PROMPT`: argument defining the prompt to be inferred (with the integrated prompt format for chat). It defaults to `'What is AI?'`.
- `--n-predict N_PREDICT`: argument defining the maximum number of tokens to predict. It defaults to `32`.
- `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It defaults to `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It defaults to `512`.
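
For reference, the save and load flow in [generate.py](./generate.py) condenses to roughly the following sketch. The `path/to/save/model` directory is a placeholder, the keyword arguments mirror the defaults listed above, and the runtime configuration from section 2 is assumed to be in place:

```python
import torch
from ipex_llm.transformers.npu_model import AutoModelForCausalLM
from transformers import AutoTokenizer

# First run: load the original checkpoint, convert it to sym_int4 low-bit
# format and save the converted model together with its tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation="eager",
    load_in_low_bit="sym_int4",
    optimize_model=True,
    max_context_len=1024,
    max_prompt_len=512,
    save_directory="path/to/save/model",   # placeholder path
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", trust_remote_code=True)
tokenizer.save_pretrained("path/to/save/model")

# Later runs: load the saved low-bit model directly, without the original checkpoint.
model = AutoModelForCausalLM.load_low_bit(
    "path/to/save/model",
    attn_implementation="eager",
    torch_dtype=torch.float16,
    optimize_model=True,
    max_context_len=1024,
    max_prompt_len=512,
)
tokenizer = AutoTokenizer.from_pretrained("path/to/save/model", trust_remote_code=True)
```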

### Sample Output
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
```log
Inference time: xxxx s
-------------------- Input --------------------
 [INST] <<SYS>>

<</SYS>>

What is AI? [/INST]

-------------------- Output --------------------
 [INST] <<SYS>>

<</SYS>>

What is AI? [/INST]

Artificial Intelligence (AI) is a field of computer science and technology that focuses on the development of intelligent machines that can perform tasks that
```

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/generate.py
new file mode 100644
index 00000000000..4af29e946c9
--- /dev/null
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/generate.py
@@ -0,0 +1,104 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import torch
import time
import argparse
from ipex_llm.transformers.npu_model import AutoModelForCausalLM
from transformers import AutoTokenizer
from ipex_llm.utils.common.log4Error import invalidInputError


# You could tune the prompt format based on your own model.
LLAMA2_PROMPT_FORMAT = """ [INST] <<SYS>>

<</SYS>>

{prompt} [/INST]
"""

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Example of saving and loading the optimized model')
    parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                        help='The huggingface repo id for the Llama2 model (e.g. `meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded'
                             ', or the path to the huggingface checkpoint folder')
    parser.add_argument('--save-directory', type=str, default=None,
                        help='The path to save the low-bit model.')
    parser.add_argument('--load-directory', type=str, default=None,
                        help='The path to load the low-bit model.')
    parser.add_argument('--prompt', type=str, default="What is AI?",
                        help='Prompt to infer')
    parser.add_argument('--n-predict', type=int, default=32,
                        help='Max tokens to predict')
    parser.add_argument("--max-context-len", type=int, default=1024)
    parser.add_argument("--max-prompt-len", type=int, default=512)

    args = parser.parse_args()
    model_path = args.repo_id_or_model_path
    save_directory = args.save_directory
    load_directory = args.load_directory

    if save_directory:
        # First run: load the original model, apply the low-bit optimization and save it
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            attn_implementation="eager",
            load_in_low_bit="sym_int4",
            optimize_model=True,
            max_context_len=args.max_context_len,
            max_prompt_len=args.max_prompt_len,
            save_directory=save_directory
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        tokenizer.save_pretrained(save_directory)
        print(f"Finished loading model from {model_path} and saving it to {save_directory}")
    elif load_directory:
        # Subsequent runs: load the saved low-bit model directly
        model = AutoModelForCausalLM.load_low_bit(
            load_directory,
            attn_implementation="eager",
            torch_dtype=torch.float16,
            optimize_model=True,
            max_context_len=args.max_context_len,
            max_prompt_len=args.max_prompt_len
        )
        tokenizer = AutoTokenizer.from_pretrained(load_directory, trust_remote_code=True)
        print(f"Finished loading model from {load_directory}")
    else:
        invalidInputError(False,
                          "Both `--save-directory` and `--load-directory` are None, please provide one of them.")

    # Generate predicted tokens
    with torch.inference_mode():
        for i in range(3):
            prompt = LLAMA2_PROMPT_FORMAT.format(prompt=args.prompt)
            _input_ids = tokenizer.encode(prompt, return_tensors="pt")

            st = time.time()
            output = model.generate(
                _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict
            )
            end = time.time()

            print(f"Inference time: {end-st} s")
            input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False)
            print("-" * 20, "Input", "-" * 20)
            print(input_str)
            output_str = tokenizer.decode(output[0], skip_special_tokens=False)
            print("-" * 20, "Output", "-" * 20)
            print(output_str)

From 85f569dcf6f808cfd738f4c69c0911f9b25c86c5 Mon Sep 17 00:00:00 2001
From: plusbang
Date: Fri, 3 Jan 2025 15:19:13 +0800
Subject: [PATCH 2/2] update links

---
 .../LLM/Save-Load/README.md | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md
index fbd9657c943..35102d5d27d 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Save-Load/README.md
@@ -2,17 +2,17 @@
 
 In this directory, you will find an example of how to save/load models with IPEX-LLM optimizations on Intel NPU.
 
-## 0. Requirements
-To run this example with IPEX-LLM, we have some recommended requirements for your machine; please refer to [here](../README.md#0-requirements) for more information.
-
 ## Example: Save/Load Optimized Models
-In the example [generate.py](./generate.py), we show a basic use case of saving/loading a model with low-bit optimizations and then predicting the next N tokens with the `generate()` API. The save and load operations are platform-independent, so you can run them on different platforms.
+In the example [generate.py](./generate.py), we show a basic use case of saving/loading a model with low-bit optimizations and then predicting the next N tokens with the `generate()` API.
+
+## 0. Prerequisites
+For `ipex-llm` NPU support, please refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for details about the required preparations.
 
-## 1. Install
+## 1. Install & Runtime Configurations
 ### 1.1 Installation on Windows
 We suggest using conda to manage the environment:
 ```cmd
-conda create -n llm python=3.10
+conda create -n llm python=3.11
 conda activate llm
 
 :: install ipex-llm with 'npu' option
 pip install --pre --upgrade ipex-llm[npu]
 
 :: [optional] for Llama-3.2-1B-Instruct & Llama-3.2-3B-Instruct
 pip install transformers==4.45.0 accelerate==0.33.0
 ```
+Please refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#install-prerequisites) for more details about `ipex-llm` installation on Intel NPU.
 
-## 2. Runtime Configurations
-**The following environment variables are required**:
-
-```cmd
-set BIGDL_USE_NPU=1
-```
+### 1.2 Runtime Configurations
+Please refer to [Quick Start](../../../../../../../docs/mddocs/Quickstart/npu_quickstart.md#runtime-configurations) for the environment variable settings required for your device.
 
 ## 3. Running examples