diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md
index aa5e076c20d..ecfeb04703d 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md
@@ -6,7 +6,7 @@ In this directory, you will find a C++ example on how to run LLM models on Intel
 | Model | Model Link |
 |------------|----------------------------------------------------------------|
 | Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
-| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |
+| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
 | Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |
 | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
 | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
@@ -35,9 +35,26 @@ pip install transformers==4.45.0 accelerate==0.33.0
 We provide a [convert script](convert.py) under current directory, by running it, you can obtain the whole weights and configuration files which are required to run C++ example.
 
 ```cmd
-:: to convert Qwen2.5-7b-Instruct
+:: to convert Qwen2.5-7B-Instruct
 python convert.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory <converted_model_path>
 
+:: to convert Qwen2-1.5B-Instruct
+python convert.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --save-directory <converted_model_path>
+
+:: to convert Qwen2.5-3B-Instruct
+python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --save-directory <converted_model_path> --low_bit "sym_int8"
+
+:: to convert Llama-2-7b-chat-hf
+python convert.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory <converted_model_path>
+
+:: to convert Meta-Llama-3-8B-Instruct
+python convert.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory <converted_model_path>
+
+:: to convert MiniCPM-1B-sft-bf16
+python convert.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory <converted_model_path>
+
+:: to convert MiniCPM-2B-sft-bf16
+python convert.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory <converted_model_path>
 ```
 
 Arguments info:
@@ -45,6 +62,7 @@ Arguments info:
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted model will be saved into `SAVE_DIRECTORY`.
 - `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`.
 - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `960`.
+- `--low_bit LOW_BIT`: Defines the low bit precision to quantize the model. It is default to be `sym_int4`.
 - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
 
 ## 3. Build C++ Example `llm-npu-cli`
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
index c88aff51bdc..c4781f0a419 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
@@ -43,8 +43,8 @@
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=960)
     parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
-                        help='Load in low bit to use')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Low bit precision to quantize the model')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
 
     args = parser.parse_args()
@@ -54,7 +54,7 @@
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
-                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 load_in_low_bit=args.low_bit,
                                                  max_context_len=args.max_context_len,
                                                  max_prompt_len=args.max_prompt_len,
                                                  quantization_group_size=args.quantization_group_size,
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
index 2e230e9628a..462199dbeee 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/README.md
@@ -10,7 +10,7 @@ In this directory, you will find examples on how to directly run HuggingFace `tr
 | Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
 | Llama3.2 | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) |
 | Qwen2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
-| Qwen2.5 | [Qwen/Qwen2.5-7b-Instruct](https://huggingface.co/Qwen/Qwen2.5-7b-Instruct) |
+| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
 | Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan-7B-Chat) |
 | MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
 
@@ -58,11 +58,14 @@ python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
 :: to run Llama-3.2-3B-Instruct
 python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"
 
-:: to run Qwen2.5-7b-Instruct
+:: to run Qwen2.5-7B-Instruct
 python qwen.py
 
-:: to run Qwen2-1.5b-Instruct
-python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --load_in_low_bit "sym_int8"
+:: to run Qwen2-1.5B-Instruct
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"
+
+:: to run Qwen2.5-3B-Instruct
+python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"
 
 :: to run Baichuan2-7B-Chat
 python baichuan2.py
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
index e25b6390099..d46ee771c0e 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/Pipeline-Models/qwen.py
@@ -48,8 +48,8 @@
     parser.add_argument("--max-context-len", type=int, default=1024)
     parser.add_argument("--max-prompt-len", type=int, default=960)
     parser.add_argument("--quantization_group_size", type=int, default=0)
-    parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
-                        help='Load in low bit to use')
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Low bit precision to quantize the model')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--disable-streaming", action="store_true", default=False)
 
@@ -60,7 +60,7 @@
     model = AutoModelForCausalLM.from_pretrained(model_path,
                                                  optimize_model=True,
                                                  pipeline=True,
-                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 load_in_low_bit=args.low_bit,
                                                  max_context_len=args.max_context_len,
                                                  max_prompt_len=args.max_prompt_len,
                                                  quantization_group_size=args.quantization_group_size,
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index 54d37ae5870..16af842457f 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -70,7 +70,7 @@ Arguments info:
 - `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It is default to be `''`, i.e. an empty string.
 - `--prompt PROMPT`: argument defining the prompt to be infered. It is default to be `'Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
-- `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
+- `--low_bit`: argument defining the `low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
 
 ### Sample Output
 #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
@@ -90,6 +90,7 @@ The examples below show how to run the **_optimized HuggingFace model implementa
 - [Llama3.2-1B](./llama.py)
 - [Llama3.2-3B](./llama.py)
 - [Qwen2-1.5B](./qwen.py)
+- [Qwen2.5-3B](./qwen.py)
 - [Qwen2.5-7B](./qwen.py)
 - [MiniCPM-1B](./minicpm.py)
 - [MiniCPM-2B](./minicpm.py)
@@ -122,6 +123,9 @@ python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct
 :: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
 python qwen.py
 
+:: to run Qwen2.5-3B-Instruct (LNL driver version: 32.0.101.2715)
+python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8
+
 :: to run Qwen2.5-7B-Instruct (LNL driver version: 32.0.101.2715)
 python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct
 
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
index ce69a52adee..2256be57e5a 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
@@ -47,7 +47,10 @@
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=512)
+    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--quantization_group_size", type=int, default=0)
+    parser.add_argument('--low_bit', type=str, default="sym_int4",
+                        help='Low bit precision to quantize the model')
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=None)
     parser.add_argument("--inter-pp", type=int, default=None)
@@ -62,14 +65,15 @@
                                                      torch_dtype=torch.float16,
                                                      trust_remote_code=True,
                                                      attn_implementation="eager",
-                                                     load_in_low_bit="sym_int4",
+                                                     load_in_low_bit=args.low_bit,
                                                      optimize_model=True,
                                                      max_context_len=args.max_context_len,
                                                      max_prompt_len=args.max_prompt_len,
                                                      intra_pp=args.intra_pp,
                                                      inter_pp=args.inter_pp,
                                                      transpose_value_cache=not args.disable_transpose_value_cache,
-                                                     mixed_precision=args.mixed_precision
+                                                     mixed_precision=args.mixed_precision,
+                                                     quantization_group_size=args.quantization_group_size,
                                                      )
     else:
         model = AutoModelForCausalLM.load_low_bit(
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
index 7473bfc8516..3c79038e75e 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -64,7 +64,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
     iqtype = ggml_tensor_qtype[qtype]
     if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
         if qtype == "sym_int4_rtn":
-            # workaround for qwen2 & int4
+            # workaround for qwen2-7B & int4
             if (layer.in_features == 3584 and layer.out_features == 152064) or \
                     (layer.in_features == 18944 and layer.out_features == 3584):
                 qtype = "sym_int8_rtn"
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index 382f51effa2..f38c79d8835 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -428,8 +428,8 @@ def optimize_llm(
                       intra_pp=intra_pp,
                       decoder=True,
                       transpose_value_cache=transpose_value_cache)
-    elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28:
-        # for qwen2-1.5B and qwen2-7B
+    elif model.config.model_type == "qwen2":
+        # for qwen2-1.5B, qwen2-7B, qwen2.5-3B
         if intra_pp is None:
             intra_pp = 2
         if inter_pp is None:
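
A note on the `replace_with_QuantizedLinear` hunk above: the two shape pairs it checks, `(3584, 152064)` and `(18944, 3584)`, correspond to Qwen2-7B's `lm_head` (hidden_size -> vocab_size) and `mlp.down_proj` (intermediate_size -> hidden_size), which is why the comment is narrowed from "qwen2" to "qwen2-7B". Below is a minimal, self-contained sketch of that shape-based fallback, assuming Qwen2-7B's config values (hidden_size 3584, intermediate_size 18944, vocab_size 152064); it is an illustration, not the library's actual helper.

```python
import torch.nn as nn

# Assumed Qwen2-7B dimensions (hidden_size=3584, intermediate_size=18944,
# vocab_size=152064): the two largest projections stay in sym_int8 even when
# the rest of the model is converted with sym_int4.
_QWEN2_7B_INT8_SHAPES = {
    (3584, 152064),   # lm_head: hidden_size -> vocab_size
    (18944, 3584),    # mlp.down_proj: intermediate_size -> hidden_size
}


def pick_npu_qtype(layer: nn.Linear, requested: str = "sym_int4_rtn") -> str:
    """Hypothetical helper mirroring the shape-based fallback in the hunk above."""
    if requested == "sym_int4_rtn" and \
            (layer.in_features, layer.out_features) in _QWEN2_7B_INT8_SHAPES:
        return "sym_int8_rtn"
    return requested


if __name__ == "__main__":
    print(pick_npu_qtype(nn.Linear(3584, 152064, bias=False)))  # sym_int8_rtn
    print(pick_npu_qtype(nn.Linear(3584, 18944, bias=False)))   # sym_int4_rtn
```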
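
One more detail worth calling out: the rename only touches the example CLIs. The flag users pass is now `--low_bit`, while the keyword the examples forward to `AutoModelForCausalLM.from_pretrained` is still `load_in_low_bit`, as the hunks above show. A minimal stand-alone sketch of that wiring (the parser defaults mirror the examples; the wrapper function is hypothetical):

```python
import argparse


def parse_args(argv=None):
    # Mirrors the renamed flag in the example scripts: --low_bit replaces
    # the old --load_in_low_bit, with the same default of sym_int4.
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo-id-or-model-path", type=str,
                        default="Qwen/Qwen2.5-7B-Instruct")
    parser.add_argument("--low_bit", type=str, default="sym_int4",
                        help="Low bit precision to quantize the model")
    return parser.parse_args(argv)


def build_load_kwargs(args):
    # Only the CLI flag was renamed; the library-side keyword is unchanged.
    return {"load_in_low_bit": args.low_bit}


if __name__ == "__main__":
    print(build_load_kwargs(parse_args(["--low_bit", "sym_int8"])))
    # -> {'load_in_low_bit': 'sym_int8'}
```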