Support qwen2.5 3B for NPU & update related examples #12438

Merged · 5 commits · Nov 25, 2024
@@ -6,7 +6,7 @@ In this directory, you will find a C++ example on how to run LLM models on Intel NPU
| Model | Model Link |
|------------|----------------------------------------------------------------|
| Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) |
| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
| Llama2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |
| Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
| MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |
@@ -35,16 +35,34 @@ pip install transformers==4.45.0 accelerate==0.33.0
We provide a [convert script](convert.py) in the current directory; running it produces the model weights and configuration files required to run the C++ example.

```cmd
:: to convert Qwen2.5-7b-Instruct
:: to convert Qwen2.5-7B-Instruct
python convert.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory <converted_model_path>

:: to convert Qwen2-1.5B-Instruct
python convert.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --save-directory <converted_model_path>

:: to convert Qwen2.5-3B-Instruct
python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --save-directory <converted_model_path> --low_bit "sym_int8"

:: to convert Llama-2-7b-chat-hf
python convert.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory <converted_model_path>

:: to convert Meta-Llama-3-8B-Instruct
python convert.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory <converted_model_path>

:: to convert MiniCPM-1B-sft-bf16
python convert.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory <converted_model_path>

:: to convert MiniCPM-2B-sft-bf16
python convert.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory <converted_model_path>
```

Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `Qwen/Qwen2.5-7B-Instruct`) to be downloaded, or the path to the huggingface checkpoint folder.
- `--save-directory SAVE_DIRECTORY`: argument defining the path to save the converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted model will be saved into `SAVE_DIRECTORY`.
- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It defaults to `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It defaults to `960`.
- `--low_bit LOW_BIT`: Defines the low-bit precision used to quantize the model. It defaults to `sym_int4`.
- `--disable-transpose-value-cache`: Disable the value-cache transpose optimization.
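
For reference, the arguments above can be combined into a single conversion command. This is a sketch only: `<converted_model_path>` is a placeholder as in the commands above, and all values other than `--low_bit` are simply the documented defaults.

```cmd
:: illustrative only: convert Qwen2.5-3B-Instruct with every documented flag spelled out
python convert.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --save-directory <converted_model_path> --max-context-len 1024 --max-prompt-len 960 --low_bit "sym_int8"
```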

## 3. Build C++ Example `llm-npu-cli`
@@ -43,8 +43,8 @@
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
help='Load in low bit to use')
parser.add_argument('--low_bit', type=str, default="sym_int4",
help='Low bit precision to quantize the model')
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
@@ -54,7 +54,7 @@
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
load_in_low_bit=args.load_in_low_bit,
load_in_low_bit=args.low_bit,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
@@ -10,7 +10,7 @@ In this directory, you will find examples on how to directly run HuggingFace `transformers` models on Intel NPU
| Llama3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) |
| Llama3.2 | [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct), [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) |
| Qwen2 | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |
| Qwen2.5 | [Qwen/Qwen2.5-7b-Instruct](https://huggingface.co/Qwen/Qwen2.5-7b-Instruct) |
| Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) |
| Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) |
| MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) |

@@ -58,11 +58,14 @@ python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-1B-Instruct"
:: to run Llama-3.2-3B-Instruct
python llama3.py --repo-id-or-model-path "meta-llama/Llama-3.2-3B-Instruct"

:: to run Qwen2.5-7b-Instruct
:: to run Qwen2.5-7B-Instruct
python qwen.py

:: to run Qwen2-1.5b-Instruct
python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --load_in_low_bit "sym_int8"
:: to run Qwen2-1.5B-Instruct
python qwen.py --repo-id-or-model-path "Qwen/Qwen2-1.5B-Instruct" --low_bit "sym_int8"

:: to run Qwen2.5-3B-Instruct
python qwen.py --repo-id-or-model-path "Qwen/Qwen2.5-3B-Instruct" --low_bit "sym_int8"

:: to run Baichuan2-7B-Chat
python baichuan2.py
@@ -48,8 +48,8 @@
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument('--load_in_low_bit', type=str, default="sym_int4",
help='Load in low bit to use')
parser.add_argument('--low_bit', type=str, default="sym_int4",
help='Low bit precision to quantize the model')
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--disable-streaming", action="store_true", default=False)

@@ -60,7 +60,7 @@
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
load_in_low_bit=args.load_in_low_bit,
load_in_low_bit=args.low_bit,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
@@ -70,7 +70,7 @@ Arguments info:
- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It defaults to `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be inferred. It defaults to `'Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun'`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
- `--load_in_low_bit`: argument defining the `load_in_low_bit` format used. It is default to be `sym_int8`, `sym_int4` can also be used.
- `--low_bit`: argument defining the `low_bit` format to use. It defaults to `sym_int8`; `sym_int4` can also be used.
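
For reference, the flags above can be combined as follows. This is a sketch only: the example script's file name is not shown in this hunk and is left as a placeholder, and a `--repo-id-or-model-path` flag is assumed here in line with the other examples.

```cmd
:: illustrative only: <example_script>.py and <lowbit_model_path> are placeholders
python <example_script>.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --lowbit-path <lowbit_model_path> --low_bit "sym_int8" --n-predict 32
```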

### Sample Output
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
@@ -90,6 +90,7 @@ The examples below show how to run the **_optimized HuggingFace model implementations_**
- [Llama3.2-1B](./llama.py)
- [Llama3.2-3B](./llama.py)
- [Qwen2-1.5B](./qwen.py)
- [Qwen2.5-3B](./qwen.py)
- [Qwen2.5-7B](./qwen.py)
- [MiniCPM-1B](./minicpm.py)
- [MiniCPM-2B](./minicpm.py)
@@ -122,6 +123,9 @@ python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct
:: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715)
python qwen.py

:: to run Qwen2.5-3B-Instruct (LNL driver version: 32.0.101.2715)
python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8

:: to run Qwen2.5-7B-Instruct (LNL driver version: 32.0.101.2715)
python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct

python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py (7 additions, 3 deletions)
@@ -47,7 +47,10 @@
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument('--low_bit', type=str, default="sym_int4",
help='Low bit precision to quantize the model')
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
parser.add_argument("--intra-pp", type=int, default=None)
parser.add_argument("--inter-pp", type=int, default=None)
@@ -62,14 +65,15 @@
torch_dtype=torch.float16,
trust_remote_code=True,
attn_implementation="eager",
load_in_low_bit="sym_int4",
load_in_low_bit=args.low_bit,
optimize_model=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
intra_pp=args.intra_pp,
inter_pp=args.inter_pp,
transpose_value_cache=not args.disable_transpose_value_cache,
mixed_precision=args.mixed_precision
mixed_precision=args.mixed_precision,
quantization_group_size=args.quantization_group_size,
)
else:
model = AutoModelForCausalLM.load_low_bit(
python/llm/src/ipex_llm/transformers/npu_models/convert.py (1 addition, 1 deletion)
@@ -64,7 +64,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
iqtype = ggml_tensor_qtype[qtype]
if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
if qtype == "sym_int4_rtn":
# workaround for qwen2 & int4
# workaround for qwen2-7B & int4
if (layer.in_features == 3584 and layer.out_features == 152064) or \
(layer.in_features == 18944 and layer.out_features == 3584):
qtype = "sym_int8_rtn"
python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py (2 additions, 2 deletions)
@@ -428,8 +428,8 @@ def optimize_llm(
intra_pp=intra_pp,
decoder=True,
transpose_value_cache=transpose_value_cache)
elif model.config.model_type == "qwen2" and model.config.num_hidden_layers == 28:
# for qwen2-1.5B and qwen2-7B
elif model.config.model_type == "qwen2":
# for qwen2-1.5B, qwen2-7B, qwen2.5-3B
if intra_pp is None:
intra_pp = 2
if inter_pp is None: