From 5e444cde2094891fb9e84b567cf551a13986266c Mon Sep 17 00:00:00 2001 From: plusbang Date: Fri, 29 Nov 2024 11:53:57 +0800 Subject: [PATCH 1/3] update --- .../llm/dev/benchmark/all-in-one/config.yaml | 4 +- python/llm/dev/benchmark/all-in-one/run.py | 6 +- .../HF-Transformers-AutoModels/LLM/README.md | 81 +++-------- .../LLM/{llama.py => llama2.py} | 11 +- .../HF-Transformers-AutoModels/LLM/llama3.py | 133 ++++++++++++++++++ .../HF-Transformers-AutoModels/LLM/minicpm.py | 11 +- .../HF-Transformers-AutoModels/LLM/qwen.py | 7 +- .../src/ipex_llm/transformers/npu_model.py | 2 +- 8 files changed, 176 insertions(+), 79 deletions(-) rename python/llm/example/NPU/HF-Transformers-AutoModels/LLM/{llama.py => llama2.py} (93%) create mode 100644 python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py diff --git a/python/llm/dev/benchmark/all-in-one/config.yaml b/python/llm/dev/benchmark/all-in-one/config.yaml index c371bb657aa..aeadf061e12 100644 --- a/python/llm/dev/benchmark/all-in-one/config.yaml +++ b/python/llm/dev/benchmark/all-in-one/config.yaml @@ -35,12 +35,12 @@ test_api: # - "bigdl_ipex_int8" # on Intel CPU, (qtype=int8) # - "speculative_cpu" # on Intel CPU, inference with self-speculative decoding # - "deepspeed_transformer_int4_cpu" # on Intel CPU, deepspeed autotp inference - # - "transformers_int4_npu_win" # on Intel NPU for Windows, transformer-like API, (qtype=int4) + - "transformers_int4_npu_win" # on Intel NPU for Windows, transformer-like API, (qtype=int4) # - "transformers_int4_loadlowbit_npu_win" # on Intel NPU for Windows, transformer-like API, (qtype=int4), use load_low_bit API. Please make sure you have used the save_npu.py to save the converted low bit model # - "transformers_int4_npu_pipeline_win" # on Intel NPU for Windows, transformer-like API, (qtype=int4) cpu_embedding: False # whether put embedding to CPU streaming: False # whether output in streaming way (only available now for gpu win related test_api) -optimize_model: False # whether apply further optimization on NPU (only available now for transformers_int4_npu_win test_api) +optimize_model: True # whether apply further optimization on NPU (only available now for transformers_int4_npu_win test_api) use_fp16_torch_dtype: True # whether use fp16 for non-linear layer (only available now for "pipeline_parallel_gpu" test_api) task: 'continuation' # task can be 'continuation', 'QA' and 'summarize' transpose_value_cache: True # whether apply transposed v_cache optimization on NPU (only available now for transformers_int4_npu_win test_api) diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index 1cd84599963..01a59a173e4 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -626,6 +626,7 @@ def transformers_int4_npu_win(repo_id, model_path = get_model_path(repo_id, local_model_hub) in_out_len = in_out_pairs[0].split("-") max_context_len = max(int(in_out_len[0]) + int(in_out_len[1]), 1024) + save_directory = "./save_converted_model_dir" # Load model in 4 bit, # which convert the relevant layers in the model into INT4 format st = time.perf_counter() @@ -640,13 +641,14 @@ def transformers_int4_npu_win(repo_id, model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16, optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]), quantization_group_size=npu_group_size, 
transpose_value_cache=transpose_value_cache, - use_cache=True, attn_implementation="eager").eval() + save_directory=save_directory, use_cache=True, attn_implementation="eager").eval() tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) end = time.perf_counter() load_time = end - st print(">> loading of model costs {}s".format(load_time)) - model = BenchmarkWrapper(model) + if not hasattr(model, "model_ptr"): + model = BenchmarkWrapper(model) result = {} with torch.inference_mode(): diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md index 16af842457f..b757c5c24d8 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md @@ -85,10 +85,10 @@ done ## 4. Run Optimized Models (Experimental) The examples below show how to run the **_optimized HuggingFace model implementations_** on Intel NPU, including -- [Llama2-7B](./llama.py) -- [Llama3-8B](./llama.py) -- [Llama3.2-1B](./llama.py) -- [Llama3.2-3B](./llama.py) +- [Llama2-7B](./llama2.py) +- [Llama3-8B](./llama3.py) +- [Llama3.2-1B](./llama3.py) +- [Llama3.2-3B](./llama3.py) - [Qwen2-1.5B](./qwen.py) - [Qwen2.5-3B](./qwen.py) - [Qwen2.5-7B](./qwen.py) @@ -96,44 +96,34 @@ The examples below show how to run the **_optimized HuggingFace model implementa - [MiniCPM-2B](./minicpm.py) - [Baichuan2-7B](./baichuan2.py) -### Recommended NPU Driver Version for MTL Users -#### 32.0.100.2540 -Supported models: Llama2-7B, Llama3-8B, Qwen2-1.5B, MiniCPM-1B, MiniCPM-2B, Baichuan2-7B - -### Recommended NPU Driver Version for LNL Users -#### 32.0.100.2625 -Supported models: Llama2-7B, MiniCPM-1B, Baichuan2-7B -#### 32.0.101.2715 -Supported models: Llama3-8B, MiniCPM-2B, Qwen2-1.5B, Qwen2.5-7B - ### Run ```cmd :: to run Llama-2-7b-chat-hf -python llama.py +python llama2.py --save-directory -:: to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715) -python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct +:: to run Meta-Llama-3-8B-Instruct +python llama3.py --save-directory :: to run Llama-3.2-1B-Instruct -python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct +python llama3.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct --save-directory :: to run Llama-3.2-3B-Instruct -python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct +python llama3.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct --save-directory -:: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715) -python qwen.py +:: to run Qwen2-1.5B-Instruct +python qwen.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --low_bit sym_int8 --save-directory -:: to run Qwen2.5-3B-Instruct (LNL driver version: 32.0.101.2715) -python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8 +:: to run Qwen2.5-3B-Instruct +python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8 --save-directory -:: to run Qwen2.5-7B-Instruct (LNL driver version: 32.0.101.2715) -python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct +:: to run Qwen2.5-7B-Instruct +python qwen.py --save-directory :: to run MiniCPM-1B-sft-bf16 -python minicpm.py +python minicpm.py --save-directory -:: to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715) -python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 +:: to run MiniCPM-2B-sft-bf16 +python minicpm.py 
--repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory :: to run Baichuan2-7B-Chat python baichuan2.py @@ -147,6 +137,7 @@ Arguments info: - `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It is default to be `1024`. - `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It is default to be `512`. - `--disable-transpose-value-cache`: Disable the optimization of transposing value cache. +- `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. ### Troubleshooting @@ -154,40 +145,10 @@ Arguments info: If you encounter `TypeError: can't convert meta device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.` error when loading lowbit model, please try re-saving the lowbit model with the example script you are currently using. Please note that lowbit models saved by `qwen.py`, `llama.py`, etc. cannot be loaded by `generate.py`. #### Output Problem -If you encounter output problem, please try to disable the optimization of transposing value cache with following command: +If you encounter output problem, please try to disable the optimization of transposing value cache such as the following command: ```cmd :: to run Llama-2-7b-chat-hf -python llama.py --disable-transpose-value-cache - -:: to run Meta-Llama-3-8B-Instruct (LNL driver version: 32.0.101.2715) -python llama.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --disable-transpose-value-cache - -:: to run Llama-3.2-1B-Instruct -python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct --disable-transpose-value-cache - -:: to run Llama-3.2-3B-Instruct -python llama.py --repo-id-or-model-path meta-llama/Llama-3.2-3B-Instruct --disable-transpose-value-cache - -:: to run Qwen2-1.5B-Instruct (LNL driver version: 32.0.101.2715) -python qwen.py --disable-transpose-value-cache - -:: to run Qwen2.5-7B-Instruct LNL driver version: 32.0.101.2715) -python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --disable-transpose-value-cache - -:: to run MiniCPM-1B-sft-bf16 -python minicpm.py --disable-transpose-value-cache - -:: to run MiniCPM-2B-sft-bf16 (LNL driver version: 32.0.101.2715) -python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --disable-transpose-value-cache - -:: to run Baichuan2-7B-Chat -python baichuan2.py --disable-transpose-value-cache -``` - -For [Qwen2.5-7B](./qwen.py), you could also try to enable mixed precision optimization when encountering output problems: - -```cmd -python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --mixed-precision +python llama2.py --save-directory --disable-transpose-value-cache ``` #### Better Performance with High CPU Utilization diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py similarity index 93% rename from python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py rename to python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py index 97aed851a91..2c3dd02ea9d 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py @@ -62,8 +62,12 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) 
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, program will raise error.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -78,9 +82,8 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]], optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) else: model = AutoModelForCausalLM.load_low_bit( diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py new file mode 100644 index 00000000000..50090f3d5dd --- /dev/null +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py @@ -0,0 +1,133 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import torch +import time +import argparse + +from ipex_llm.transformers.npu_model import AutoModelForCausalLM +from transformers import AutoTokenizer + +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +def get_prompt(user_input: str, chat_history: list[tuple[str, str]], + system_prompt: str) -> str: + prompt_texts = [f'<|begin_of_text|>'] + + if system_prompt != '': + prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>') + + for history_input, history_response in chat_history: + prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>') + prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>') + + prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n') + return ''.join(prompt_texts) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Predict Tokens using `generate()` API for npu model" + ) + parser.add_argument( + "--repo-id-or-model-path", + type=str, + default="meta-llama/Meta-Llama-3-8B-Instruct", + help="The huggingface repo id for the Llama3 model to be downloaded" + ", or the path to the huggingface checkpoint folder", + ) + parser.add_argument("--lowbit-path", type=str, + default="", + help="The path to the lowbit model folder, leave blank if you do not want to save. \ + If path not exists, lowbit model will be saved there. 
\ + Else, lowbit model will be loaded.", + ) + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict") + parser.add_argument("--max-context-len", type=int, default=1024) + parser.add_argument("--max-prompt-len", type=int, default=512) + parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. " + "Else, program will raise error.", + ) + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + if not args.lowbit_path or not os.path.exists(args.lowbit_path): + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.float16, + trust_remote_code=True, + attn_implementation="eager", + load_in_low_bit="sym_int4", + optimize_model=True, + max_context_len=args.max_context_len, + max_prompt_len=args.max_prompt_len, + transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory + ) + else: + model = AutoModelForCausalLM.load_low_bit( + args.lowbit_path, + attn_implementation="eager", + torch_dtype=torch.float16, + optimize_model=True, + max_context_len=args.max_context_len, + max_prompt_len=args.max_prompt_len, + intra_pp=args.intra_pp, + inter_pp=args.inter_pp, + transpose_value_cache=not args.disable_transpose_value_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + if args.lowbit_path and not os.path.exists(args.lowbit_path): + model.save_low_bit(args.lowbit_path) + + DEFAULT_SYSTEM_PROMPT = """\ + """ + + print("-" * 80) + print("done") + with torch.inference_mode(): + print("finish to load") + for i in range(5): + prompt = get_prompt(args.prompt, [], system_prompt=DEFAULT_SYSTEM_PROMPT) + _input_ids = tokenizer.encode(prompt, return_tensors="pt") + print("input length:", len(_input_ids[0])) + st = time.time() + output = model.generate( + _input_ids, num_beams=1, do_sample=False, max_new_tokens=args.n_predict + ) + end = time.time() + print(f"Inference time: {end-st} s") + input_str = tokenizer.decode(_input_ids[0], skip_special_tokens=False) + print("-" * 20, "Input", "-" * 20) + print(input_str) + output_str = tokenizer.decode(output[0], skip_special_tokens=False) + print("-" * 20, "Output", "-" * 20) + print(output_str) + + print("-" * 80) + print("done") + print("success shut down") diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py index 628ff29f915..0626202a7e5 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py @@ -49,8 +49,12 @@ parser.add_argument("--max-context-len", type=int, default=1024) parser.add_argument("--max-prompt-len", type=int, default=512) parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=2) - parser.add_argument("--inter-pp", type=int, default=2) + parser.add_argument("--save-directory", type=str, + required=True, + help="The path of folder to save converted model, " + "If path not exists, lowbit model will be saved there. 
" + "Else, program will raise error.", + ) args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -64,9 +68,8 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, + save_directory=args.save_directory ) else: model = AutoModelForCausalLM.load_low_bit( diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py index 57a2aa2b03f..4623bb9c419 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py @@ -52,9 +52,6 @@ parser.add_argument('--low_bit', type=str, default="sym_int4", help='Load in low bit to use') parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False) - parser.add_argument("--intra-pp", type=int, default=None) - parser.add_argument("--inter-pp", type=int, default=None) - parser.add_argument("--mixed-precision", action='store_false') parser.add_argument("--save-directory", type=str, required=True, help="The path of folder to save converted model, " @@ -75,10 +72,8 @@ optimize_model=True, max_context_len=args.max_context_len, max_prompt_len=args.max_prompt_len, - intra_pp=args.intra_pp, - inter_pp=args.inter_pp, transpose_value_cache=not args.disable_transpose_value_cache, - mixed_precision=args.mixed_precision, + mixed_precision=True, quantization_group_size=args.quantization_group_size, save_directory=args.save_directory ) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index aff61122684..5cc15129125 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -266,7 +266,7 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - if model.config.model_type in ["qwen2"]: + if model.config.model_type in ["qwen2", "llama", "minicpm"]: from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process optimize_llm_single_process( llm, From daf02a6c64482a0fff74ba63cc4d360f07a425dc Mon Sep 17 00:00:00 2001 From: plusbang Date: Fri, 29 Nov 2024 11:57:45 +0800 Subject: [PATCH 2/3] update --- python/llm/dev/benchmark/all-in-one/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/dev/benchmark/all-in-one/config.yaml b/python/llm/dev/benchmark/all-in-one/config.yaml index aeadf061e12..b5210dbf021 100644 --- a/python/llm/dev/benchmark/all-in-one/config.yaml +++ b/python/llm/dev/benchmark/all-in-one/config.yaml @@ -35,7 +35,7 @@ test_api: # - "bigdl_ipex_int8" # on Intel CPU, (qtype=int8) # - "speculative_cpu" # on Intel CPU, inference with self-speculative decoding # - "deepspeed_transformer_int4_cpu" # on Intel CPU, deepspeed autotp inference - - "transformers_int4_npu_win" # on Intel NPU for Windows, transformer-like API, (qtype=int4) + # - "transformers_int4_npu_win" # on Intel NPU for Windows, transformer-like API, (qtype=int4) # - "transformers_int4_loadlowbit_npu_win" # on Intel NPU for Windows, transformer-like API, (qtype=int4), use load_low_bit API. 
Please make sure you have used the save_npu.py to save the converted low bit model # - "transformers_int4_npu_pipeline_win" # on Intel NPU for Windows, transformer-like API, (qtype=int4) cpu_embedding: False # whether put embedding to CPU From 5a932a722bd31ac92fcd302f29f3503d1532e931 Mon Sep 17 00:00:00 2001 From: plusbang Date: Fri, 29 Nov 2024 13:33:12 +0800 Subject: [PATCH 3/3] add path para --- .../NPU/HF-Transformers-AutoModels/LLM/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md index b757c5c24d8..56ff3963f1b 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md @@ -13,7 +13,7 @@ In this directory, you will find examples on how to directly run HuggingFace `tr | Chatglm2 | [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b) | | Qwen2 | [Qwen/Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct), [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) | | Qwen2.5 | [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | -| MiniCPM | [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) | +| MiniCPM | [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-1B-sft-bf16), [openbmb/MiniCPM-2B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-2B-sft-bf16) | | Phi-3 | [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | | Stablelm | [stabilityai/stablelm-zephyr-3b](https://huggingface.co/stabilityai/stablelm-zephyr-3b) | | Baichuan2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan-7B-Chat) | @@ -99,10 +99,10 @@ The examples below show how to run the **_optimized HuggingFace model implementa ### Run ```cmd :: to run Llama-2-7b-chat-hf -python llama2.py --save-directory +python llama2.py --repo-id-or-model-path meta-llama/Llama-2-7b-chat-hf --save-directory :: to run Meta-Llama-3-8B-Instruct -python llama3.py --save-directory +python llama3.py --repo-id-or-model-path meta-llama/Meta-Llama-3-8B-Instruct --save-directory :: to run Llama-3.2-1B-Instruct python llama3.py --repo-id-or-model-path meta-llama/Llama-3.2-1B-Instruct --save-directory @@ -117,10 +117,10 @@ python qwen.py --repo-id-or-model-path Qwen/Qwen2-1.5B-Instruct --low_bit sym_in python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-3B-Instruct --low_bit sym_int8 --save-directory :: to run Qwen2.5-7B-Instruct -python qwen.py --save-directory +python qwen.py --repo-id-or-model-path Qwen/Qwen2.5-7B-Instruct --save-directory :: to run MiniCPM-1B-sft-bf16 -python minicpm.py --save-directory +python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-1B-sft-bf16 --save-directory :: to run MiniCPM-2B-sft-bf16 python minicpm.py --repo-id-or-model-path openbmb/MiniCPM-2B-sft-bf16 --save-directory
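
The series above makes `--save-directory` a required argument of the optimized NPU example scripts, drops the `--intra-pp`/`--inter-pp` knobs, and routes `llama` and `minicpm` (in addition to `qwen2`) through `optimize_llm_single_process`. As a reference for how the changed API fits together, below is a minimal sketch of the convert-and-run flow, mirroring the arguments used in `llama2.py`/`llama3.py` from this series. The model id and the local `save_directory` path are placeholders, not values mandated by the patch, and the error handling of the real examples is omitted.

```python
# Minimal sketch of the NPU example flow after this series (model id and paths are assumptions).
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Meta-Llama-3-8B-Instruct"  # HF repo id or local checkpoint folder
save_directory = "./llama3-npu-sym_int4"            # must not exist yet; converted model is saved here

# Convert the model to sym_int4 for the Intel NPU and save the converted model to `save_directory`.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation="eager",
    load_in_low_bit="sym_int4",
    optimize_model=True,
    max_context_len=1024,           # example default for --max-context-len
    max_prompt_len=512,             # example default for --max-prompt-len
    transpose_value_cache=True,     # pass False to mimic --disable-transpose-value-cache
    save_directory=save_directory,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Generate a short completion, as the example scripts do.
input_ids = tokenizer.encode("What is AI?", return_tensors="pt")
with torch.inference_mode():
    output = model.generate(input_ids, num_beams=1, do_sample=False, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=False))
```

The example scripts in this series also keep an `AutoModelForCausalLM.load_low_bit(...)` branch so that a previously saved low-bit model can be reloaded instead of being converted again on every run.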