[NPU pipeline] Support save & load and update examples #12293

Merged · 3 commits · Oct 30, 2024
---
Example README
@@ -51,9 +51,12 @@ python baichuan2.py

Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path for saving/loading the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` is loaded. If it is an existing path, the lowbit model found there is loaded. If it is a non-existing path, the original pretrained model is loaded and the converted lowbit version is saved into `LOWBIT_MODEL_PATH` (see the usage sketch after this list). Defaults to `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be inferred. Defaults to `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. Defaults to `32`.
- `--max-context-len MAX_CONTEXT_LEN` (renamed from `--max-output-len`): defines the maximum sequence length for both input and output tokens. Defaults to `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: defines the maximum number of tokens that the input prompt can contain. Defaults to `512`.
- `--disable-transpose-value-cache`: disable the optimization of transposing value cache.
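For instance, the first command below converts the model and saves the lowbit copy, while a later identical run finds the existing path and loads it directly (the repo id and save path here are illustrative):

```bash
# First run: convert the pretrained model and save the lowbit copy
python baichuan2.py --repo-id-or-model-path baichuan-inc/Baichuan2-7B-Chat --lowbit-path ./baichuan2-lowbit

# Later runs: the existing lowbit copy is loaded, skipping conversion
python baichuan2.py --repo-id-or-model-path baichuan-inc/Baichuan2-7B-Chat --lowbit-path ./baichuan2-lowbit
```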

### Sample Output
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
---
Baichuan2 example script
@@ -15,6 +15,7 @@
#


import os
import torch
import time
import argparse
@@ -48,28 +49,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Baichuan2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

    # Previously the model was always converted via from_pretrained; now an
    # existing --lowbit-path short-circuits conversion and loads the saved copy.
    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     optimize_model=True,
                                                     pipeline=True,
                                                     max_context_len=args.max_context_len,
                                                     max_prompt_len=args.max_prompt_len,
                                                     torch_dtype=torch.float16,
                                                     attn_implementation="eager",
                                                     transpose_value_cache=not args.disable_transpose_value_cache,
                                                     trust_remote_code=True)
    else:
        model = AutoModelForCausalLM.load_low_bit(
            args.lowbit_path,
            attn_implementation="eager",
            torch_dtype=torch.float16,
            max_context_len=args.max_context_len,
            max_prompt_len=args.max_prompt_len,
            pipeline=True,
            transpose_value_cache=not args.disable_transpose_value_cache,
            trust_remote_code=True
        )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Save the converted lowbit model on the first run so later runs can load it
    if args.lowbit_path and not os.path.exists(args.lowbit_path):
        model.save_low_bit(args.lowbit_path)

DEFAULT_SYSTEM_PROMPT = """\
"""

---
Llama2 example script
@@ -15,6 +15,7 @@
#


import os
import torch
import time
import argparse
@@ -48,29 +49,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

    # An existing --lowbit-path again short-circuits conversion and loads the saved copy.
    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     optimize_model=True,
                                                     pipeline=True,
                                                     max_context_len=args.max_context_len,
                                                     max_prompt_len=args.max_prompt_len,
                                                     quantization_group_size=args.quantization_group_size,
                                                     torch_dtype=torch.float16,
                                                     attn_implementation="eager",
                                                     transpose_value_cache=not args.disable_transpose_value_cache)
    else:
        model = AutoModelForCausalLM.load_low_bit(
            args.lowbit_path,
            attn_implementation="eager",
            torch_dtype=torch.float16,
            max_context_len=args.max_context_len,
            max_prompt_len=args.max_prompt_len,
            pipeline=True,
            transpose_value_cache=not args.disable_transpose_value_cache,
        )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Save the converted lowbit model on the first run so later runs can load it
    if args.lowbit_path and not os.path.exists(args.lowbit_path):
        model.save_low_bit(args.lowbit_path)

DEFAULT_SYSTEM_PROMPT = """\
"""

---
Llama3 example script
@@ -15,6 +15,7 @@
#


import os
import torch
import time
import argparse
@@ -54,29 +55,49 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama3 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

    # Same pattern as the other examples: convert on first run, load afterwards.
    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     torch_dtype=torch.float16,
                                                     optimize_model=True,
                                                     pipeline=True,
                                                     max_context_len=args.max_context_len,
                                                     max_prompt_len=args.max_prompt_len,
                                                     quantization_group_size=args.quantization_group_size,
                                                     attn_implementation="eager",
                                                     transpose_value_cache=not args.disable_transpose_value_cache)
    else:
        model = AutoModelForCausalLM.load_low_bit(
            args.lowbit_path,
            attn_implementation="eager",
            torch_dtype=torch.float16,
            max_context_len=args.max_context_len,
            max_prompt_len=args.max_prompt_len,
            pipeline=True,
            transpose_value_cache=not args.disable_transpose_value_cache,
        )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Save the converted lowbit model on the first run so later runs can load it
    if args.lowbit_path and not os.path.exists(args.lowbit_path):
        model.save_low_bit(args.lowbit_path)

print("-" * 80)
print("done")
with torch.inference_mode():
---
Another example README
@@ -127,7 +127,7 @@ Arguments info:
- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path for saving/loading the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` is loaded. If it is an existing path, the lowbit model found there is loaded. If it is a non-existing path, the original pretrained model is loaded and the converted lowbit version is saved into `LOWBIT_MODEL_PATH` (a condensed sketch of this flow follows the list). Defaults to `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). Defaults to `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. Defaults to `32`.
- `--max-context-len MAX_CONTEXT_LEN` (renamed from `--max-output-len`): defines the maximum sequence length for both input and output tokens. Defaults to `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: defines the maximum number of tokens that the input prompt can contain. Defaults to `512`.
- `--disable-transpose-value-cache`: disable the optimization of transposing value cache.
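The save/load flow these flags drive is the same in every updated example. A condensed, hedged sketch (the import path mirrors the `npu_model.py` file changed below; the repo id, path, and lengths are illustrative):

```python
import os
import torch
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

lowbit_path = "./llama2-lowbit"              # illustrative save location
model_path = "meta-llama/Llama-2-7b-chat-hf"

if not os.path.exists(lowbit_path):
    # First run: convert the pretrained checkpoint for the NPU pipeline ...
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager")
    # ... and persist the converted weights for later runs
    model.save_low_bit(lowbit_path)
else:
    # Later runs: load the converted model directly, skipping conversion
    model = AutoModelForCausalLM.load_low_bit(lowbit_path,
                                              pipeline=True,
                                              max_context_len=1024,
                                              max_prompt_len=512,
                                              torch_dtype=torch.float16,
                                              attn_implementation="eager")
```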

---
python/llm/src/ipex_llm/transformers/npu_model.py — 32 changes: 22 additions & 10 deletions
@@ -166,6 +166,8 @@ def from_pretrained(cls, *args, **kwargs):

        logger.info(f"Converting model, it may take up to several minutes ...")

        # Record optimize_model in the config so load_low_bit can recover it later
        model.config.update({"optimize_model": optimize_model})

        if mock_device == "cpu":
            with torch.no_grad():
                # Only mock quantization_group_size=0 for now
@@ -262,7 +264,6 @@ def optimize_npu_model(cls, *args, **kwargs):
                transpose_value_cache=transpose_value_cache,
                group_size=quantization_group_size
            )
        else:
            from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                import convert_llm
@@ -271,7 +272,7 @@
                        max_prompt_len=max_prompt_len,
                        transpose_value_cache=transpose_value_cache,
                        group_size=quantization_group_size)
        # Moved out of the non-pipeline branch: save_low_bit is now attached in
        # both branches, so pipeline models can be saved as well.
        model.save_low_bit = types.MethodType(save_low_bit, model)
        return model

    @classmethod
@@ -304,8 +305,10 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
        ignore_argument(kwargs, "pipeline_parallel_stages")
        ignore_argument(kwargs, "mixed_precision")
        ignore_argument(kwargs, "quantization_group_size")
        # optimize_model now comes from the saved config (popped below) rather
        # than kwargs, and max_output_len is superseded by max_context_len
        ignore_argument(kwargs, "optimize_model")
        pipeline = kwargs.pop("pipeline", False)
        max_context_len = kwargs.pop("max_context_len", 1024)
        max_context_len = max_context_len - 1
        max_prompt_len = kwargs.pop("max_prompt_len", 512)
        inter_pp = kwargs.pop("inter_pp", None)
        intra_pp = kwargs.pop("intra_pp", None)
@@ -355,6 +358,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
        bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True)
        mixed_precision = config_dict.pop("mixed_precision", False)
        quantization_group_size = config_dict.pop("group_size", 0)
        optimize_model = config_dict.pop("optimize_model", False)  # stamped into the config by from_pretrained

        invalidInputError(
            qtype,
@@ -450,13 +454,12 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
        quant_device = "meta" if bigdl_lcmu_enabled else "cpu"
        logger.info(f"Converting model, it may take up to several minutes ...")
        from intel_npu_acceleration_library.compiler import create_npu_kernels
        if optimize_model:
            invalidInputError(
                max_prompt_len < max_context_len,
                (
                    f"max_prompt_len ({max_prompt_len}) should be less"
                    f" than max_context_len ({max_context_len})"
                ),
            )
            from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre
@@ -468,7 +471,8 @@

            with torch.no_grad():
                optimize_llm_pre(model, qtype, mixed_precision,
                                 quantization_group_size=quantization_group_size,
                                 load=bigdl_lcmu_enabled)
                cls.load_convert(qtype, model, quant_device, modules_to_not_convert,
                                 quantization_group_size, *model_args, **kwargs)
                create_npu_kernels(llm)
@@ -541,17 +545,25 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
        for param in model.parameters():
            param.requires_grad_(False)

        if optimize_model and not pipeline:
            from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
            optimize_llm(
                llm,
                max_output_len=max_context_len,
                max_prompt_len=max_prompt_len,
                inter_pp=inter_pp,
                intra_pp=intra_pp,
                transpose_value_cache=transpose_value_cache,
                group_size=quantization_group_size
            )
        elif optimize_model and pipeline:
            # New branch: pipeline models reuse the same conversion path as from_pretrained
            from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                import convert_llm
            convert_llm(llm,
                        kv_len=max_context_len,
                        max_prompt_len=max_prompt_len,
                        transpose_value_cache=transpose_value_cache,
                        group_size=quantization_group_size)

        return model
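Taken together: `from_pretrained` stamps `optimize_model` into the model config, `save_low_bit` persists it with the checkpoint, and `load_low_bit` pops it back and, for pipeline models, reruns `convert_llm`. A hypothetical way to confirm the flag survives the round trip (the path is illustrative and assumes the saved folder carries a standard config.json):

```python
from transformers import AutoConfig

# Inspect a checkpoint produced by save_low_bit(); "./baichuan2-lowbit" is illustrative.
cfg = AutoConfig.from_pretrained("./baichuan2-lowbit", trust_remote_code=True)
print(getattr(cfg, "optimize_model", False))  # True for checkpoints saved after this change
```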
