diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
index b9c63c8c4f3..a808a551d5b 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
@@ -68,7 +68,7 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
         trust_remote_code=True,
         attn_implementation="eager",
         load_in_low_bit="sym_int4",
-        enable_mp=True,
+        optimize_model=True,
         max_output_len=args.max_output_len,
         max_prompt_len=args.max_prompt_len,
         intra_pp=args.intra_pp,
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py
index 2aa3c263a38..fd0f1482147 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen2.py
@@ -55,7 +55,7 @@
         trust_remote_code=True,
         attn_implementation="eager",
         load_in_low_bit="sym_int4",
-        enable_mp=True,
+        optimize_model=True,
         max_output_len=args.max_output_len,
         max_prompt_len=args.max_prompt_len,
         intra_pp=args.intra_pp,
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 601c7cd1aac..49bcfde1c19 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -110,16 +110,16 @@ def from_pretrained(cls, *args, **kwargs):
         ignore_argument(kwargs, "mixed_precision")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
-        ignore_argument(kwargs, "optimize_model")
+        ignore_argument(kwargs, "enable_mp")
         ignore_argument(kwargs, "modules_to_not_convert")
         ignore_argument(kwargs, "quantization_config")
         ignore_argument(kwargs, "speculative")
         ignore_argument(kwargs, "pipeline_parallel_stages")
-        enable_mp = kwargs.pop("enable_mp", False)
+        optimize_model = kwargs.pop("optimize_model", False)
         max_output_len = kwargs.pop("max_output_len", 1024)
         max_prompt_len = kwargs.pop("max_prompt_len", max_output_len)
-        inter_pp = kwargs.pop("inter_pp", 2)
-        intra_pp = kwargs.pop("intra_pp", 2)
+        inter_pp = kwargs.pop("inter_pp", None)
+        intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
 
         _args = copy.deepcopy(args)
@@ -140,7 +140,7 @@ def from_pretrained(cls, *args, **kwargs):
             logger.info(f"Converting model, it may takes up to several minutes ...")
             from intel_npu_acceleration_library.compiler import create_npu_kernels
 
-            if enable_mp:
+            if optimize_model:
                 invalidInputError(
                     max_prompt_len < max_output_len,
                     (
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index 7056f1f9923..3b6f4376b9c 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -28,11 +28,16 @@ def optimize_llm(
     model: torch.nn.Module,
     max_output_len=1024,
     max_prompt_len=1024,
-    inter_pp=2,
-    intra_pp=2,
+    inter_pp=None,
+    intra_pp=None,
     transpose_value_cache=True,
 ):
     if model.config.model_type == "llama":
+        if intra_pp is None:
+            intra_pp = 2
+        if inter_pp is None:
+            inter_pp = 2
+
         from ipex_llm.transformers.npu_models.llama_mp import gen_llama_fused_model_forward
         from ipex_llm.transformers.npu_models.llama_mp import DecodeRunner, PrefillRunner
         from transformers.models.llama.modeling_llama import LlamaModel
@@ -59,6 +64,11 @@ def optimize_llm(
         convert_forward(model, LlamaForCausalLM, llama2_casullm_forward)
     elif model.config.model_type == "qwen2" and model.config.intermediate_size == 8960:
         # for qwen2-1.5B
+        if intra_pp is None:
+            intra_pp = 2
+        if inter_pp is None:
+            inter_pp = 1
+
         from ipex_llm.transformers.npu_models.qwen2_mp import gen_qwen2_fused_model_forward
         from ipex_llm.transformers.npu_models.qwen2_mp import DecodeRunner, PrefillRunner
         from transformers.models.qwen2.modeling_qwen2 import Qwen2Model
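For reference, a minimal caller-side sketch of the renamed keyword (assumptions: the NPU `AutoModelForCausalLM` is exported from `ipex_llm.transformers.npu_model`, matching the module path in the diff, and the model path below is only a placeholder):

```python
# Minimal sketch of the updated API surface, not the full example script.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed import path

model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    attn_implementation="eager",
    load_in_low_bit="sym_int4",
    optimize_model=True,   # renamed from enable_mp=True
    max_output_len=1024,
    max_prompt_len=512,
    # inter_pp / intra_pp are omitted: they now default to None and
    # optimize_llm() resolves them per model (inter_pp=2, intra_pp=2 for llama;
    # inter_pp=1, intra_pp=2 for qwen2-1.5B).
)
```

With this change `enable_mp` is routed through `ignore_argument` instead of being consumed, so existing callers should switch to `optimize_model`; leaving `inter_pp`/`intra_pp` unset lets `optimize_llm` apply the per-model defaults shown in `convert_mp.py`.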