diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 4e2e3b174e6..65390aeaa96 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -766,7 +766,7 @@ def optimize_npu_model(cls, *args, **kwargs): optimize_llm_pre(model, qtype, mixed_precision, quantization_group_size=quantization_group_size) cls.load_convert_fp16(qtype, model.encoder, "cpu", modules_to_not_convert, - quantization_group_size, None, *args, **kwargs) + quantization_group_size) create_npu_kernels(model.encoder) model = model.eval() logger.info(f"Finish to convert model") @@ -781,7 +781,7 @@ def optimize_npu_model(cls, *args, **kwargs): @classmethod def load_convert_fp16(cls, q_k, optimize_model, device, modules_to_not_convert, - group_size=0, imatrix_data=None, *arg, **kwarg): + group_size=0, imatrix_data=None): from ipex_llm.transformers.npu_models.xlm_mp import replace_with_FP16Linear replace_with_FP16Linear(optimize_model, q_k, device=device, modules_to_not_convert=modules_to_not_convert, diff --git a/python/llm/src/ipex_llm/transformers/npu_models/xlm_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/xlm_mp.py index dd1cece70e9..b4a8357e548 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/xlm_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/xlm_mp.py @@ -711,7 +711,7 @@ def forward(self, x): @module_optimization def replace_with_Layernorm(layer, qtype=None, device='NPU', - modules_to_not_convert=[], group_size=0): + modules_to_not_convert=[], group_size=0, **kwargs): if isinstance(layer, torch.nn.LayerNorm): return XLMLayerNorm( weight=layer.weight.to(torch.float16),