From 7d103417b87adcd032c23fedca9bdf1f826e27bb Mon Sep 17 00:00:00 2001
From: "Wang, Jian4" <61138589+hzjane@users.noreply.github.com>
Date: Fri, 30 Aug 2024 09:50:18 +0800
Subject: [PATCH] Fix glm4-9b-chat nan error on vllm 0.3.3 (#11970)

* fix nan value

* update
---
 python/llm/src/ipex_llm/vllm/xpu/model_convert.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
index 94494da2e15..7979bfbc62a 100644
--- a/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
+++ b/python/llm/src/ipex_llm/vllm/xpu/model_convert.py
@@ -248,7 +248,15 @@ def _ipex_llm_load_model(self) -> None:
             parallel_config=self.parallel_config,
             scheduler_config=self.scheduler_config)
         from ipex_llm import optimize_model
-        optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype)
+        import os
+        not_convert_last_mlp = os.getenv("IPEX_LLM_NOT_CONVERT_LAST_MLP", None)
+        if not_convert_last_mlp is not None:
+            # Workaround: keep the last MLP layers unconverted to avoid NaN
+            # values in the last MLP forward pass when running glm4-9b-chat.
+            modules = ["35.mlp", "36.mlp", "37.mlp", "38.mlp", "39.mlp"]
+        else:
+            modules = None
+        optimize_model(self.model, low_bit=low_bit, torch_dtype=self.model_config.dtype,
+                       modules_to_not_convert=modules)
         self.model = self.model.to(device=self.device_config.device,
                                    dtype=self.model_config.dtype)
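
Usage note (a minimal sketch, not part of the patch): the workaround is opt-in
via an environment variable that must be set in the process before the vLLM
engine is constructed. Note that only the variable's presence is checked
(os.getenv(...) is not None), so any value, even "0", enables it.

    # Enable the glm4-9b-chat NaN workaround before building the engine.
    import os

    # Any value works here: _ipex_llm_load_model only checks that the
    # variable is set, not what it is set to.
    os.environ["IPEX_LLM_NOT_CONVERT_LAST_MLP"] = "1"

    # ...then construct the vLLM engine as usual; the MLPs of layers 35-39
    # are passed via modules_to_not_convert and skipped by optimize_model,
    # so they stay in the model's original dtype instead of `low_bit`.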