From b167109ba12f18d028d2be8a61d3dce950eb2724 Mon Sep 17 00:00:00 2001
From: Bo-Wen Wang <1849994161@qq.com>
Date: Wed, 13 Mar 2024 13:51:42 +0800
Subject: [PATCH] [Fix] Fix quantization="gptq" when using Marlin (#3319)

Co-authored-by: Woosuk Kwon
---
 vllm/config.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index d2b68b6fa1fe2..319c1569f5e98 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -168,13 +168,18 @@ def _verify_quantization(self) -> None:
         # Parse quantization method from the HF model config, if available.
         hf_quant_config = getattr(self.hf_config, "quantization_config",
                                   None)
         if hf_quant_config is not None:
             hf_quant_method = str(hf_quant_config["quant_method"]).lower()
-
+            # If the GPTQ model is serialized in marlin format, use marlin.
             if (hf_quant_method == "gptq"
                     and "is_marlin_format" in hf_quant_config
                     and hf_quant_config["is_marlin_format"]):
+                logger.info("The model is serialized in Marlin format. "
+                            "Using Marlin kernel.")
                 hf_quant_method = "marlin"
+                if self.quantization == "gptq":
+                    self.quantization = hf_quant_method
+
             if self.quantization is None:
                 self.quantization = hf_quant_method
             elif self.quantization != hf_quant_method: