[Fix] Fix quantization="gptq" when using Marlin (#3319)
Co-authored-by: Woosuk Kwon <[email protected]>
DreamTeamWangbowen and WoosukKwon authored Mar 13, 2024
1 parent 602358f commit b167109
Showing 1 changed file with 6 additions and 1 deletion.
vllm/config.py: 6 additions & 1 deletion
@@ -168,13 +168,18 @@ def _verify_quantization(self) -> None:
         # Parse quantization method from the HF model config, if available.
         hf_quant_config = getattr(self.hf_config, "quantization_config", None)
         if hf_quant_config is not None:
+
             hf_quant_method = str(hf_quant_config["quant_method"]).lower()
+
             # If the GPTQ model is serialized in marlin format, use marlin.
             if (hf_quant_method == "gptq"
                     and "is_marlin_format" in hf_quant_config
                     and hf_quant_config["is_marlin_format"]):
                 logger.info("The model is serialized in Marlin format. "
                             "Using Marlin kernel.")
                 hf_quant_method = "marlin"
+                if self.quantization == "gptq":
+                    self.quantization = hf_quant_method
+
             if self.quantization is None:
                 self.quantization = hf_quant_method
             elif self.quantization != hf_quant_method:
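Why the fix works: when a GPTQ checkpoint is detected as Marlin-serialized, the config auto-upgrades hf_quant_method to "marlin", but before this commit a user who explicitly passed quantization="gptq" then hit the mismatch branch below and got an error. The new inner check promotes that explicit "gptq" request to "marlin" so it agrees with the detected format. The following is a minimal, self-contained sketch of that control flow after the fix; QuantCheck and its fields are illustrative names for this note, not vLLM's actual ModelConfig API.

    # Hypothetical stand-in for the relevant slice of the config check.
    class QuantCheck:
        def __init__(self, quantization, hf_quant_config):
            self.quantization = quantization        # user-requested method, or None
            self.hf_quant_config = hf_quant_config  # parsed HF quantization_config

        def verify(self):
            hf_quant_method = str(self.hf_quant_config["quant_method"]).lower()
            # GPTQ checkpoint serialized in Marlin format: prefer Marlin.
            if (hf_quant_method == "gptq"
                    and self.hf_quant_config.get("is_marlin_format")):
                hf_quant_method = "marlin"
                # The fix: an explicit "gptq" request is promoted to "marlin",
                # so the mismatch check below no longer raises for this case.
                if self.quantization == "gptq":
                    self.quantization = hf_quant_method
            if self.quantization is None:
                self.quantization = hf_quant_method
            elif self.quantization != hf_quant_method:
                raise ValueError(
                    f"Quantization method in the model config "
                    f"({hf_quant_method}) does not match the requested "
                    f"method ({self.quantization}).")
            return self.quantization

    # Marlin-serialized GPTQ checkpoint with quantization="gptq" requested:
    cfg = {"quant_method": "gptq", "is_marlin_format": True}
    print(QuantCheck("gptq", cfg).verify())  # "marlin"; raised before the fix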
