From b167109ba12f18d028d2be8a61d3dce950eb2724 Mon Sep 17 00:00:00 2001
From: Bo-Wen Wang <1849994161@qq.com>
Date: Wed, 13 Mar 2024 13:51:42 +0800
Subject: [PATCH] [Fix] Fix quantization="gptq" when using Marlin (#3319)

Co-authored-by: Woosuk Kwon
---
 vllm/config.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index d2b68b6fa1fe2..319c1569f5e98 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -168,13 +168,18 @@ def _verify_quantization(self) -> None:
         # Parse quantization method from the HF model config, if available.
         hf_quant_config = getattr(self.hf_config, "quantization_config",
                                   None)
         if hf_quant_config is not None:
             hf_quant_method = str(hf_quant_config["quant_method"]).lower()
-
+            # If the GPTQ model is serialized in marlin format, use marlin.
             if (hf_quant_method == "gptq"
                     and "is_marlin_format" in hf_quant_config
                     and hf_quant_config["is_marlin_format"]):
+                logger.info("The model is serialized in Marlin format. "
+                            "Using Marlin kernel.")
                 hf_quant_method = "marlin"
+                if self.quantization == "gptq":
+                    self.quantization = hf_quant_method
+
             if self.quantization is None:
                 self.quantization = hf_quant_method
             elif self.quantization != hf_quant_method: