[Fix] Fix quantization="gptq" when using Marlin (#3319)
Co-authored-by: Woosuk Kwon <[email protected]>
DreamTeamWangbowen and WoosukKwon authored Mar 13, 2024
1 parent 602358f commit b167109
Showing 1 changed file with 6 additions and 1 deletion.
vllm/config.py: 6 additions & 1 deletion
@@ -168,13 +168,18 @@ def _verify_quantization(self) -> None:
         # Parse quantization method from the HF model config, if available.
         hf_quant_config = getattr(self.hf_config, "quantization_config", None)
         if hf_quant_config is not None:
+
             hf_quant_method = str(hf_quant_config["quant_method"]).lower()
+
             # If the GPTQ model is serialized in marlin format, use marlin.
             if (hf_quant_method == "gptq"
                     and "is_marlin_format" in hf_quant_config
                     and hf_quant_config["is_marlin_format"]):
                 logger.info("The model is serialized in Marlin format. "
                             "Using Marlin kernel.")
                 hf_quant_method = "marlin"
+                if self.quantization == "gptq":
+                    self.quantization = hf_quant_method
+
             if self.quantization is None:
                 self.quantization = hf_quant_method
             elif self.quantization != hf_quant_method:
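Why the fix works: when a GPTQ checkpoint is detected as Marlin-serialized, the config auto-upgrades hf_quant_method to "marlin", but before this commit a user who explicitly passed quantization="gptq" then hit the mismatch branch below and got an error. The new inner check promotes that explicit "gptq" request to "marlin" so it agrees with the detected format. The following is a minimal, self-contained sketch of that control flow after the fix; QuantCheck and its fields are illustrative names for this note, not vLLM's actual ModelConfig API.

    # Hypothetical stand-in for the relevant slice of the config check.
    class QuantCheck:
        def __init__(self, quantization, hf_quant_config):
            self.quantization = quantization        # user-requested method, or None
            self.hf_quant_config = hf_quant_config  # parsed HF quantization_config

        def verify(self):
            hf_quant_method = str(self.hf_quant_config["quant_method"]).lower()
            # GPTQ checkpoint serialized in Marlin format: prefer Marlin.
            if (hf_quant_method == "gptq"
                    and self.hf_quant_config.get("is_marlin_format")):
                hf_quant_method = "marlin"
                # The fix: an explicit "gptq" request is promoted to "marlin",
                # so the mismatch check below no longer raises for this case.
                if self.quantization == "gptq":
                    self.quantization = hf_quant_method
            if self.quantization is None:
                self.quantization = hf_quant_method
            elif self.quantization != hf_quant_method:
                raise ValueError(
                    f"Quantization method in the model config "
                    f"({hf_quant_method}) does not match the requested "
                    f"method ({self.quantization}).")
            return self.quantization

    # Marlin-serialized GPTQ checkpoint with quantization="gptq" requested:
    cfg = {"quant_method": "gptq", "is_marlin_format": True}
    print(QuantCheck("gptq", cfg).verify())  # "marlin"; raised before the fix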
