Commit
[Bugfix] Fix dynamic FP8 quantization for Mixtral (vllm-project#4793)
pcmoritz authored May 13, 2024
1 parent a6de2a3 commit 3733fc7
Showing 1 changed file with 1 addition and 1 deletion.
vllm/model_executor/models/mixtral.py (1 addition, 1 deletion)

@@ -95,7 +95,7 @@ def __init__(
             params_dtype=self.params_dtype,
             quant_config=None)
 
-        if self.use_fp8:
+        if self.use_fp8 and self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = torch.float8_e4m3fn
 
         self.w13_weight = nn.Parameter(
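The guard matters because dynamic FP8 quantization starts from a bf16/fp16 checkpoint and quantizes at runtime, so the expert parameters must be created in the checkpoint's original dtype; only checkpoints that already serialize FP8 weights can be loaded directly into float8_e4m3fn parameters. The following is a minimal sketch illustrating the dtype decision this commit introduces, not the actual vLLM class: ToyMoEWeights and quantize_after_loading are hypothetical names standing in for the real module and its post-load quantization step.

import torch
import torch.nn as nn

class ToyMoEWeights(nn.Module):
    """Toy stand-in for an MoE weight container (not vLLM's MixtralMoE)."""

    def __init__(self, num_experts: int, hidden: int, intermediate: int,
                 params_dtype: torch.dtype, use_fp8: bool,
                 checkpoint_is_fp8_serialized: bool):
        super().__init__()
        # The fix from this commit: only switch the parameter dtype to FP8
        # when the checkpoint actually stores FP8 weights. For dynamic
        # quantization the bf16/fp16 weights must be loadable as-is.
        if use_fp8 and checkpoint_is_fp8_serialized:
            params_dtype = torch.float8_e4m3fn

        self.w13_weight = nn.Parameter(
            torch.empty(num_experts, 2 * intermediate, hidden,
                        dtype=params_dtype),
            requires_grad=False)

    def quantize_after_loading(self):
        # Hypothetical helper illustrating the dynamic path: after the
        # high-precision checkpoint weights are loaded, cast them to FP8.
        if self.w13_weight.dtype != torch.float8_e4m3fn:
            self.w13_weight = nn.Parameter(
                self.w13_weight.data.to(torch.float8_e4m3fn),
                requires_grad=False)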
