From ccee5d337df4455587c099890f93e977972d3c4f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 13:46:35 -0700 Subject: [PATCH 01/32] Implement static scaling for Mixtral --- csrc/ops.h | 7 ++++- csrc/pybind.cpp | 3 +- csrc/quantization/fp8/fp8_cuda_kernels.cu | 25 +++++++++++++++- vllm/_custom_ops.py | 9 ++++-- .../layers/fused_moe/fused_moe.py | 12 +++++--- .../model_executor/layers/quantization/fp8.py | 11 +++++-- vllm/model_executor/models/mixtral.py | 30 ++++++++++++++++++- 7 files changed, 84 insertions(+), 13 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index ff7a3de1a0a8c..03bb1e24dc68e 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -146,7 +146,12 @@ void gptq_shuffle( torch::Tensor q_perm, int bit); -void scaled_fp8_quant( +void static_scaled_fp8_quant( + torch::Tensor& out, + torch::Tensor& input, + torch::Tensor& scale); + +void dynamic_scaled_fp8_quant( torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index a5b16c5abc3ed..2250c7f69f0ab 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -73,7 +73,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); - ops.def("scaled_fp8_quant", &scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor"); + ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor"); + ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor"); ops.def( "moe_align_block_size", &moe_align_block_size, diff --git a/csrc/quantization/fp8/fp8_cuda_kernels.cu b/csrc/quantization/fp8/fp8_cuda_kernels.cu index c3337cede1282..2477051eb60d7 100644 --- a/csrc/quantization/fp8/fp8_cuda_kernels.cu +++ b/csrc/quantization/fp8/fp8_cuda_kernels.cu @@ -74,7 +74,30 @@ __global__ void scaled_fp8_quant_kernel( } // namespace vllm -void scaled_fp8_quant( +void static_scaled_fp8_quant( + torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., d] + torch::Tensor& scale) // [1] +{ + int64_t num_tokens = input.numel() / input.size(-1); + int64_t num_elems = input.numel(); + dim3 grid(num_tokens); + dim3 block(1024); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), + "scaled_fp8_quant_kernel", + [&] { + vllm::scaled_fp8_quant_kernel<<>>( + out.data_ptr(), + input.data_ptr(), + scale.data_ptr(), + num_elems); + }); +} + +void dynamic_scaled_fp8_quant( torch::Tensor& out, // [..., d] torch::Tensor& input, // [..., d] torch::Tensor& scale) // [1] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e4b16ed918d1a..73e7b723edb31 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,10 +154,13 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) +def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - vllm_ops.scaled_fp8_quant(output, input, scale) + if scale: + vllm_ops.static_scaled_fp8_quant(output, input, scale) + else: + scale = torch.zeros(1, 
device=input.device, dtype=torch.float32) + vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ac7c30e2a9727..b77732cff22ad 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -220,8 +220,8 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, - B_scale: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, + A_scale: Optional[torch.Tensor], B_scale: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, @@ -232,10 +232,10 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, assert sorted_token_ids.stride(0) == 1 if not use_fp8: - A_scale = None + assert A_scale is None assert B_scale is None else: - A, A_scale = ops.scaled_fp8_quant(A) + A, A_scale = ops.scaled_fp8_quant(A, A_scale) assert B_scale is not None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ @@ -318,6 +318,8 @@ def fused_moe( use_fp8: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -434,6 +436,7 @@ def fused_moe( invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1, + a1_scale, w1_scale, topk_weights, topk_ids, @@ -451,6 +454,7 @@ def fused_moe( invoke_fused_moe_kernel(intermediate_cache2, w2, intermediate_cache3, + a2_scale, w2_scale, topk_weights, topk_ids, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 01e494c870e71..633021fda894b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,6 +13,12 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" + def __init__( + self, + act_scaling: str="dynamic", + ) -> None: + self.act_scaling = act_scaling + @classmethod def get_name(cls) -> str: return "fp8" @@ -30,11 +36,12 @@ def get_min_capability(cls) -> int: @classmethod def get_config_filenames(cls) -> List[str]: - return [] + return ["quantize_config.json"] @classmethod def from_config(cls, config: Dict[str, Any]) -> "FP8Config": - return cls() + act_scaling = cls.get_from_keys(config, ["act_scaling"]) + return cls(act_scaling) def get_linear_method(self) -> "Fp8LinearMethod": return Fp8LinearMethod(self) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a33b795d7088e..951c44ca24622 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -114,12 +114,27 @@ def __init__( self.num_total_experts, device="cuda", dtype=torch.float32), requires_grad=False) if self.use_fp8 else None + # Scaling factors for FP8 activations + static_act_scaling = self.use_fp8 and linear_method.act_scaling == "static" + self.as_scale = nn.Parameter( + torch.zeros(1, device="cuda", dtype=torch.float32), + requires_grad=False) if static_act_scaling else None + self.a2s_scale = nn.Parameter( + torch.zeros(1, device="cuda", dtype=torch.float32), + requires_grad=False) if static_act_scaling else None + set_weight_attrs(self.ws, { "weight_loader": 
self.weight_loader, }) set_weight_attrs(self.w2s, { "weight_loader": self.weight_loader, }) + set_weight_attrs(self.as_scale, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.a2s_scale, { + "weight_loader": self.weight_loader, + }) def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, expert_id: int): @@ -134,6 +149,9 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if weight_name.endswith("w2.weight"): param_data[expert_id, :, :] = loaded_weight[:, shard] + if "activation_scale" in weight_name: + param_data[:] = param_data[:].max(loaded_weight) + print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): if self.use_fp8: @@ -161,7 +179,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: inplace=True, use_fp8=self.use_fp8, w1_scale=self.ws_scale, - w2_scale=self.w2s_scale) + w2_scale=self.w2s_scale, + a1_scale=self.a1_scale, + a2_scale=self.a2_scale) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -443,11 +463,19 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] expert_params_mapping = [ + # These are the weights for the experts # (param_name, weight_name, expert_id) ("ws" if weight_name in ["w1", "w3"] else "w2s", f"experts.{expert_id}.{weight_name}.weight", expert_id) for expert_id in range(self.config.num_local_experts) for weight_name in ["w1", "w2", "w3"] + ] + [ + # These are the activation scales for the experts + # (param_name, weight_name, expert_id) + ("a_scale" if activation_name in ["a1", "a3"] else "a2_scale", + f"experts.{expert_id}.{activation_name}.activation_scale", expert_id) + for expert_id in range(self.config.num_local_experts) + for activation_name in ["a1", "a2", "a3"] ] params_dict = dict(self.named_parameters()) From 8f71c795314101187aa4aa65c128a73781ef14a4 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:01:32 -0700 Subject: [PATCH 02/32] fix --- vllm/model_executor/models/mixtral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 951c44ca24622..6a52db56b50b9 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -115,13 +115,13 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - static_act_scaling = self.use_fp8 and linear_method.act_scaling == "static" + need_act_scales = not self.use_fp8 or linear_method.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), - requires_grad=False) if static_act_scaling else None + requires_grad=False) if need_act_scales else None self.a2s_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), - requires_grad=False) if static_act_scaling else None + requires_grad=False) if need_act_scales else None set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, From 6eb01e07977b7a93348dc5b3247a5b3f369f7720 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:07:48 -0700 Subject: [PATCH 03/32] update --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9995f2afe3cf7..2cf4cd9f5c642 100644 --- 
a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -147,7 +147,7 @@ def get_quant_config(model_config: ModelConfig, f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - if len(quant_config_files) == 0: + if len(quant_config_files) == 0 and "" not in possible_config_filenames: raise ValueError( f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: From dc89cbc364abbcd471c547381de53cb72bd76c1b Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:16:39 -0700 Subject: [PATCH 04/32] fix --- vllm/model_executor/layers/quantization/fp8.py | 2 ++ vllm/model_executor/model_loader/weight_utils.py | 13 ++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 633021fda894b..7c0e0e9e8e45b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,6 +13,8 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" + config_file_optional = True + def __init__( self, act_scaling: str="dynamic", diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 2cf4cd9f5c642..8fc85da86748c 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -135,19 +135,18 @@ def get_quant_config(model_config: ModelConfig, else: hf_folder = model_name_or_path - possible_config_filenames = quant_cls.get_config_filenames() - - # If the quantization config is not found, use the default config. - if not possible_config_filenames: - return quant_cls() - config_files = glob.glob(os.path.join(hf_folder, "*.json")) quant_config_files = [ f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - if len(quant_config_files) == 0 and "" not in possible_config_filenames: + + possible_config_filenames = quant_cls.get_config_filenames() + # If the quantization config is optional and not provided, use the default config. + if quant_cls.getattr("config_file_optional", False) and not quant_config_files: + return quant_cls() + if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: From be60845acd14b0639fc565a7aa954ad98df5c7b3 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:18:08 -0700 Subject: [PATCH 05/32] update --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8fc85da86748c..df5b8fd80d1a3 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -136,13 +136,13 @@ def get_quant_config(model_config: ModelConfig, hf_folder = model_name_or_path config_files = glob.glob(os.path.join(hf_folder, "*.json")) + possible_config_filenames = quant_cls.get_config_filenames() quant_config_files = [ f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - possible_config_filenames = quant_cls.get_config_filenames() # If the quantization config is optional and not provided, use the default config. 
if quant_cls.getattr("config_file_optional", False) and not quant_config_files: return quant_cls() From 4613cb562f4df81e651b798d408fb9bfab067640 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:19:52 -0700 Subject: [PATCH 06/32] update --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index df5b8fd80d1a3..f3e0411c54336 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -144,7 +144,7 @@ def get_quant_config(model_config: ModelConfig, ] # If the quantization config is optional and not provided, use the default config. - if quant_cls.getattr("config_file_optional", False) and not quant_config_files: + if getattr(quant_cls, "config_file_optional", False) and not quant_config_files: return quant_cls() if len(quant_config_files) == 0: raise ValueError( From 3d95d86e040fa643f110f952d42376ca26a702ce Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:21:41 -0700 Subject: [PATCH 07/32] fix --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 6a52db56b50b9..c818764ff6799 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -115,7 +115,7 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = not self.use_fp8 or linear_method.act_scaling == "static" + need_act_scales = not self.use_fp8 or linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 642763fd5a54bf442b17fdcebbc27dabc398144a Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:23:42 -0700 Subject: [PATCH 08/32] move --- vllm/model_executor/models/mixtral.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c818764ff6799..7260d1e7667ea 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -104,6 +104,13 @@ def __init__( device="cuda", dtype=self.params_dtype)) + set_weight_attrs(self.ws, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.w2s, { + "weight_loader": self.weight_loader, + }) + # Scaling factors for FP8 weights self.ws_scale = nn.Parameter( torch.ones( @@ -123,18 +130,13 @@ def __init__( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None - set_weight_attrs(self.ws, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.w2s, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.as_scale, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.a2s_scale, { - "weight_loader": self.weight_loader, - }) + if need_act_scales: + set_weight_attrs(self.as_scale, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.a2s_scale, { + "weight_loader": self.weight_loader, + }) def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, expert_id: int): From 706e9317ef3c4f3ee715df3b0f0571a832073d11 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: 
Wed, 24 Apr 2024 14:27:04 -0700 Subject: [PATCH 09/32] update --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 7260d1e7667ea..9f4673b59c673 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -182,8 +182,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: use_fp8=self.use_fp8, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, - a1_scale=self.a1_scale, - a2_scale=self.a2_scale) + a1_scale=self.as_scale, + a2_scale=self.a2s_scale) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( From 9a3c78ccbb06504400709d1148d1b9c34afd3295 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:31:18 -0700 Subject: [PATCH 10/32] lol --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9f4673b59c673..e4a0ba7902682 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -122,7 +122,7 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = not self.use_fp8 or linear_method.quant_config.act_scaling == "static" + need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 1b6f0201c60c3e75bcfc45d96f9c32d7f1754c72 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 15:15:06 -0700 Subject: [PATCH 11/32] fix cuda graph --- vllm/_custom_ops.py | 14 +++++++------ .../layers/fused_moe/fused_moe.py | 21 +++++++++++-------- vllm/model_executor/models/mixtral.py | 16 +++++++------- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 73e7b723edb31..0a18714da0e5c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,13 +154,15 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: +def static_scaled_fp8_quant(input: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - if scale: - vllm_ops.static_scaled_fp8_quant(output, input, scale) - else: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) - vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) + vllm_ops.static_scaled_fp8_quant(output, input, scale) + return output + +def dynamic_scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + output = torch.empty_like(input, dtype=torch.float8_e4m3fn) + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b77732cff22ad..556ecc5f30868 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -227,15 +227,18 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, num_tokens_post_padded: torch.Tensor, mul_routed_weight: bool, top_k: int, config: Dict[str, 
Any], compute_type: tl.dtype, - use_fp8: bool) -> None: + linear_method: Optional[LinearMethodBase]) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 - if not use_fp8: + if not isinstance(linear_method, Fp8LinearMethod): assert A_scale is None assert B_scale is None - else: - A, A_scale = ops.scaled_fp8_quant(A, A_scale) + elif linear_method.quant_config.act_scaling == "static": + A = ops.static_scaled_fp8_quant(A, A_scale) + assert B_scale is not None + elif linear_method.quant_config.act_scaling == "dynamic": + A, A_scale = ops.dynamic_scaled_fp8_quant(A) assert B_scale is not None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ @@ -265,7 +268,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, MUL_ROUTED_WEIGHT=mul_routed_weight, top_k=top_k, compute_type=compute_type, - use_fp8=use_fp8, + use_fp8=isinstance(linear_method, Fp8LinearMethod), **config, ) @@ -315,7 +318,7 @@ def fused_moe( renormalize: bool, inplace: bool = False, override_config: Optional[Dict[str, Any]] = None, - use_fp8: bool = False, + linear_method: Optional[LinearMethodBase] = None, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, @@ -397,7 +400,7 @@ def fused_moe( else: # First try to load optimal config from the file configs = get_moe_configs(E, w2.shape[2], - "float8" if use_fp8 else None) + "float8" if isinstance(linear_method, Fp8LinearMethod) else None) if configs: # If an optimal configuration map has been found, look up the @@ -447,7 +450,7 @@ def fused_moe( topk_ids.shape[1], config, compute_type=tl.float16, - use_fp8=use_fp8) + linear_method=linear_method) ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -465,7 +468,7 @@ def fused_moe( 1, config, compute_type=tl.float16, - use_fp8=use_fp8) + linear_method=linear_method) if inplace: return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e4a0ba7902682..ca051e8fa813c 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -77,9 +77,7 @@ def __init__( self.top_k = top_k self.hidden_size = hidden_size self.intermediate_size = intermediate_size // self.tp_size - # FIXME(pcmoritz): Make this more general to support different - # quantization schemes - self.use_fp8 = isinstance(linear_method, Fp8LinearMethod) + self.linear_method = linear_method if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -111,18 +109,20 @@ def __init__( "weight_loader": self.weight_loader, }) + use_fp8 = isinstance(linear_method, Fp8LinearMethod) + # Scaling factors for FP8 weights self.ws_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if self.use_fp8 else None + requires_grad=False) if use_fp8 else None self.w2s_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if self.use_fp8 else None + requires_grad=False) if use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" + need_act_scales = use_fp8 and linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None @@ -156,7 +156,7 @@ def 
weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): - if self.use_fp8: + if isinstance(self.linear_method, Fp8LinearMethod): ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn) for expert in range(self.num_total_experts): @@ -179,7 +179,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: self.top_k, renormalize=True, inplace=True, - use_fp8=self.use_fp8, + linear_method=self.linear_method, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, a1_scale=self.as_scale, From b09bcecc6ff019ce5a4b3ece92d9e591832d1f5a Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 15:20:17 -0700 Subject: [PATCH 12/32] fix --- vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 556ecc5f30868..dab54a385b119 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -10,6 +10,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearMethodBase from vllm.utils import is_hip logger = init_logger(__name__) From 052e2b3b83e6760739e2101252f38ed198784029 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 15:22:17 -0700 Subject: [PATCH 13/32] update --- vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index dab54a385b119..db22c38011c82 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -11,6 +11,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.utils import is_hip logger = init_logger(__name__) From b33c6d7a5208564735b6f6eeeeaed88b51a7be30 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 18:46:25 -0700 Subject: [PATCH 14/32] update --- vllm/model_executor/models/mixtral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ca051e8fa813c..c233cd0d9b065 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -474,10 +474,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] + [ # These are the activation scales for the experts # (param_name, weight_name, expert_id) - ("a_scale" if activation_name in ["a1", "a3"] else "a2_scale", - f"experts.{expert_id}.{activation_name}.activation_scale", expert_id) + ("as_scale" if weight_name in ["w1", "w3"] else "a2s_scale", + f"experts.{expert_id}.{weight_name}.activation_scale", expert_id) for expert_id in range(self.config.num_local_experts) - for activation_name in ["a1", "a2", "a3"] + for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) From 475f58d9a25a2cc0de04dcfbcba733590aeafd11 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 18:56:02 -0700 Subject: [PATCH 15/32] refactor --- vllm/model_executor/layers/quantization/base_config.py | 6 
++++++ vllm/model_executor/layers/quantization/fp8.py | 6 ++++-- vllm/model_executor/model_loader/weight_utils.py | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 6115e7c3be956..08061f83b5a38 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -29,6 +29,12 @@ def get_min_capability(self) -> int: """ raise NotImplementedError + # The following is not an abstract method and returns True by default. + @classmethod + def require_config_file(cls) -> bool: + """Whether this quantization config needs a configuration filen.""" + return True + @staticmethod @abstractmethod def get_config_filenames() -> List[str]: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 7c0e0e9e8e45b..d5e37cbd7bd5e 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,8 +13,6 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" - config_file_optional = True - def __init__( self, act_scaling: str="dynamic", @@ -36,6 +34,10 @@ def get_min_capability(cls) -> int: # be included: https://github.com/pytorch/pytorch/pull/118881 return 90 + @classmethod + def require_config_file(cls) -> bool: + return False + @classmethod def get_config_filenames(cls) -> List[str]: return ["quantize_config.json"] diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index f3e0411c54336..70879f91bdc23 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -144,7 +144,7 @@ def get_quant_config(model_config: ModelConfig, ] # If the quantization config is optional and not provided, use the default config. 
- if getattr(quant_cls, "config_file_optional", False) and not quant_config_files: + if not quant_cls.require_config_file() and not quant_config_files: return quant_cls() if len(quant_config_files) == 0: raise ValueError( From 56b4880b47e4911bebaabe788eaeba08812eaa1b Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 19:16:27 -0700 Subject: [PATCH 16/32] update --- vllm/_custom_ops.py | 14 ++++++-------- vllm/model_executor/layers/fused_moe/fused_moe.py | 7 ++----- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0a18714da0e5c..b871b8e3a11d9 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,15 +154,13 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def static_scaled_fp8_quant(input: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: +def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]=None) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - vllm_ops.static_scaled_fp8_quant(output, input, scale) - return output - -def dynamic_scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - scale = torch.zeros(1, device=input.device, dtype=torch.float32) - vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) + if scale is None: + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) + else: + vllm_ops.static_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index db22c38011c82..6fe94d4e268c3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -236,11 +236,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, if not isinstance(linear_method, Fp8LinearMethod): assert A_scale is None assert B_scale is None - elif linear_method.quant_config.act_scaling == "static": - A = ops.static_scaled_fp8_quant(A, A_scale) - assert B_scale is not None - elif linear_method.quant_config.act_scaling == "dynamic": - A, A_scale = ops.dynamic_scaled_fp8_quant(A) + else: + A, A_scale = ops.scaled_fp8_quant(A, A_scale) assert B_scale is not None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ From be371549899fd3e169bb8d3f1730c27e83ecf760 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 19:27:40 -0700 Subject: [PATCH 17/32] revert --- .../model_executor/layers/fused_moe/fused_moe.py | 16 +++++++--------- vllm/model_executor/models/mixtral.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6fe94d4e268c3..b77732cff22ad 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -10,8 +10,6 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearMethodBase -from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.utils import is_hip logger = init_logger(__name__) @@ -229,11 +227,11 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, num_tokens_post_padded: torch.Tensor, 
mul_routed_weight: bool, top_k: int, config: Dict[str, Any], compute_type: tl.dtype, - linear_method: Optional[LinearMethodBase]) -> None: + use_fp8: bool) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 - if not isinstance(linear_method, Fp8LinearMethod): + if not use_fp8: assert A_scale is None assert B_scale is None else: @@ -267,7 +265,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, MUL_ROUTED_WEIGHT=mul_routed_weight, top_k=top_k, compute_type=compute_type, - use_fp8=isinstance(linear_method, Fp8LinearMethod), + use_fp8=use_fp8, **config, ) @@ -317,7 +315,7 @@ def fused_moe( renormalize: bool, inplace: bool = False, override_config: Optional[Dict[str, Any]] = None, - linear_method: Optional[LinearMethodBase] = None, + use_fp8: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, @@ -399,7 +397,7 @@ def fused_moe( else: # First try to load optimal config from the file configs = get_moe_configs(E, w2.shape[2], - "float8" if isinstance(linear_method, Fp8LinearMethod) else None) + "float8" if use_fp8 else None) if configs: # If an optimal configuration map has been found, look up the @@ -449,7 +447,7 @@ def fused_moe( topk_ids.shape[1], config, compute_type=tl.float16, - linear_method=linear_method) + use_fp8=use_fp8) ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -467,7 +465,7 @@ def fused_moe( 1, config, compute_type=tl.float16, - linear_method=linear_method) + use_fp8=use_fp8) if inplace: return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c233cd0d9b065..9baf176fc78ee 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -77,7 +77,9 @@ def __init__( self.top_k = top_k self.hidden_size = hidden_size self.intermediate_size = intermediate_size // self.tp_size - self.linear_method = linear_method + # FIXME(pcmoritz): Make this more general to support different + # quantization schemes + self.use_fp8 = isinstance(linear_method, Fp8LinearMethod) if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -109,20 +111,18 @@ def __init__( "weight_loader": self.weight_loader, }) - use_fp8 = isinstance(linear_method, Fp8LinearMethod) - # Scaling factors for FP8 weights self.ws_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if use_fp8 else None + requires_grad=False) if self.use_fp8 else None self.w2s_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if use_fp8 else None + requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = use_fp8 and linear_method.quant_config.act_scaling == "static" + need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None @@ -156,7 +156,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): - if isinstance(self.linear_method, Fp8LinearMethod): + if self.use_fp8: ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) w2s = torch.empty_like(self.w2s.data, 
dtype=torch.float8_e4m3fn) for expert in range(self.num_total_experts): @@ -179,7 +179,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: self.top_k, renormalize=True, inplace=True, - linear_method=self.linear_method, + use_fp8=self.use_fp8, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, a1_scale=self.as_scale, From 9c54d1923876d661f93d470a539903415036735f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 19:35:27 -0700 Subject: [PATCH 18/32] format --- vllm/_custom_ops.py | 5 ++++- vllm/model_executor/layers/fused_moe/fused_moe.py | 5 +++-- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/model_loader/weight_utils.py | 4 +++- vllm/model_executor/models/mixtral.py | 3 ++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index b871b8e3a11d9..c6f330f211e8a 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,7 +154,10 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]=None) -> Tuple[torch.Tensor, torch.Tensor]: +def scaled_fp8_quant( + input: torch.Tensor, + scale: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: scale = torch.zeros(1, device=input.device, dtype=torch.float32) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b77732cff22ad..17b140195676e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -220,8 +220,9 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, - A_scale: Optional[torch.Tensor], B_scale: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, + A_scale: Optional[torch.Tensor], + B_scale: torch.Tensor, topk_weights: torch.Tensor, + topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d5e37cbd7bd5e..ab562288a8f7d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,7 +15,7 @@ class FP8Config(QuantizationConfig): def __init__( self, - act_scaling: str="dynamic", + act_scaling: str = "dynamic", ) -> None: self.act_scaling = act_scaling diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 70879f91bdc23..54fd4f22f0eef 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -143,9 +143,11 @@ def get_quant_config(model_config: ModelConfig, f.endswith(x) for x in possible_config_filenames) ] - # If the quantization config is optional and not provided, use the default config. + # If the quantization config is optional and not provided, use the + # default config. 
if not quant_cls.require_config_file() and not quant_config_files: return quant_cls() + if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9baf176fc78ee..f42e1c939d723 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -122,7 +122,8 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" + need_act_scales = (self.use_fp8 and + linear_method.quant_config.act_scaling == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From c5155eab2f068dcd49a2fa78b0b198e0fc36582e Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 20:48:55 -0700 Subject: [PATCH 19/32] Update vllm/_custom_ops.py Co-authored-by: Woosuk Kwon --- vllm/_custom_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index c6f330f211e8a..9d40ec4928276 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -155,8 +155,8 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 def scaled_fp8_quant( - input: torch.Tensor, - scale: Optional[torch.Tensor] = None + input: torch.Tensor, + scale: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: From 948cca76dff98914d7ad08dea7eed17c10f2b933 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 21:16:41 -0700 Subject: [PATCH 20/32] Update vllm/model_executor/layers/fused_moe/fused_moe.py Co-authored-by: Woosuk Kwon --- vllm/model_executor/layers/fused_moe/fused_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 17b140195676e..8e49c86202eb7 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -221,7 +221,7 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, A_scale: Optional[torch.Tensor], - B_scale: torch.Tensor, topk_weights: torch.Tensor, + B_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, From 3feb88790dd5ab0d9d8244a92015a0ac27525387 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 21:16:52 -0700 Subject: [PATCH 21/32] Update vllm/model_executor/models/mixtral.py Co-authored-by: Woosuk Kwon --- vllm/model_executor/models/mixtral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index f42e1c939d723..4439a75e5b57f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -154,7 +154,6 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, param_data[expert_id, :, :] = loaded_weight[:, shard] if "activation_scale" in weight_name: param_data[:] = param_data[:].max(loaded_weight) - print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): if self.use_fp8: From df1631690c706b72df401316858884585167970f Mon Sep 17 00:00:00 2001 
From: Philipp Moritz Date: Wed, 24 Apr 2024 21:44:13 -0700 Subject: [PATCH 22/32] format --- vllm/model_executor/layers/fused_moe/fused_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 8e49c86202eb7..0cc59ae57d0d3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -221,8 +221,8 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, A_scale: Optional[torch.Tensor], - B_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, + B_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, From 794f1a185e179ed4e3a59e27e1d55cc0064e8283 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 13:45:11 -0700 Subject: [PATCH 23/32] Update vllm/_custom_ops.py Co-authored-by: Woosuk Kwon --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 9d40ec4928276..40fdb2cef380f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -160,7 +160,7 @@ def scaled_fp8_quant( ) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) + scale = torch.empty(1, device=input.device, dtype=torch.float32) vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) else: vllm_ops.static_scaled_fp8_quant(output, input, scale) From c13b6a49ac5ab715e6ca5d38d747ddfc68876018 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:08:37 -0700 Subject: [PATCH 24/32] update --- vllm/model_executor/layers/quantization/fp8.py | 8 ++++---- vllm/model_executor/models/mixtral.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ab562288a8f7d..271cddc1175a5 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,9 +15,9 @@ class FP8Config(QuantizationConfig): def __init__( self, - act_scaling: str = "dynamic", + activation_scheme: str = "dynamic", ) -> None: - self.act_scaling = act_scaling + self.activation_scheme = activation_scheme @classmethod def get_name(cls) -> str: @@ -44,8 +44,8 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "FP8Config": - act_scaling = cls.get_from_keys(config, ["act_scaling"]) - return cls(act_scaling) + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + return cls(activation_scheme) def get_linear_method(self) -> "Fp8LinearMethod": return Fp8LinearMethod(self) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 4439a75e5b57f..6f6c93b273335 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -123,7 +123,7 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 and - linear_method.quant_config.act_scaling == "static") + linear_method.quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if 
need_act_scales else None From 5a230ed4794899bc20518f9cf48cc59ed40a97d4 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:12:54 -0700 Subject: [PATCH 25/32] update --- vllm/model_executor/models/mixtral.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 6f6c93b273335..c7199338b5525 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -123,7 +123,8 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 and - linear_method.quant_config.activation_scheme == "static") + linear_method.quant_config.activation_scheme + == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 80069c977123957a4e814dcb2485cb14cf5b3bcf Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:17:00 -0700 Subject: [PATCH 26/32] format --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c7199338b5525..ad6f43fbd4aa0 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -122,8 +122,8 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = (self.use_fp8 and - linear_method.quant_config.activation_scheme + need_act_scales = (self.use_fp8 + and linear_method.quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), From 5ce17d0cec2d7f43ebff895536f29ef356009534 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:27:41 -0700 Subject: [PATCH 27/32] activation_scale -> act_scale --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ad6f43fbd4aa0..dad1c43e1b0da 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -153,7 +153,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if weight_name.endswith("w2.weight"): param_data[expert_id, :, :] = loaded_weight[:, shard] - if "activation_scale" in weight_name: + if "act_scale" in weight_name: param_data[:] = param_data[:].max(loaded_weight) def process_weights_after_loading(self): @@ -476,7 +476,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # These are the activation scales for the experts # (param_name, weight_name, expert_id) ("as_scale" if weight_name in ["w1", "w3"] else "a2s_scale", - f"experts.{expert_id}.{weight_name}.activation_scale", expert_id) + f"experts.{expert_id}.{weight_name}.act_scale", expert_id) for expert_id in range(self.config.num_local_experts) for weight_name in ["w1", "w2", "w3"] ] From 92d5162ee6298f2557ded542d5157f7bf3db097d Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:50:36 -0700 Subject: [PATCH 28/32] fix dynamic scaling -- need init to zero due to atomic update --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 40fdb2cef380f..9d40ec4928276 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -160,7 
+160,7 @@ def scaled_fp8_quant( ) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: - scale = torch.empty(1, device=input.device, dtype=torch.float32) + scale = torch.zeros(1, device=input.device, dtype=torch.float32) vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) else: vllm_ops.static_scaled_fp8_quant(output, input, scale) From 521a4c84eff15299898ee36c9f658c36b4a948c6 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Apr 2024 10:34:21 -0700 Subject: [PATCH 29/32] revert --- .../layers/quantization/base_config.py | 6 ------ vllm/model_executor/layers/quantization/fp8.py | 6 +----- vllm/model_executor/model_loader/weight_utils.py | 12 ++++++------ 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 08061f83b5a38..6115e7c3be956 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -29,12 +29,6 @@ def get_min_capability(self) -> int: """ raise NotImplementedError - # The following is not an abstract method and returns True by default. - @classmethod - def require_config_file(cls) -> bool: - """Whether this quantization config needs a configuration filen.""" - return True - @staticmethod @abstractmethod def get_config_filenames() -> List[str]: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 271cddc1175a5..2091e7622a7b8 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -34,13 +34,9 @@ def get_min_capability(cls) -> int: # be included: https://github.com/pytorch/pytorch/pull/118881 return 90 - @classmethod - def require_config_file(cls) -> bool: - return False - @classmethod def get_config_filenames(cls) -> List[str]: - return ["quantize_config.json"] + return [] @classmethod def from_config(cls, config: Dict[str, Any]) -> "FP8Config": diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 54fd4f22f0eef..c061c9b8ab68c 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -135,19 +135,19 @@ def get_quant_config(model_config: ModelConfig, else: hf_folder = model_name_or_path - config_files = glob.glob(os.path.join(hf_folder, "*.json")) possible_config_filenames = quant_cls.get_config_filenames() + # If the quantization config is not found, use the default config. + if not possible_config_filenames: + return quant_cls() + + config_files = glob.glob(os.path.join(hf_folder, "*.json")) + quant_config_files = [ f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - # If the quantization config is optional and not provided, use the - # default config. 
- if not quant_cls.require_config_file() and not quant_config_files: - return quant_cls() - if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") From f3300563b125a35b15e3c3571d3565e7aac74411 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Apr 2024 10:34:49 -0700 Subject: [PATCH 30/32] update --- vllm/model_executor/model_loader/weight_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index c061c9b8ab68c..9995f2afe3cf7 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -147,7 +147,6 @@ def get_quant_config(model_config: ModelConfig, f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") From 72e1e42fb9ee12c453264c596d1d40ae9e0e4569 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Apr 2024 13:54:39 -0700 Subject: [PATCH 31/32] fix --- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/models/mixtral.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index afaeb3033762c..ba9f3149649c1 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -40,7 +40,7 @@ def get_config_filenames(cls) -> List[str]: return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "FP8Config": + def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) return cls(activation_scheme) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 64dc2ef18aa37..a94cf60a35295 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -124,7 +124,7 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 - and linear_method.quant_config.activation_scheme + and quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), From 0f5824c54ccea96072f59e9cc397b3d87b89b20e Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Apr 2024 13:57:46 -0700 Subject: [PATCH 32/32] update --- vllm/model_executor/models/mixtral.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a94cf60a35295..c5dd1a63e2f7a 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -124,8 +124,7 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 - and quant_config.activation_scheme - == "static") + and quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None
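Note on the end state of this series: ops.scaled_fp8_quant in vllm/_custom_ops.py ends up taking an optional per-tensor scale, dispatching to the static_scaled_fp8_quant kernel when a precomputed activation scale is passed (the Mixtral experts' as_scale / a2s_scale parameters loaded from the checkpoint's act_scale entries) and to dynamic_scaled_fp8_quant with a zero-initialized scale buffer otherwise. The sketch below is only a framework-level illustration of that dispatch and of the per-tensor quantization math it is assumed to implement; it is not the CUDA kernel from csrc/quantization/fp8/fp8_cuda_kernels.cu, and the helper name, the clamp/divide reference, and the lack of zero-input guarding are assumptions made for illustration.

# PyTorch-only sketch of the assumed semantics (not the vLLM CUDA kernels).
from typing import Optional, Tuple

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn


def scaled_fp8_quant_reference(
    x: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    if scale is None:
        # Dynamic scaling: derive the per-tensor scale from the current
        # activation range (the CUDA path accumulates this with an atomic max,
        # which is why the wrapper zero-initializes the scale buffer).
        scale = x.abs().max().float() / FP8_MAX
    # Static scaling reuses the precomputed, calibration-time scale instead.
    # Either way, the tensor is divided by the scale, clamped to the FP8
    # range, and cast to float8_e4m3fn.
    q = (x.float() / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return q, scale


x = torch.randn(4, 8, dtype=torch.float16)
q_dyn, s_dyn = scaled_fp8_quant_reference(x)                            # dynamic
q_static, s_static = scaled_fp8_quant_reference(x, torch.tensor(0.05))  # static

The practical trade-off is that static scaling requires scales produced ahead of time but avoids recomputing the activation max on every forward pass.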