From ccee5d337df4455587c099890f93e977972d3c4f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 13:46:35 -0700 Subject: [PATCH 01/32] Implement static scaling for Mixtral --- csrc/ops.h | 7 ++++- csrc/pybind.cpp | 3 +- csrc/quantization/fp8/fp8_cuda_kernels.cu | 25 +++++++++++++++- vllm/_custom_ops.py | 9 ++++-- .../layers/fused_moe/fused_moe.py | 12 +++++--- .../model_executor/layers/quantization/fp8.py | 11 +++++-- vllm/model_executor/models/mixtral.py | 30 ++++++++++++++++++- 7 files changed, 84 insertions(+), 13 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index ff7a3de1a0a8c..03bb1e24dc68e 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -146,7 +146,12 @@ void gptq_shuffle( torch::Tensor q_perm, int bit); -void scaled_fp8_quant( +void static_scaled_fp8_quant( + torch::Tensor& out, + torch::Tensor& input, + torch::Tensor& scale); + +void dynamic_scaled_fp8_quant( torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index a5b16c5abc3ed..2250c7f69f0ab 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -73,7 +73,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); - ops.def("scaled_fp8_quant", &scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor"); + ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor"); + ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor"); ops.def( "moe_align_block_size", &moe_align_block_size, diff --git a/csrc/quantization/fp8/fp8_cuda_kernels.cu b/csrc/quantization/fp8/fp8_cuda_kernels.cu index c3337cede1282..2477051eb60d7 100644 --- a/csrc/quantization/fp8/fp8_cuda_kernels.cu +++ b/csrc/quantization/fp8/fp8_cuda_kernels.cu @@ -74,7 +74,30 @@ __global__ void scaled_fp8_quant_kernel( } // namespace vllm -void scaled_fp8_quant( +void static_scaled_fp8_quant( + torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., d] + torch::Tensor& scale) // [1] +{ + int64_t num_tokens = input.numel() / input.size(-1); + int64_t num_elems = input.numel(); + dim3 grid(num_tokens); + dim3 block(1024); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), + "scaled_fp8_quant_kernel", + [&] { + vllm::scaled_fp8_quant_kernel<<>>( + out.data_ptr(), + input.data_ptr(), + scale.data_ptr(), + num_elems); + }); +} + +void dynamic_scaled_fp8_quant( torch::Tensor& out, // [..., d] torch::Tensor& input, // [..., d] torch::Tensor& scale) // [1] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e4b16ed918d1a..73e7b723edb31 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,10 +154,13 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) +def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - vllm_ops.scaled_fp8_quant(output, input, scale) + if scale: + vllm_ops.static_scaled_fp8_quant(output, input, scale) + else: + scale = torch.zeros(1, 
device=input.device, dtype=torch.float32) + vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ac7c30e2a9727..b77732cff22ad 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -220,8 +220,8 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, - B_scale: torch.Tensor, topk_weights: torch.Tensor, - topk_ids: torch.Tensor, + A_scale: Optional[torch.Tensor], B_scale: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, @@ -232,10 +232,10 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, assert sorted_token_ids.stride(0) == 1 if not use_fp8: - A_scale = None + assert A_scale is None assert B_scale is None else: - A, A_scale = ops.scaled_fp8_quant(A) + A, A_scale = ops.scaled_fp8_quant(A, A_scale) assert B_scale is not None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ @@ -318,6 +318,8 @@ def fused_moe( use_fp8: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -434,6 +436,7 @@ def fused_moe( invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1, + a1_scale, w1_scale, topk_weights, topk_ids, @@ -451,6 +454,7 @@ def fused_moe( invoke_fused_moe_kernel(intermediate_cache2, w2, intermediate_cache3, + a2_scale, w2_scale, topk_weights, topk_ids, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 01e494c870e71..633021fda894b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,6 +13,12 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" + def __init__( + self, + act_scaling: str="dynamic", + ) -> None: + self.act_scaling = act_scaling + @classmethod def get_name(cls) -> str: return "fp8" @@ -30,11 +36,12 @@ def get_min_capability(cls) -> int: @classmethod def get_config_filenames(cls) -> List[str]: - return [] + return ["quantize_config.json"] @classmethod def from_config(cls, config: Dict[str, Any]) -> "FP8Config": - return cls() + act_scaling = cls.get_from_keys(config, ["act_scaling"]) + return cls(act_scaling) def get_linear_method(self) -> "Fp8LinearMethod": return Fp8LinearMethod(self) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a33b795d7088e..951c44ca24622 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -114,12 +114,27 @@ def __init__( self.num_total_experts, device="cuda", dtype=torch.float32), requires_grad=False) if self.use_fp8 else None + # Scaling factors for FP8 activations + static_act_scaling = self.use_fp8 and linear_method.act_scaling == "static" + self.as_scale = nn.Parameter( + torch.zeros(1, device="cuda", dtype=torch.float32), + requires_grad=False) if static_act_scaling else None + self.a2s_scale = nn.Parameter( + torch.zeros(1, device="cuda", dtype=torch.float32), + requires_grad=False) if static_act_scaling else None + set_weight_attrs(self.ws, { "weight_loader": 
self.weight_loader, }) set_weight_attrs(self.w2s, { "weight_loader": self.weight_loader, }) + set_weight_attrs(self.as_scale, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.a2s_scale, { + "weight_loader": self.weight_loader, + }) def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, expert_id: int): @@ -134,6 +149,9 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if weight_name.endswith("w2.weight"): param_data[expert_id, :, :] = loaded_weight[:, shard] + if "activation_scale" in weight_name: + param_data[:] = param_data[:].max(loaded_weight) + print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): if self.use_fp8: @@ -161,7 +179,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: inplace=True, use_fp8=self.use_fp8, w1_scale=self.ws_scale, - w2_scale=self.w2s_scale) + w2_scale=self.w2s_scale, + a1_scale=self.a1_scale, + a2_scale=self.a2_scale) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( @@ -443,11 +463,19 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] expert_params_mapping = [ + # These are the weights for the experts # (param_name, weight_name, expert_id) ("ws" if weight_name in ["w1", "w3"] else "w2s", f"experts.{expert_id}.{weight_name}.weight", expert_id) for expert_id in range(self.config.num_local_experts) for weight_name in ["w1", "w2", "w3"] + ] + [ + # These are the activation scales for the experts + # (param_name, weight_name, expert_id) + ("a_scale" if activation_name in ["a1", "a3"] else "a2_scale", + f"experts.{expert_id}.{activation_name}.activation_scale", expert_id) + for expert_id in range(self.config.num_local_experts) + for activation_name in ["a1", "a2", "a3"] ] params_dict = dict(self.named_parameters()) From 8f71c795314101187aa4aa65c128a73781ef14a4 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:01:32 -0700 Subject: [PATCH 02/32] fix --- vllm/model_executor/models/mixtral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 951c44ca24622..6a52db56b50b9 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -115,13 +115,13 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - static_act_scaling = self.use_fp8 and linear_method.act_scaling == "static" + need_act_scales = not self.use_fp8 or linear_method.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), - requires_grad=False) if static_act_scaling else None + requires_grad=False) if need_act_scales else None self.a2s_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), - requires_grad=False) if static_act_scaling else None + requires_grad=False) if need_act_scales else None set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, From 6eb01e07977b7a93348dc5b3247a5b3f369f7720 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:07:48 -0700 Subject: [PATCH 03/32] update --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9995f2afe3cf7..2cf4cd9f5c642 100644 --- 
a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -147,7 +147,7 @@ def get_quant_config(model_config: ModelConfig, f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - if len(quant_config_files) == 0: + if len(quant_config_files) == 0 and "" not in possible_config_filenames: raise ValueError( f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: From dc89cbc364abbcd471c547381de53cb72bd76c1b Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:16:39 -0700 Subject: [PATCH 04/32] fix --- vllm/model_executor/layers/quantization/fp8.py | 2 ++ vllm/model_executor/model_loader/weight_utils.py | 13 ++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 633021fda894b..7c0e0e9e8e45b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,6 +13,8 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" + config_file_optional = True + def __init__( self, act_scaling: str="dynamic", diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 2cf4cd9f5c642..8fc85da86748c 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -135,19 +135,18 @@ def get_quant_config(model_config: ModelConfig, else: hf_folder = model_name_or_path - possible_config_filenames = quant_cls.get_config_filenames() - - # If the quantization config is not found, use the default config. - if not possible_config_filenames: - return quant_cls() - config_files = glob.glob(os.path.join(hf_folder, "*.json")) quant_config_files = [ f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - if len(quant_config_files) == 0 and "" not in possible_config_filenames: + + possible_config_filenames = quant_cls.get_config_filenames() + # If the quantization config is optional and not provided, use the default config. + if quant_cls.getattr("config_file_optional", False) and not quant_config_files: + return quant_cls() + if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") if len(quant_config_files) > 1: From be60845acd14b0639fc565a7aa954ad98df5c7b3 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:18:08 -0700 Subject: [PATCH 05/32] update --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8fc85da86748c..df5b8fd80d1a3 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -136,13 +136,13 @@ def get_quant_config(model_config: ModelConfig, hf_folder = model_name_or_path config_files = glob.glob(os.path.join(hf_folder, "*.json")) + possible_config_filenames = quant_cls.get_config_filenames() quant_config_files = [ f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - possible_config_filenames = quant_cls.get_config_filenames() # If the quantization config is optional and not provided, use the default config. 
if quant_cls.getattr("config_file_optional", False) and not quant_config_files: return quant_cls() From 4613cb562f4df81e651b798d408fb9bfab067640 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:19:52 -0700 Subject: [PATCH 06/32] update --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index df5b8fd80d1a3..f3e0411c54336 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -144,7 +144,7 @@ def get_quant_config(model_config: ModelConfig, ] # If the quantization config is optional and not provided, use the default config. - if quant_cls.getattr("config_file_optional", False) and not quant_config_files: + if getattr(quant_cls, "config_file_optional", False) and not quant_config_files: return quant_cls() if len(quant_config_files) == 0: raise ValueError( From 3d95d86e040fa643f110f952d42376ca26a702ce Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:21:41 -0700 Subject: [PATCH 07/32] fix --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 6a52db56b50b9..c818764ff6799 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -115,7 +115,7 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = not self.use_fp8 or linear_method.act_scaling == "static" + need_act_scales = not self.use_fp8 or linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 642763fd5a54bf442b17fdcebbc27dabc398144a Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:23:42 -0700 Subject: [PATCH 08/32] move --- vllm/model_executor/models/mixtral.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c818764ff6799..7260d1e7667ea 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -104,6 +104,13 @@ def __init__( device="cuda", dtype=self.params_dtype)) + set_weight_attrs(self.ws, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.w2s, { + "weight_loader": self.weight_loader, + }) + # Scaling factors for FP8 weights self.ws_scale = nn.Parameter( torch.ones( @@ -123,18 +130,13 @@ def __init__( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None - set_weight_attrs(self.ws, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.w2s, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.as_scale, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.a2s_scale, { - "weight_loader": self.weight_loader, - }) + if need_act_scales: + set_weight_attrs(self.as_scale, { + "weight_loader": self.weight_loader, + }) + set_weight_attrs(self.a2s_scale, { + "weight_loader": self.weight_loader, + }) def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, expert_id: int): From 706e9317ef3c4f3ee715df3b0f0571a832073d11 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: 
Wed, 24 Apr 2024 14:27:04 -0700 Subject: [PATCH 09/32] update --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 7260d1e7667ea..9f4673b59c673 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -182,8 +182,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: use_fp8=self.use_fp8, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, - a1_scale=self.a1_scale, - a2_scale=self.a2_scale) + a1_scale=self.as_scale, + a2_scale=self.a2s_scale) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( From 9a3c78ccbb06504400709d1148d1b9c34afd3295 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 14:31:18 -0700 Subject: [PATCH 10/32] lol --- vllm/model_executor/models/mixtral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9f4673b59c673..e4a0ba7902682 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -122,7 +122,7 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = not self.use_fp8 or linear_method.quant_config.act_scaling == "static" + need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 1b6f0201c60c3e75bcfc45d96f9c32d7f1754c72 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 15:15:06 -0700 Subject: [PATCH 11/32] fix cuda graph --- vllm/_custom_ops.py | 14 +++++++------ .../layers/fused_moe/fused_moe.py | 21 +++++++++++-------- vllm/model_executor/models/mixtral.py | 16 +++++++------- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 73e7b723edb31..0a18714da0e5c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,13 +154,15 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: +def static_scaled_fp8_quant(input: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - if scale: - vllm_ops.static_scaled_fp8_quant(output, input, scale) - else: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) - vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) + vllm_ops.static_scaled_fp8_quant(output, input, scale) + return output + +def dynamic_scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + output = torch.empty_like(input, dtype=torch.float8_e4m3fn) + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b77732cff22ad..556ecc5f30868 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -227,15 +227,18 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, num_tokens_post_padded: torch.Tensor, mul_routed_weight: bool, top_k: int, config: Dict[str, 
Any], compute_type: tl.dtype, - use_fp8: bool) -> None: + linear_method: Optional[LinearMethodBase]) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 - if not use_fp8: + if not isinstance(linear_method, Fp8LinearMethod): assert A_scale is None assert B_scale is None - else: - A, A_scale = ops.scaled_fp8_quant(A, A_scale) + elif linear_method.quant_config.act_scaling == "static": + A = ops.static_scaled_fp8_quant(A, A_scale) + assert B_scale is not None + elif linear_method.quant_config.act_scaling == "dynamic": + A, A_scale = ops.dynamic_scaled_fp8_quant(A) assert B_scale is not None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ @@ -265,7 +268,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, MUL_ROUTED_WEIGHT=mul_routed_weight, top_k=top_k, compute_type=compute_type, - use_fp8=use_fp8, + use_fp8=isinstance(linear_method, Fp8LinearMethod), **config, ) @@ -315,7 +318,7 @@ def fused_moe( renormalize: bool, inplace: bool = False, override_config: Optional[Dict[str, Any]] = None, - use_fp8: bool = False, + linear_method: Optional[LinearMethodBase] = None, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, @@ -397,7 +400,7 @@ def fused_moe( else: # First try to load optimal config from the file configs = get_moe_configs(E, w2.shape[2], - "float8" if use_fp8 else None) + "float8" if isinstance(linear_method, Fp8LinearMethod) else None) if configs: # If an optimal configuration map has been found, look up the @@ -447,7 +450,7 @@ def fused_moe( topk_ids.shape[1], config, compute_type=tl.float16, - use_fp8=use_fp8) + linear_method=linear_method) ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -465,7 +468,7 @@ def fused_moe( 1, config, compute_type=tl.float16, - use_fp8=use_fp8) + linear_method=linear_method) if inplace: return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e4a0ba7902682..ca051e8fa813c 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -77,9 +77,7 @@ def __init__( self.top_k = top_k self.hidden_size = hidden_size self.intermediate_size = intermediate_size // self.tp_size - # FIXME(pcmoritz): Make this more general to support different - # quantization schemes - self.use_fp8 = isinstance(linear_method, Fp8LinearMethod) + self.linear_method = linear_method if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -111,18 +109,20 @@ def __init__( "weight_loader": self.weight_loader, }) + use_fp8 = isinstance(linear_method, Fp8LinearMethod) + # Scaling factors for FP8 weights self.ws_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if self.use_fp8 else None + requires_grad=False) if use_fp8 else None self.w2s_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if self.use_fp8 else None + requires_grad=False) if use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" + need_act_scales = use_fp8 and linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None @@ -156,7 +156,7 @@ def 
weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): - if self.use_fp8: + if isinstance(self.linear_method, Fp8LinearMethod): ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn) for expert in range(self.num_total_experts): @@ -179,7 +179,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: self.top_k, renormalize=True, inplace=True, - use_fp8=self.use_fp8, + linear_method=self.linear_method, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, a1_scale=self.as_scale, From b09bcecc6ff019ce5a4b3ece92d9e591832d1f5a Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 15:20:17 -0700 Subject: [PATCH 12/32] fix --- vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 556ecc5f30868..dab54a385b119 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -10,6 +10,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearMethodBase from vllm.utils import is_hip logger = init_logger(__name__) From 052e2b3b83e6760739e2101252f38ed198784029 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 15:22:17 -0700 Subject: [PATCH 13/32] update --- vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index dab54a385b119..db22c38011c82 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -11,6 +11,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.utils import is_hip logger = init_logger(__name__) From b33c6d7a5208564735b6f6eeeeaed88b51a7be30 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 18:46:25 -0700 Subject: [PATCH 14/32] update --- vllm/model_executor/models/mixtral.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ca051e8fa813c..c233cd0d9b065 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -474,10 +474,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] + [ # These are the activation scales for the experts # (param_name, weight_name, expert_id) - ("a_scale" if activation_name in ["a1", "a3"] else "a2_scale", - f"experts.{expert_id}.{activation_name}.activation_scale", expert_id) + ("as_scale" if weight_name in ["w1", "w3"] else "a2s_scale", + f"experts.{expert_id}.{weight_name}.activation_scale", expert_id) for expert_id in range(self.config.num_local_experts) - for activation_name in ["a1", "a2", "a3"] + for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) From 475f58d9a25a2cc0de04dcfbcba733590aeafd11 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 18:56:02 -0700 Subject: [PATCH 15/32] refactor --- vllm/model_executor/layers/quantization/base_config.py | 6 
++++++ vllm/model_executor/layers/quantization/fp8.py | 6 ++++-- vllm/model_executor/model_loader/weight_utils.py | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 6115e7c3be956..08061f83b5a38 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -29,6 +29,12 @@ def get_min_capability(self) -> int: """ raise NotImplementedError + # The following is not an abstract method and returns True by default. + @classmethod + def require_config_file(cls) -> bool: + """Whether this quantization config needs a configuration filen.""" + return True + @staticmethod @abstractmethod def get_config_filenames() -> List[str]: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 7c0e0e9e8e45b..d5e37cbd7bd5e 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -13,8 +13,6 @@ class FP8Config(QuantizationConfig): """Config class for FP8.""" - config_file_optional = True - def __init__( self, act_scaling: str="dynamic", @@ -36,6 +34,10 @@ def get_min_capability(cls) -> int: # be included: https://github.com/pytorch/pytorch/pull/118881 return 90 + @classmethod + def require_config_file(cls) -> bool: + return False + @classmethod def get_config_filenames(cls) -> List[str]: return ["quantize_config.json"] diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index f3e0411c54336..70879f91bdc23 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -144,7 +144,7 @@ def get_quant_config(model_config: ModelConfig, ] # If the quantization config is optional and not provided, use the default config. 
- if getattr(quant_cls, "config_file_optional", False) and not quant_config_files: + if not quant_cls.require_config_file() and not quant_config_files: return quant_cls() if len(quant_config_files) == 0: raise ValueError( From 56b4880b47e4911bebaabe788eaeba08812eaa1b Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 19:16:27 -0700 Subject: [PATCH 16/32] update --- vllm/_custom_ops.py | 14 ++++++-------- vllm/model_executor/layers/fused_moe/fused_moe.py | 7 ++----- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0a18714da0e5c..b871b8e3a11d9 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,15 +154,13 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def static_scaled_fp8_quant(input: torch.Tensor, scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: +def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]=None) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - vllm_ops.static_scaled_fp8_quant(output, input, scale) - return output - -def dynamic_scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - scale = torch.zeros(1, device=input.device, dtype=torch.float32) - vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) + if scale is None: + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) + else: + vllm_ops.static_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index db22c38011c82..6fe94d4e268c3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -236,11 +236,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, if not isinstance(linear_method, Fp8LinearMethod): assert A_scale is None assert B_scale is None - elif linear_method.quant_config.act_scaling == "static": - A = ops.static_scaled_fp8_quant(A, A_scale) - assert B_scale is not None - elif linear_method.quant_config.act_scaling == "dynamic": - A, A_scale = ops.dynamic_scaled_fp8_quant(A) + else: + A, A_scale = ops.scaled_fp8_quant(A, A_scale) assert B_scale is not None grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ From be371549899fd3e169bb8d3f1730c27e83ecf760 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 19:27:40 -0700 Subject: [PATCH 17/32] revert --- .../model_executor/layers/fused_moe/fused_moe.py | 16 +++++++--------- vllm/model_executor/models/mixtral.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6fe94d4e268c3..b77732cff22ad 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -10,8 +10,6 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.linear import LinearMethodBase -from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod from vllm.utils import is_hip logger = init_logger(__name__) @@ -229,11 +227,11 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, num_tokens_post_padded: torch.Tensor, 
mul_routed_weight: bool, top_k: int, config: Dict[str, Any], compute_type: tl.dtype, - linear_method: Optional[LinearMethodBase]) -> None: + use_fp8: bool) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 - if not isinstance(linear_method, Fp8LinearMethod): + if not use_fp8: assert A_scale is None assert B_scale is None else: @@ -267,7 +265,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, MUL_ROUTED_WEIGHT=mul_routed_weight, top_k=top_k, compute_type=compute_type, - use_fp8=isinstance(linear_method, Fp8LinearMethod), + use_fp8=use_fp8, **config, ) @@ -317,7 +315,7 @@ def fused_moe( renormalize: bool, inplace: bool = False, override_config: Optional[Dict[str, Any]] = None, - linear_method: Optional[LinearMethodBase] = None, + use_fp8: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, @@ -399,7 +397,7 @@ def fused_moe( else: # First try to load optimal config from the file configs = get_moe_configs(E, w2.shape[2], - "float8" if isinstance(linear_method, Fp8LinearMethod) else None) + "float8" if use_fp8 else None) if configs: # If an optimal configuration map has been found, look up the @@ -449,7 +447,7 @@ def fused_moe( topk_ids.shape[1], config, compute_type=tl.float16, - linear_method=linear_method) + use_fp8=use_fp8) ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) @@ -467,7 +465,7 @@ def fused_moe( 1, config, compute_type=tl.float16, - linear_method=linear_method) + use_fp8=use_fp8) if inplace: return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c233cd0d9b065..9baf176fc78ee 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -77,7 +77,9 @@ def __init__( self.top_k = top_k self.hidden_size = hidden_size self.intermediate_size = intermediate_size // self.tp_size - self.linear_method = linear_method + # FIXME(pcmoritz): Make this more general to support different + # quantization schemes + self.use_fp8 = isinstance(linear_method, Fp8LinearMethod) if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -109,20 +111,18 @@ def __init__( "weight_loader": self.weight_loader, }) - use_fp8 = isinstance(linear_method, Fp8LinearMethod) - # Scaling factors for FP8 weights self.ws_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if use_fp8 else None + requires_grad=False) if self.use_fp8 else None self.w2s_scale = nn.Parameter( torch.ones( self.num_total_experts, device="cuda", dtype=torch.float32), - requires_grad=False) if use_fp8 else None + requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = use_fp8 and linear_method.quant_config.act_scaling == "static" + need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None @@ -156,7 +156,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): - if isinstance(self.linear_method, Fp8LinearMethod): + if self.use_fp8: ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn) w2s = torch.empty_like(self.w2s.data, 
dtype=torch.float8_e4m3fn) for expert in range(self.num_total_experts): @@ -179,7 +179,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: self.top_k, renormalize=True, inplace=True, - linear_method=self.linear_method, + use_fp8=self.use_fp8, w1_scale=self.ws_scale, w2_scale=self.w2s_scale, a1_scale=self.as_scale, From 9c54d1923876d661f93d470a539903415036735f Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 19:35:27 -0700 Subject: [PATCH 18/32] format --- vllm/_custom_ops.py | 5 ++++- vllm/model_executor/layers/fused_moe/fused_moe.py | 5 +++-- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/model_loader/weight_utils.py | 4 +++- vllm/model_executor/models/mixtral.py | 3 ++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index b871b8e3a11d9..c6f330f211e8a 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -154,7 +154,10 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 -def scaled_fp8_quant(input: torch.Tensor, scale: Optional[torch.Tensor]=None) -> Tuple[torch.Tensor, torch.Tensor]: +def scaled_fp8_quant( + input: torch.Tensor, + scale: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: scale = torch.zeros(1, device=input.device, dtype=torch.float32) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b77732cff22ad..17b140195676e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -220,8 +220,9 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, - A_scale: Optional[torch.Tensor], B_scale: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, + A_scale: Optional[torch.Tensor], + B_scale: torch.Tensor, topk_weights: torch.Tensor, + topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d5e37cbd7bd5e..ab562288a8f7d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,7 +15,7 @@ class FP8Config(QuantizationConfig): def __init__( self, - act_scaling: str="dynamic", + act_scaling: str = "dynamic", ) -> None: self.act_scaling = act_scaling diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 70879f91bdc23..54fd4f22f0eef 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -143,9 +143,11 @@ def get_quant_config(model_config: ModelConfig, f.endswith(x) for x in possible_config_filenames) ] - # If the quantization config is optional and not provided, use the default config. + # If the quantization config is optional and not provided, use the + # default config. 
if not quant_cls.require_config_file() and not quant_config_files: return quant_cls() + if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9baf176fc78ee..f42e1c939d723 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -122,7 +122,8 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = self.use_fp8 and linear_method.quant_config.act_scaling == "static" + need_act_scales = (self.use_fp8 and + linear_method.quant_config.act_scaling == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From c5155eab2f068dcd49a2fa78b0b198e0fc36582e Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 20:48:55 -0700 Subject: [PATCH 19/32] Update vllm/_custom_ops.py Co-authored-by: Woosuk Kwon --- vllm/_custom_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index c6f330f211e8a..9d40ec4928276 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -155,8 +155,8 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # fp8 def scaled_fp8_quant( - input: torch.Tensor, - scale: Optional[torch.Tensor] = None + input: torch.Tensor, + scale: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: From 948cca76dff98914d7ad08dea7eed17c10f2b933 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 21:16:41 -0700 Subject: [PATCH 20/32] Update vllm/model_executor/layers/fused_moe/fused_moe.py Co-authored-by: Woosuk Kwon --- vllm/model_executor/layers/fused_moe/fused_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 17b140195676e..8e49c86202eb7 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -221,7 +221,7 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, A_scale: Optional[torch.Tensor], - B_scale: torch.Tensor, topk_weights: torch.Tensor, + B_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, From 3feb88790dd5ab0d9d8244a92015a0ac27525387 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 24 Apr 2024 21:16:52 -0700 Subject: [PATCH 21/32] Update vllm/model_executor/models/mixtral.py Co-authored-by: Woosuk Kwon --- vllm/model_executor/models/mixtral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index f42e1c939d723..4439a75e5b57f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -154,7 +154,6 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, param_data[expert_id, :, :] = loaded_weight[:, shard] if "activation_scale" in weight_name: param_data[:] = param_data[:].max(loaded_weight) - print("loaded scale", weight_name, param_data) def process_weights_after_loading(self): if self.use_fp8: From df1631690c706b72df401316858884585167970f Mon Sep 17 00:00:00 2001 
From: Philipp Moritz Date: Wed, 24 Apr 2024 21:44:13 -0700 Subject: [PATCH 22/32] format --- vllm/model_executor/layers/fused_moe/fused_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 8e49c86202eb7..0cc59ae57d0d3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -221,8 +221,8 @@ def moe_align_block_size( def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, A_scale: Optional[torch.Tensor], - B_scale: Optional[torch.Tensor], topk_weights: torch.Tensor, - topk_ids: torch.Tensor, + B_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, topk_ids: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, From 794f1a185e179ed4e3a59e27e1d55cc0064e8283 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 13:45:11 -0700 Subject: [PATCH 23/32] Update vllm/_custom_ops.py Co-authored-by: Woosuk Kwon --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 9d40ec4928276..40fdb2cef380f 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -160,7 +160,7 @@ def scaled_fp8_quant( ) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) + scale = torch.empty(1, device=input.device, dtype=torch.float32) vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) else: vllm_ops.static_scaled_fp8_quant(output, input, scale) From c13b6a49ac5ab715e6ca5d38d747ddfc68876018 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:08:37 -0700 Subject: [PATCH 24/32] update --- vllm/model_executor/layers/quantization/fp8.py | 8 ++++---- vllm/model_executor/models/mixtral.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ab562288a8f7d..271cddc1175a5 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,9 +15,9 @@ class FP8Config(QuantizationConfig): def __init__( self, - act_scaling: str = "dynamic", + activation_scheme: str = "dynamic", ) -> None: - self.act_scaling = act_scaling + self.activation_scheme = activation_scheme @classmethod def get_name(cls) -> str: @@ -44,8 +44,8 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "FP8Config": - act_scaling = cls.get_from_keys(config, ["act_scaling"]) - return cls(act_scaling) + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + return cls(activation_scheme) def get_linear_method(self) -> "Fp8LinearMethod": return Fp8LinearMethod(self) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 4439a75e5b57f..6f6c93b273335 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -123,7 +123,7 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 and - linear_method.quant_config.act_scaling == "static") + linear_method.quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if 
need_act_scales else None From 5a230ed4794899bc20518f9cf48cc59ed40a97d4 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:12:54 -0700 Subject: [PATCH 25/32] update --- vllm/model_executor/models/mixtral.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 6f6c93b273335..c7199338b5525 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -123,7 +123,8 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 and - linear_method.quant_config.activation_scheme == "static") + linear_method.quant_config.activation_scheme + == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None From 80069c977123957a4e814dcb2485cb14cf5b3bcf Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:17:00 -0700 Subject: [PATCH 26/32] format --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c7199338b5525..ad6f43fbd4aa0 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -122,8 +122,8 @@ def __init__( requires_grad=False) if self.use_fp8 else None # Scaling factors for FP8 activations - need_act_scales = (self.use_fp8 and - linear_method.quant_config.activation_scheme + need_act_scales = (self.use_fp8 + and linear_method.quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), From 5ce17d0cec2d7f43ebff895536f29ef356009534 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:27:41 -0700 Subject: [PATCH 27/32] activation_scale -> act_scale --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ad6f43fbd4aa0..dad1c43e1b0da 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -153,7 +153,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if weight_name.endswith("w2.weight"): param_data[expert_id, :, :] = loaded_weight[:, shard] - if "activation_scale" in weight_name: + if "act_scale" in weight_name: param_data[:] = param_data[:].max(loaded_weight) def process_weights_after_loading(self): @@ -476,7 +476,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # These are the activation scales for the experts # (param_name, weight_name, expert_id) ("as_scale" if weight_name in ["w1", "w3"] else "a2s_scale", - f"experts.{expert_id}.{weight_name}.activation_scale", expert_id) + f"experts.{expert_id}.{weight_name}.act_scale", expert_id) for expert_id in range(self.config.num_local_experts) for weight_name in ["w1", "w2", "w3"] ] From 92d5162ee6298f2557ded542d5157f7bf3db097d Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 25 Apr 2024 14:50:36 -0700 Subject: [PATCH 28/32] fix dynamic scaling -- need init to zero due to atomic update --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 40fdb2cef380f..9d40ec4928276 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -160,7 
+160,7 @@ def scaled_fp8_quant( ) -> Tuple[torch.Tensor, torch.Tensor]: output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: - scale = torch.empty(1, device=input.device, dtype=torch.float32) + scale = torch.zeros(1, device=input.device, dtype=torch.float32) vllm_ops.dynamic_scaled_fp8_quant(output, input, scale) else: vllm_ops.static_scaled_fp8_quant(output, input, scale) From 521a4c84eff15299898ee36c9f658c36b4a948c6 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Apr 2024 10:34:21 -0700 Subject: [PATCH 29/32] revert --- .../layers/quantization/base_config.py | 6 ------ vllm/model_executor/layers/quantization/fp8.py | 6 +----- vllm/model_executor/model_loader/weight_utils.py | 12 ++++++------ 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 08061f83b5a38..6115e7c3be956 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -29,12 +29,6 @@ def get_min_capability(self) -> int: """ raise NotImplementedError - # The following is not an abstract method and returns True by default. - @classmethod - def require_config_file(cls) -> bool: - """Whether this quantization config needs a configuration filen.""" - return True - @staticmethod @abstractmethod def get_config_filenames() -> List[str]: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 271cddc1175a5..2091e7622a7b8 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -34,13 +34,9 @@ def get_min_capability(cls) -> int: # be included: https://github.com/pytorch/pytorch/pull/118881 return 90 - @classmethod - def require_config_file(cls) -> bool: - return False - @classmethod def get_config_filenames(cls) -> List[str]: - return ["quantize_config.json"] + return [] @classmethod def from_config(cls, config: Dict[str, Any]) -> "FP8Config": diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 54fd4f22f0eef..c061c9b8ab68c 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -135,19 +135,19 @@ def get_quant_config(model_config: ModelConfig, else: hf_folder = model_name_or_path - config_files = glob.glob(os.path.join(hf_folder, "*.json")) possible_config_filenames = quant_cls.get_config_filenames() + # If the quantization config is not found, use the default config. + if not possible_config_filenames: + return quant_cls() + + config_files = glob.glob(os.path.join(hf_folder, "*.json")) + quant_config_files = [ f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - # If the quantization config is optional and not provided, use the - # default config. 
- if not quant_cls.require_config_file() and not quant_config_files: - return quant_cls() - if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") From f3300563b125a35b15e3c3571d3565e7aac74411 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Apr 2024 10:34:49 -0700 Subject: [PATCH 30/32] update --- vllm/model_executor/model_loader/weight_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index c061c9b8ab68c..9995f2afe3cf7 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -147,7 +147,6 @@ def get_quant_config(model_config: ModelConfig, f for f in config_files if any( f.endswith(x) for x in possible_config_filenames) ] - if len(quant_config_files) == 0: raise ValueError( f"Cannot find the config file for {model_config.quantization}") From 72e1e42fb9ee12c453264c596d1d40ae9e0e4569 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Apr 2024 13:54:39 -0700 Subject: [PATCH 31/32] fix --- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/models/mixtral.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index afaeb3033762c..ba9f3149649c1 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -40,7 +40,7 @@ def get_config_filenames(cls) -> List[str]: return [] @classmethod - def from_config(cls, config: Dict[str, Any]) -> "FP8Config": + def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) return cls(activation_scheme) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 64dc2ef18aa37..a94cf60a35295 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -124,7 +124,7 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 - and linear_method.quant_config.activation_scheme + and quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), From 0f5824c54ccea96072f59e9cc397b3d87b89b20e Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 26 Apr 2024 13:57:46 -0700 Subject: [PATCH 32/32] update --- vllm/model_executor/models/mixtral.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a94cf60a35295..c5dd1a63e2f7a 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -124,8 +124,7 @@ def __init__( # Scaling factors for FP8 activations need_act_scales = (self.use_fp8 - and quant_config.activation_scheme - == "static") + and quant_config.activation_scheme == "static") self.as_scale = nn.Parameter( torch.zeros(1, device="cuda", dtype=torch.float32), requires_grad=False) if need_act_scales else None
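Note on the end state of this series: ops.scaled_fp8_quant in vllm/_custom_ops.py ends up taking an optional per-tensor scale, dispatching to the static_scaled_fp8_quant kernel when a precomputed activation scale is passed (the Mixtral experts' as_scale / a2s_scale parameters loaded from the checkpoint's act_scale entries) and to dynamic_scaled_fp8_quant with a zero-initialized scale buffer otherwise. The sketch below is only a framework-level illustration of that dispatch and of the per-tensor quantization math it is assumed to implement; it is not the CUDA kernel from csrc/quantization/fp8/fp8_cuda_kernels.cu, and the helper name, the clamp/divide reference, and the lack of zero-input guarding are assumptions made for illustration.

# PyTorch-only sketch of the assumed semantics (not the vLLM CUDA kernels).
from typing import Optional, Tuple

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn


def scaled_fp8_quant_reference(
    x: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    if scale is None:
        # Dynamic scaling: derive the per-tensor scale from the current
        # activation range (the CUDA path accumulates this with an atomic max,
        # which is why the wrapper zero-initializes the scale buffer).
        scale = x.abs().max().float() / FP8_MAX
    # Static scaling reuses the precomputed, calibration-time scale instead.
    # Either way, the tensor is divided by the scale, clamped to the FP8
    # range, and cast to float8_e4m3fn.
    q = (x.float() / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return q, scale


x = torch.randn(4, 8, dtype=torch.float16)
q_dyn, s_dyn = scaled_fp8_quant_reference(x)                            # dynamic
q_static, s_static = scaled_fp8_quant_reference(x, torch.tensor(0.05))  # static

The practical trade-off is that static scaling requires scales produced ahead of time but avoids recomputing the activation max on every forward pass.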