[Kernel] Support Fp8 Checkpoints (Dynamic + Static) #4332

Merged: 93 commits (Apr 30, 2024)

Changes from 59 commits

Commits (93)
79c94a1
fixed fp8 conflict with aqlm
robertgshaw2-neuralmagic Apr 23, 2024
f8b57e4
added quantization tests to buildkite
robertgshaw2-neuralmagic Apr 23, 2024
7175e5b
removed commented out piece
robertgshaw2-neuralmagic Apr 23, 2024
7a7520d
model loaded!
robertgshaw2-neuralmagic Apr 23, 2024
e0b4d72
renamed
robertgshaw2-neuralmagic Apr 23, 2024
f96428e
stash
robertgshaw2-neuralmagic Apr 24, 2024
88ba83b
added static fp8
robertgshaw2-neuralmagic Apr 24, 2024
0848b25
to try with torch.scaled_mm
robertgshaw2-neuralmagic Apr 24, 2024
15882ea
stash
robertgshaw2-neuralmagic Apr 24, 2024
7e6b675
added way to do weight quantization
robertgshaw2-neuralmagic Apr 24, 2024
cc959ea
working!
robertgshaw2-neuralmagic Apr 24, 2024
8d68dbc
fixed llama
robertgshaw2-neuralmagic Apr 24, 2024
881fc65
fixed llama again
robertgshaw2-neuralmagic Apr 24, 2024
e6dd46f
updated names
robertgshaw2-neuralmagic Apr 24, 2024
7e3933b
nit
robertgshaw2-neuralmagic Apr 24, 2024
453a236
cleanup
robertgshaw2-neuralmagic Apr 24, 2024
310e0a7
cleanup
robertgshaw2-neuralmagic Apr 24, 2024
ab4cb02
missed file :)
robertgshaw2-neuralmagic Apr 24, 2024
2edd93a
Update fp8.py
robertgshaw2-neuralmagic Apr 24, 2024
ccee5d3
Implement static scaling for Mixtral
pcmoritz Apr 24, 2024
8f71c79
fix
pcmoritz Apr 24, 2024
6eb01e0
update
pcmoritz Apr 24, 2024
dc89cbc
fix
pcmoritz Apr 24, 2024
be60845
update
pcmoritz Apr 24, 2024
4613cb5
update
pcmoritz Apr 24, 2024
3d95d86
fix
pcmoritz Apr 24, 2024
642763f
move
pcmoritz Apr 24, 2024
706e931
update
pcmoritz Apr 24, 2024
9a3c78c
lol
pcmoritz Apr 24, 2024
1b6f020
fix cuda graph
pcmoritz Apr 24, 2024
b09bcec
fix
pcmoritz Apr 24, 2024
052e2b3
update
pcmoritz Apr 24, 2024
b33c6d7
update
pcmoritz Apr 25, 2024
475f58d
refactor
pcmoritz Apr 25, 2024
56b4880
update
pcmoritz Apr 25, 2024
be37154
revert
pcmoritz Apr 25, 2024
9c54d19
format
pcmoritz Apr 25, 2024
c5155ea
Update vllm/_custom_ops.py
pcmoritz Apr 25, 2024
948cca7
Update vllm/model_executor/layers/fused_moe/fused_moe.py
pcmoritz Apr 25, 2024
3feb887
Update vllm/model_executor/models/mixtral.py
pcmoritz Apr 25, 2024
df16316
format
pcmoritz Apr 25, 2024
7b6b0fa
support static scales
robertgshaw2-neuralmagic Apr 25, 2024
1a3b2e1
fixed example
robertgshaw2-neuralmagic Apr 25, 2024
63ad2ef
Delete quantize.ipynb
robertgshaw2-neuralmagic Apr 25, 2024
794f1a1
Update vllm/_custom_ops.py
pcmoritz Apr 25, 2024
c13b6a4
update
pcmoritz Apr 25, 2024
5a230ed
update
pcmoritz Apr 25, 2024
80069c9
format
pcmoritz Apr 25, 2024
5ce17d0
activation_scale -> act_scale
pcmoritz Apr 25, 2024
5fc0335
Update scheme->activation_scheme
mgoin Apr 25, 2024
92d5162
fix dynamic scaling -- need init to zero due to atomic update
pcmoritz Apr 25, 2024
e1bfe10
Format
mgoin Apr 25, 2024
7242600
Fix tuple type
mgoin Apr 25, 2024
8512513
Merge remote-tracking branch 'pcmoritz/mixtral-fp8-static' into fp8-s…
robertgshaw2-neuralmagic Apr 26, 2024
21ddbb4
stash tyler's state
robertgshaw2-neuralmagic Apr 26, 2024
d27015c
stash
robertgshaw2-neuralmagic Apr 26, 2024
1111f87
cutlass working, but slow jitting on hotpath
robertgshaw2-neuralmagic Apr 26, 2024
f5d32ae
first end to end run with mixtral
robertgshaw2-neuralmagic Apr 26, 2024
924e8ce
added missed file
robertgshaw2-neuralmagic Apr 26, 2024
823a2e7
Update run_fp8.py
mgoin Apr 26, 2024
81f42be
Dynamic FP8 works, but static does not (#213)
robertgshaw2-neuralmagic Apr 27, 2024
1a4fd8a
static correctness
robertgshaw2-neuralmagic Apr 27, 2024
e48c981
static fp8 loading
robertgshaw2-neuralmagic Apr 27, 2024
02f683e
working for dense models
robertgshaw2-neuralmagic Apr 27, 2024
81b73ef
Update weight_utils.py
robertgshaw2-neuralmagic Apr 27, 2024
58dbe0f
moving mixtral updates to separate pr
robertgshaw2-neuralmagic Apr 27, 2024
6068dc5
Merge branch 'main' into fp8-static
robertgshaw2-neuralmagic Apr 27, 2024
a8d4b33
make ./format pass
robertgshaw2-neuralmagic Apr 27, 2024
5be0970
better comments in linear.py
robertgshaw2-neuralmagic Apr 27, 2024
ef7992b
better comments in linear.py
robertgshaw2-neuralmagic Apr 27, 2024
0667791
fixed opt-125
robertgshaw2-neuralmagic Apr 27, 2024
d8adf14
removed run_fp8.py
robertgshaw2-neuralmagic Apr 27, 2024
9bb1a2b
format
robertgshaw2-neuralmagic Apr 27, 2024
169c9ed
Cleanup opt.py
mgoin Apr 27, 2024
8ef9c7d
added testing
robertgshaw2-neuralmagic Apr 27, 2024
c7d6dd6
./format.sh
robertgshaw2-neuralmagic Apr 27, 2024
50b5823
fixed typing
robertgshaw2-neuralmagic Apr 27, 2024
4156ca9
fixed typing
robertgshaw2-neuralmagic Apr 27, 2024
3148fc9
added warning format
robertgshaw2-neuralmagic Apr 27, 2024
7846d67
Update opt.py
robertgshaw2-neuralmagic Apr 27, 2024
ba408c6
formatted
robertgshaw2-neuralmagic Apr 27, 2024
04617fd
Update vllm/model_executor/layers/quantization/fp8.py
robertgshaw2-neuralmagic Apr 27, 2024
cc3d395
Update vllm/model_executor/layers/quantization/fp8.py
robertgshaw2-neuralmagic Apr 27, 2024
f556016
auto detect shared scale (#214)
robertgshaw2-neuralmagic Apr 28, 2024
30bfbd8
./format.sh
robertgshaw2-neuralmagic Apr 28, 2024
572107a
Update vllm/model_executor/layers/quantization/fp8.py
robertgshaw2-neuralmagic Apr 29, 2024
41fbde9
./format.sh
robertgshaw2-neuralmagic Apr 29, 2024
f2cd561
addressed cody's comments + format
robertgshaw2-neuralmagic Apr 29, 2024
125266e
make mypy happy
robertgshaw2-neuralmagic Apr 29, 2024
8a566a7
Merge remote-tracking branch 'upstream/main' into fp8-static
robertgshaw2-neuralmagic Apr 30, 2024
280a4d5
test
robertgshaw2-neuralmagic Apr 30, 2024
8e1ede1
cleaned up
robertgshaw2-neuralmagic Apr 30, 2024
d067428
Update vllm/model_executor/layers/quantization/fp8.py
robertgshaw2-neuralmagic Apr 30, 2024
7 changes: 6 additions & 1 deletion csrc/ops.h
@@ -146,7 +146,12 @@ void gptq_shuffle(
torch::Tensor q_perm,
int bit);

void scaled_fp8_quant(
void static_scaled_fp8_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& scale);

void dynamic_scaled_fp8_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& scale);
3 changes: 2 additions & 1 deletion csrc/pybind.cpp
@@ -73,7 +73,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
ops.def("scaled_fp8_quant", &scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant, "Compute FP8 quantized tensor for given scaling factor");
ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
ops.def(
"moe_align_block_size",
&moe_align_block_size,
25 changes: 24 additions & 1 deletion csrc/quantization/fp8/fp8_cuda_kernels.cu
@@ -74,7 +74,30 @@ __global__ void scaled_fp8_quant_kernel(

} // namespace vllm

void scaled_fp8_quant(
void static_scaled_fp8_quant(
torch::Tensor& out, // [..., d]
torch::Tensor& input, // [..., d]
torch::Tensor& scale) // [1]
{
int64_t num_tokens = input.numel() / input.size(-1);
int64_t num_elems = input.numel();
dim3 grid(num_tokens);
dim3 block(1024);
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(),
"scaled_fp8_quant_kernel",
[&] {
vllm::scaled_fp8_quant_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<c10::Float8_e4m3fn>(),
input.data_ptr<scalar_t>(),
scale.data_ptr<float>(),
num_elems);
});
}

void dynamic_scaled_fp8_quant(
torch::Tensor& out, // [..., d]
torch::Tensor& input, // [..., d]
torch::Tensor& scale) // [1]
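
Both kernels share the same element-wise conversion; they differ only in where the per-tensor scale comes from (loaded from the checkpoint versus reduced from the input on the fly). As a rough reference for the math only, a Python sketch assuming the e4m3 format; the names static_quant/dynamic_quant are illustrative, not vLLM APIs:

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

def static_quant(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Static path: scale is a precomputed [1] tensor (e.g. an act_scale
    # serialized in an FP8 checkpoint).
    return (x / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)

def dynamic_quant(x: torch.Tensor):
    # Dynamic path: derive the scale from the tensor itself; the CUDA kernel
    # computes this absmax via an atomic reduction, which is why the scale
    # buffer must be zero-initialized by the caller.
    scale = x.abs().max().float() / FP8_MAX
    return (x / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn), scale
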
1 change: 1 addition & 0 deletions requirements-cuda.txt
@@ -7,3 +7,4 @@ nvidia-ml-py # for pynvml package
vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
torch == 2.2.1
xformers == 0.0.25 # Requires PyTorch 2.2.1
nvidia-cutlass
41 changes: 41 additions & 0 deletions run_fp8.py
@@ -0,0 +1,41 @@
import argparse

from transformers import AutoTokenizer

from vllm import LLM

choices = ["llama-static", "mistral-static", "mistral-dynamic", "mixtral-static"]

parser = argparse.ArgumentParser()
parser.add_argument("--type", choices=choices)

if __name__ == "__main__":
args = parser.parse_args()

if args.type == "llama-static":
model_name = "nm-testing/Meta-Llama-3-8B-Instruct-FP8"
elif args.type == "mistral-static":
model_name = "nm-testing/mistral-fp8-static"
elif args.type == "mistral-dynamic":
model_name = "nm-testing/mistral-fp8-dynamic"
elif args.type == 'mixtral-static':
model_name = "nm-testing/Mixtral-8x7B-Instruct-v0.1-FP8"
else:
raise ValueError(f"--type should be in {choices}")

model = LLM(model_name,
enforce_eager=True,
max_model_len=1024)

tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = tokenizer.apply_chat_template([{
"role": "user",
"content": "What is your name"
}], tokenize=False, add_generation_prompt=True)
print(f"----- Prompt: {prompt}")

outputs = model.generate(prompt)
print(outputs)
generation = outputs[0].outputs[0].text
print(f"----- Generation: {generation}")
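
This script is a manual smoke test; presumably it is run as, for example, python run_fp8.py --type mistral-static, with the listed nm-testing checkpoints downloaded from the Hugging Face Hub on first use. It is removed again later in the PR (see the "removed run_fp8.py" commit above).
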
12 changes: 9 additions & 3 deletions vllm/_custom_ops.py
@@ -154,10 +154,16 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,


# fp8
def scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
scale = torch.zeros(1, device=input.device, dtype=torch.float32)
def scaled_fp8_quant(
input: torch.Tensor,
scale: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
vllm_ops.scaled_fp8_quant(output, input, scale)
if scale is None:
scale = torch.zeros(1, device=input.device, dtype=torch.float32)
vllm_ops.dynamic_scaled_fp8_quant(output, input, scale)
else:
vllm_ops.static_scaled_fp8_quant(output, input, scale)
return output, scale


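
With the optional scale argument, the same Python entry point now covers both schemes. A minimal usage sketch, assuming a CUDA build of vLLM with these kernels compiled in:

import torch
from vllm import _custom_ops as ops

x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")

# Dynamic: no scale supplied, so the kernel computes one and returns it.
x_q, x_scale = ops.scaled_fp8_quant(x)

# Static: reuse a precomputed activation scale (e.g. loaded from an FP8
# checkpoint); the tensor is quantized with it and the same scale is
# returned unchanged.
x_q_static, same_scale = ops.scaled_fp8_quant(x, x_scale)
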
5 changes: 5 additions & 0 deletions vllm/model_executor/layers/fused_gemm_dq/__init__.py
@@ -0,0 +1,5 @@
from vllm.model_executor.layers.fused_gemm_dq.fused_gemm_dq_fp8 import fused_gemm_dq_fp8

__all__ = [
"fused_gemm_dq_fp8",
]
87 changes: 87 additions & 0 deletions vllm/model_executor/layers/fused_gemm_dq/fused_gemm_dq_fp8.py
@@ -0,0 +1,87 @@
import cutlass
from cutlass import Tensor as FakeTensor
import cutlass.epilogue

import torch
from typing import Optional, Tuple, Dict


def setup_dequant_epilogue(
plan: cutlass.op.Gemm,
dq: torch.Tensor,
scale_a: Optional[torch.Tensor],
scale_b: Optional[torch.Tensor],
bias: Optional[torch.Tensor],
) -> Tuple[cutlass.op.Gemm, Dict]:
assert bias is None

if all([scale_a is None, scale_b is None]):
return plan, None
assert scale_b is not None

def epilog_with_scale_b(accum, scale_b):
D = scale_b * accum
return D

def epilog_with_both_scales(accum, scale_a, scale_b):
D = scale_a * (scale_b * accum)
return D

visitor_args = {"scale_a": scale_a, "scale_b": scale_b, "D": dq}
epilogue_tensors = {
"accum": FakeTensor(
element=torch.float32,
shape=dq.shape,
layout_tag=cutlass.LayoutType.RowMajor,
),
"D": dq,
"scale_b": scale_b,
}
epilog_fn = epilog_with_scale_b

if scale_a is not None:
epilogue_tensors["scale_a"] = scale_a
visitor_args["scale_a"] = scale_a
epilog_fn = epilog_with_both_scales

plan.epilogue_visitor = cutlass.epilogue.trace(epilog_fn, epilogue_tensors)
return plan, visitor_args


def fused_gemm_dq_fp8(
x_q: torch.Tensor,
w_q: torch.Tensor,
out_dtype: torch.dtype,
scale_a: Optional[torch.Tensor] = None,
scale_b: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
dq = torch.empty((x_q.shape[0], w_q.shape[1]), dtype=out_dtype, device="cuda")
C = torch.zeros((x_q.shape[0], w_q.shape[1]), dtype=out_dtype, device="cuda")

plan = cutlass.op.Gemm(
element_A=x_q.dtype,
element_B=w_q.dtype,
element_C=dq.dtype,
element_D=dq.dtype,
layout_A=cutlass.LayoutType.RowMajor,
layout_B=cutlass.LayoutType.ColumnMajor,
layout_C=cutlass.LayoutType.RowMajor,
element_accumulator=torch.float32,
kernel_cc=90,
)

plan, visitor_args = setup_dequant_epilogue(plan, dq, scale_a, scale_b, bias)

plan.run(
x_q,
w_q,
C,
dq,
alpha=1,
beta=0,
visitor_args=visitor_args,
print_module=False,
)

return dq
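
A hedged sketch of how this CUTLASS-backed helper could be driven from the quantization op above. The weight layout (a transposed [out_features, in_features] matrix to satisfy the column-major B operand) and the per-tensor scale shapes are assumptions read off the signature, not code from the PR; it also needs the nvidia-cutlass package and an SM90 GPU, since the plan is built with kernel_cc=90:

import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_gemm_dq import fused_gemm_dq_fp8

x = torch.randn(8, 4096, dtype=torch.float16, device="cuda")
w = torch.randn(11008, 4096, dtype=torch.float16, device="cuda")  # [out, in]

# Per-tensor quantization of activations (dynamic) and weights.
x_q, x_scale = ops.scaled_fp8_quant(x)
w_q, w_scale = ops.scaled_fp8_quant(w)

# FP8 GEMM with dequantization folded into the CUTLASS epilogue:
# out ~= x_scale * (w_scale * (x_q @ w_q.t())).
out = fused_gemm_dq_fp8(x_q, w_q.t(), out_dtype=torch.float16,
                        scale_a=x_scale, scale_b=w_scale)
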
13 changes: 9 additions & 4 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -220,8 +220,9 @@ def moe_align_block_size(


def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
B_scale: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
A_scale: Optional[torch.Tensor],
B_scale: Optional[torch.Tensor],
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
sorted_token_ids: torch.Tensor,
expert_ids: torch.Tensor,
num_tokens_post_padded: torch.Tensor,
@@ -232,10 +233,10 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
assert sorted_token_ids.stride(0) == 1

if not use_fp8:
A_scale = None
assert A_scale is None
assert B_scale is None
else:
A, A_scale = ops.scaled_fp8_quant(A)
A, A_scale = ops.scaled_fp8_quant(A, A_scale)
assert B_scale is not None

grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[
@@ -318,6 +319,8 @@ def fused_moe(
use_fp8: bool = False,
w1_scale: Optional[torch.Tensor] = None,
w2_scale: Optional[torch.Tensor] = None,
a1_scale: Optional[torch.Tensor] = None,
a2_scale: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""
This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -434,6 +437,7 @@ def fused_moe(
invoke_fused_moe_kernel(hidden_states,
w1,
intermediate_cache1,
a1_scale,
w1_scale,
topk_weights,
topk_ids,
@@ -451,6 +455,7 @@
invoke_fused_moe_kernel(intermediate_cache2,
w2,
intermediate_cache3,
a2_scale,
w2_scale,
topk_weights,
topk_ids,
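
At the fused_moe level, the new a1_scale/a2_scale arguments carry static activation scales from an FP8 checkpoint; leaving them as None with use_fp8=True falls back to dynamic quantization inside invoke_fused_moe_kernel, as shown in the hunk above. A call sketch with toy shapes (the per-expert scale shapes are an assumption, and an FP8-capable GPU is required):

import torch
from vllm.model_executor.layers.fused_moe import fused_moe

E, K, N, topk = 4, 512, 1024, 2  # experts, hidden size, intermediate size
hidden_states = torch.randn(8, K, dtype=torch.float16, device="cuda")
gating_output = torch.randn(8, E, dtype=torch.float16, device="cuda")

# FP8 expert weights with their per-expert weight scales.
w1 = torch.zeros(E, 2 * N, K, dtype=torch.float8_e4m3fn, device="cuda")
w2 = torch.zeros(E, K, N, dtype=torch.float8_e4m3fn, device="cuda")
w1_scale = torch.ones(E, dtype=torch.float32, device="cuda")
w2_scale = torch.ones(E, dtype=torch.float32, device="cuda")

out = fused_moe(hidden_states, w1, w2, gating_output, topk,
                renormalize=True, use_fp8=True,
                w1_scale=w1_scale, w2_scale=w2_scale,
                a1_scale=None, a2_scale=None)  # None => dynamic activation scales
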
48 changes: 48 additions & 0 deletions vllm/model_executor/layers/linear.py
@@ -295,6 +295,21 @@ def weight_loader(self,
param_data = param.data
output_dim = getattr(param, "output_dim", None)
is_metadata = getattr(param, "is_metadata", False)

# TODO: document.
# TODO: sync with is_metadata.
# For loading scales.
shard_indexer = getattr(param, "shard_indexer", None)
logical_widths = getattr(param, "logical_widths", None)
if output_dim is not None and shard_indexer is not None:
raise NotImplementedError(
"We do not currently support output_dim != None and "
"shard_indexer != None for a parameter. Please open an issue.")
if loaded_shard_id is None and shard_indexer is not None:
raise NotImplementedError(
"We do not currently support loaded_shard_id == None and "
"shard_indexer != None for a parameter. Please open an issue.")

if loaded_shard_id is None:
# Loaded weight is already packed.
if output_dim is None:
@@ -352,6 +367,15 @@ def weight_loader(self,
shard_size = loaded_weight.shape[0]
shard_offset = loaded_shard_id * shard_size
param_data = param_data.narrow(0, shard_offset, shard_size)

# TODO: sync with is_metadata UX.
# If a param_shard_splitter is defined by the LinearMethod, use it.
elif shard_indexer is not None:
param_data, loaded_weight = shard_indexer(param_data,
loaded_weight,
loaded_shard_id,
logical_widths)

else:
ignore_warning = getattr(param, "ignore_warning", False)
if not ignore_warning:
@@ -434,6 +458,18 @@ def weight_loader(self,
output_dim = getattr(param, "output_dim", None)
is_metadata = getattr(param, "is_metadata", False)

# TODO: sync with is_metadata UX
shard_indexer = getattr(param, "shard_indexer", None)
logical_widths = getattr(param, "logical_widths", None)
if output_dim is not None and shard_indexer is not None:
raise NotImplementedError(
"We do not currently support output_dim != None and "
"shard_indexer != None for a parameter. Please open an issue.")
if loaded_shard_id is None and shard_indexer is not None:
raise NotImplementedError(
"We do not currently support loaded_shard_id == None and "
"shard_indexer != None for a parameter. Please open an issue.")

if loaded_shard_id is None:
# Loaded weight is already packed.
if output_dim is None:
@@ -506,6 +542,13 @@ def weight_loader(self,
shard_index = ["q", "k", "v"].index(loaded_shard_id)
param_data = param_data.narrow(0, shard_index * shard_size,
shard_size)
# TODO: sync with QKV
# If a param_shard_splitter is defined by the LinearMethod, use it.
elif shard_indexer is not None:
param_data, loaded_weight = shard_indexer(param_data,
loaded_weight,
loaded_shard_id,
logical_widths)
else:
ignore_warning = getattr(param, "ignore_warning", False)
if not ignore_warning:
@@ -602,6 +645,11 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(input_dim, start_idx,
shard_size)
# TODO: canon
# This is for loading scales for fp8, which have no dims.
if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)

assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)

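
The shard_indexer/logical_widths attributes exist so that scalar scales belonging to one logical weight (q/k/v, or gate/up) can be routed into the right slot of a merged parameter during loading. A hypothetical indexer matching the call sites above (the exact contract is not spelled out in this diff, so treat this as an illustration only):

import torch
from typing import List, Tuple, Union

def scale_shard_indexer(
    param_data: torch.Tensor,       # merged scale param, e.g. shape [3] for q, k, v
    loaded_weight: torch.Tensor,    # scalar scale from the checkpoint, shape [] or [1]
    shard_id: Union[int, str],      # int for gate/up shards, "q"/"k"/"v" for attention
    logical_widths: List[int],      # widths of the logical shards (unused for scalars)
) -> Tuple[torch.Tensor, torch.Tensor]:
    if isinstance(shard_id, str):
        shard_id = ["q", "k", "v"].index(shard_id)
    # Narrow the merged parameter to the single slot this scale fills.
    return param_data[shard_id], loaded_weight.reshape(-1)[0]
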
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/__init__.py
@@ -4,7 +4,8 @@
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.quantization.fp8 import FP8Config
# from vllm.model_executor.layers.quantization.fp8 import FP8Config
from vllm.model_executor.layers.quantization.fp8_serialized import FP8Config
from vllm.model_executor.layers.quantization.gptq import GPTQConfig
from vllm.model_executor.layers.quantization.marlin import MarlinConfig
from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
6 changes: 6 additions & 0 deletions vllm/model_executor/layers/quantization/base_config.py
@@ -29,6 +29,12 @@ def get_min_capability(self) -> int:
"""
raise NotImplementedError

# The following is not an abstract method and returns True by default.
@classmethod
def require_config_file(cls) -> bool:
"""Whether this quantization config needs a configuration filen."""
return True

@staticmethod
@abstractmethod
def get_config_filenames() -> List[str]:
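
require_config_file gives schemes that do not ship serialized quantization metadata (for example, quantizing an unquantized checkpoint to FP8 on the fly) a way to opt out of the config-file lookup. A minimal hypothetical override, shown only to illustrate the hook:

from vllm.model_executor.layers.quantization.base_config import QuantizationConfig

class OnTheFlyFp8Config(QuantizationConfig):
    """Hypothetical config: weights are quantized at load time, so there is
    no serialized quantization config to look for."""

    @classmethod
    def require_config_file(cls) -> bool:
        return False

    # The remaining abstract methods (get_name, get_config_filenames,
    # from_config, and so on) are omitted here for brevity.
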