【Hackathon 6th Fundable Projects 3 No.162】 [fluid_ops] fused_multi_transformer_int8 #67523

Merged (1 commit) on Aug 21, 2024
660 changes: 0 additions & 660 deletions paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu

This file was deleted.

853 changes: 0 additions & 853 deletions paddle/fluid/operators/fused/fused_multi_transformer_op.cu

This file was deleted.

@@ -0,0 +1,56 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

KernelSignature FusedMultiTransformerInt8OpArgumentMapping(
const ArgumentMappingContext& ctx) {
paddle::small_vector<const char*> inputs{
"X", "LnScale", "LnBias", "QKVW",
"QKVBias", "CacheKV", "TimeStep", "SrcMask",
"OutLinearW", "OutLinearBias", "FFNLnScale", "FFNLnBias",
"FFN1Weight", "FFN1Bias", "FFN2Weight", "FFN2Bias",
"QKVOutScale", "OutLinearOutScale", "FFN1OutScale", "FFN2OutScale"};
paddle::small_vector<const char*> attrs;
attrs.emplace_back("pre_layer_norm");
attrs.emplace_back("epsilon");
attrs.emplace_back("dropout_rate");
attrs.emplace_back("is_test");
attrs.emplace_back("dropout_implementation");
attrs.emplace_back("act_method");
attrs.emplace_back("trans_qkvw");
attrs.emplace_back("ring_id");
attrs.emplace_back("num_head");
attrs.emplace_back("dim_head");
attrs.emplace_back("dim_ffn");
attrs.emplace_back("qkv_in_scale");
attrs.emplace_back("out_linear_in_scale");
attrs.emplace_back("ffn1_in_scale");
attrs.emplace_back("ffn2_in_scale");
attrs.emplace_back("quant_round_type");
attrs.emplace_back("quant_max_bound");
attrs.emplace_back("quant_min_bound");
paddle::small_vector<const char*> outputs{"CacheKVOut", "Out"};
return KernelSignature("fused_multi_transformer_int8",
std::move(inputs),
std::move(attrs),
std::move(outputs));
}

} // namespace phi

PD_REGISTER_ARG_MAPPING_FN(fused_multi_transformer_int8,
phi::FusedMultiTransformerInt8OpArgumentMapping);
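For context, this new argument-mapping file tells the PHI layer how the legacy fluid op's named inputs, attributes, and outputs line up with the parameters of the `fused_multi_transformer_int8` PHI kernel. A minimal sketch of the same pattern for a simpler, hypothetical op (the op name `my_fused_add` and its parameter names are illustrative only, not part of this PR):

```cpp
#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

// Hypothetical: forward the legacy op's inputs X/Y, attribute "axis",
// and output Out to a PHI kernel of the same name.
KernelSignature MyFusedAddOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("my_fused_add", {"X", "Y"}, {"axis"}, {"Out"});
}

}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(my_fused_add, phi::MyFusedAddOpArgumentMapping);
```

The mapping above for `fused_multi_transformer_int8` follows exactly this shape, just with many more inputs, attributes, and per-layer quantization scale parameters.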
1 change: 1 addition & 0 deletions paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -194,6 +194,7 @@
'fused_elementwise_mul',
'fused_elementwise_sub',
'fused_embedding_fc_lstm',
'fused_multi_transformer_int8',
'fusion_group',
'fusion_lstm',
'fusion_seqpool_cvm_concat',
142 changes: 142 additions & 0 deletions paddle/phi/infermeta/fusion.cc
@@ -2140,6 +2140,148 @@ void FusedMultiTransformerInt8XpuInferMeta(
out->set_layout(x.layout());
}

void FusedMultiTransformerInt8InferMeta(
const MetaTensor& x,
const std::vector<const MetaTensor*>& ln_scale,
const std::vector<const MetaTensor*>& ln_bias,
const std::vector<const MetaTensor*>& qkv_w,
const paddle::optional<std::vector<const MetaTensor*>>& qkv_bias,
const paddle::optional<std::vector<const MetaTensor*>>& cache_kv,
const MetaTensor& time_step,
const MetaTensor& src_mask,
const std::vector<const MetaTensor*>& out_linear_w,
const paddle::optional<std::vector<const MetaTensor*>>& out_linear_bias,
const std::vector<const MetaTensor*>& ffn_ln_scale,
const std::vector<const MetaTensor*>& ffn_ln_bias,
const std::vector<const MetaTensor*>& ffn1_weight,
const paddle::optional<std::vector<const MetaTensor*>>& ffn1_bias,
const std::vector<const MetaTensor*>& ffn2_weight,
const paddle::optional<std::vector<const MetaTensor*>>& ffn2_bias,
const paddle::optional<std::vector<const MetaTensor*>>& qkv_out_scale,
const paddle::optional<std::vector<const MetaTensor*>>&
out_linear_out_scale,
const paddle::optional<std::vector<const MetaTensor*>>& ffn1_out_scale,
const paddle::optional<std::vector<const MetaTensor*>>& ffn2_out_scale,
bool pre_layer_norm,
float epsilon,
float dropout_rate,
bool is_test,
const std::string& dropout_implementation,
const std::string& act_method,
bool trans_qkvw,
int ring_id,
int num_head,
int dim_head,
int dim_ffn,
const std::vector<float>& qkv_in_scale,
const std::vector<float>& out_linear_in_scale,
const std::vector<float>& ffn1_in_scale,
const std::vector<float>& ffn2_in_scale,
int quant_round_type,
float quant_max_bound,
float quant_min_bound,
std::vector<MetaTensor*> cache_kv_out,
MetaTensor* out) {
// x: qkv's input [batch_size, seq_len, dim_embed]
// y: qkv's weight: [3, num_head, dim_head, dim_embed]
const auto& x_dim = x.dims();
const auto& y_dim = qkv_w[0]->dims();
PADDLE_ENFORCE_EQ(
x_dim.size(),
3,
common::errors::InvalidArgument("The dimensions of x must be 3"
"(batch_size, seq_len, dim_embed),"
"but received dimensions of"
"Input is [%d]",
x_dim.size()));
PADDLE_ENFORCE_EQ(
y_dim.size(),
4,
common::errors::InvalidArgument("The dimensions of qkv_weight must be 4"
"(3, num_head, dim_head, dim_embed),"
"but received dimensions of"
"Input is [%d]",
y_dim.size()));
PADDLE_ENFORCE_EQ(
x_dim[2],
trans_qkvw ? y_dim[3] : y_dim[0],
common::errors::InvalidArgument(
"ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is "
"true) or y_dim[0](trans_qkvw is false)"
"must be equal. But received: the shape "
"of input x = [%s], and the shape of "
"input qkv_weight = [%s]",
x_dim,
y_dim));

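// When ring_id != -1 the op presumably runs under tensor model parallelism
// with the qkv weight sharded across ranks, so num_head * dim_head need not
// equal dim_embed locally; hence the consistency checks below apply only to
// the single-rank case (rationale inferred from the surrounding code).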
if (ring_id == -1) {
if (trans_qkvw) {
PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2],
y_dim[3],
common::errors::InvalidArgument(
"The dimensions of qkv_weight must be 4"
"(3, num_head, dim_head, dim_embed),"
"and must satisfy the limitations: "
"(num_head * dim_head == dim_embed)"));

} else {
PADDLE_ENFORCE_EQ(y_dim[2] * y_dim[3],
y_dim[0],
common::errors::InvalidArgument(
"The dimensions of qkv_weight must be 4"
"(dim_embed, 3, num_head, dim_head),"
"and must satisfy the limitations: "
"(num_head * dim_head == dim_embed)"));
}
}

if (cache_kv && cache_kv.get().size() > 0) {
// [2, batch_size, num_head, max_seq_len, head_size]
const auto& c_dim = cache_kv.get()[0]->dims();

PADDLE_ENFORCE_EQ(
c_dim.size(),
5,
common::errors::InvalidArgument(
"The CacheKV must be 5 dims, but got %d", c_dim.size()));
PADDLE_ENFORCE_EQ(c_dim[0],
2,
common::errors::InvalidArgument(
"The first dim of CacheKV must be 2, but got %d",
c_dim[0])); // 2
PADDLE_ENFORCE_EQ(c_dim[1],
x_dim[0],
common::errors::InvalidArgument(
"The second dim of CacheKV must be equal with "
"batch size %d, but got %d",
x_dim[0],
c_dim[1])); // batch_size
PADDLE_ENFORCE_EQ(c_dim[2],
trans_qkvw ? y_dim[1] : y_dim[2],
common::errors::InvalidArgument(
"The third dim of CacheKV must be equal with num "
"head %d, but got %d",
trans_qkvw ? y_dim[1] : y_dim[2],
c_dim[2])); // num_head
PADDLE_ENFORCE_GT(
c_dim[3],
0,
common::errors::InvalidArgument(
"The forth dim of CacheKV must be greater than 0, but got %d",
c_dim[3])); // cache_seq_len
PADDLE_ENFORCE_EQ(c_dim[4],
trans_qkvw ? y_dim[2] : y_dim[3],
common::errors::InvalidArgument(
"The fifth dim of CacheKV must be equal with head "
"size %d, but got %d",
trans_qkvw ? y_dim[2] : y_dim[3],
c_dim[4])); // head_size
}

out->set_dims(x.dims());
out->set_dtype(x.dtype());
}
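To make the shape contract above concrete, here is a small self-contained sketch that checks the same invariants the InferMeta enforces; the concrete sizes are illustrative (not from this PR) and assume `trans_qkvw == true`:

```cpp
#include <cassert>
#include <vector>

int main() {
  // Illustrative sizes, with trans_qkvw == true.
  const int batch_size = 2, seq_len = 8;
  const int num_head = 4, dim_head = 16;
  const int dim_embed = num_head * dim_head;  // 64
  const int max_seq_len = 32;

  std::vector<int> x_dim = {batch_size, seq_len, dim_embed};               // x
  std::vector<int> y_dim = {3, num_head, dim_head, dim_embed};             // qkv_w
  std::vector<int> c_dim = {2, batch_size, num_head, max_seq_len, dim_head};  // cache_kv

  assert(x_dim.size() == 3 && y_dim.size() == 4 && c_dim.size() == 5);
  assert(x_dim[2] == y_dim[3]);             // dim_embed matches qkv_w's last dim
  assert(y_dim[1] * y_dim[2] == y_dim[3]);  // num_head * dim_head == dim_embed
  assert(c_dim[0] == 2);                    // stacked K and V
  assert(c_dim[1] == x_dim[0]);             // batch size
  assert(c_dim[2] == y_dim[1]);             // num_head
  assert(c_dim[3] > 0);                     // cache_seq_len
  assert(c_dim[4] == y_dim[2]);             // head_size
  // The output keeps the input's shape and dtype:
  // out dims == x dims == [batch_size, seq_len, dim_embed].
  return 0;
}
```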

void YoloBoxXPUInferMeta(const MetaTensor& x,
const MetaTensor& x_max,
const MetaTensor& grid,
43 changes: 43 additions & 0 deletions paddle/phi/infermeta/fusion.h
@@ -611,6 +611,49 @@ void FusedMultiTransformerInt8XpuInferMeta(
MetaTensor* out,
std::vector<MetaTensor*> cache_kv_out);

void FusedMultiTransformerInt8InferMeta(
const MetaTensor& x,
const std::vector<const MetaTensor*>& ln_scale,
const std::vector<const MetaTensor*>& ln_bias,
const std::vector<const MetaTensor*>& qkv_w,
const paddle::optional<std::vector<const MetaTensor*>>& qkv_bias,
const paddle::optional<std::vector<const MetaTensor*>>& cache_kv,
const MetaTensor& time_step,
const MetaTensor& src_mask,
const std::vector<const MetaTensor*>& out_linear_w,
const paddle::optional<std::vector<const MetaTensor*>>& out_linear_bias,
const std::vector<const MetaTensor*>& ffn_ln_scale,
const std::vector<const MetaTensor*>& ffn_ln_bias,
const std::vector<const MetaTensor*>& ffn1_weight,
const paddle::optional<std::vector<const MetaTensor*>>& ffn1_bias,
const std::vector<const MetaTensor*>& ffn2_weight,
const paddle::optional<std::vector<const MetaTensor*>>& ffn2_bias,
const paddle::optional<std::vector<const MetaTensor*>>& qkv_out_scale,
const paddle::optional<std::vector<const MetaTensor*>>&
out_linear_out_scale,
const paddle::optional<std::vector<const MetaTensor*>>& ffn1_out_scale,
const paddle::optional<std::vector<const MetaTensor*>>& ffn2_out_scale,
bool pre_layer_norm,
float epsilon,
float dropout_rate,
bool is_test,
const std::string& dropout_implementation,
const std::string& act_method,
bool trans_qkvw,
int ring_id,
int num_head,
int dim_head,
int dim_ffn,
const std::vector<float>& qkv_in_scale,
const std::vector<float>& out_linear_in_scale,
const std::vector<float>& ffn1_in_scale,
const std::vector<float>& ffn2_in_scale,
int quant_round_type,
float quant_max_bound,
float quant_min_bound,
std::vector<MetaTensor*> cache_kv_out,
MetaTensor* out);

void YoloBoxXPUInferMeta(const MetaTensor& x,
const MetaTensor& x_max,
const MetaTensor& grid,
2 changes: 2 additions & 0 deletions paddle/phi/kernels/CMakeLists.txt
@@ -223,6 +223,8 @@ if(WITH_ROCM)
"fusion/gpu/fused_bn_add_activation_kernel.cu"
"fusion/gpu/fused_feedforward_kernel.cu"
"fusion/gpu/fused_feedforward_grad_kernel.cu"
"fusion/gpu/fused_multi_transformer_int8_kernel.cu"
"fusion/gpu/fused_multi_transformer_kernel.cu"
"fusion/gpu/fusion_transpose_flatten_concat_kernel.cu")
list(
REMOVE_ITEM