Multihead matmul sparse #5

Closed · wants to merge 19 commits
Changes from 15 commits
1 change: 1 addition & 0 deletions paddle/fluid/framework/ir/CMakeLists.txt
@@ -101,6 +101,7 @@ pass_library(matmul_scale_fuse_pass inference)
pass_library(gpu_cpu_map_matmul_to_mul_pass inference)
pass_library(mixed_precision_configure_pass inference)
pass_library(replace_dense_with_sparse_pass inference)
+pass_library(replace_dense_multihead_matmul_with_sparse_pass inference)
Owner: Rename replace_dense_with_sparse_pass to replace_dense_fc_with_sparse_pass, or merge replace_dense_multihead_matmul_with_sparse_pass into replace_dense_with_sparse_pass.

Author: Done — replace_dense_with_sparse_pass -> replace_dense_fc_with_sparse_pass.
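
Presumably, once the rename lands (confirmed above, though the follow-up commit is not among the 15 shown here), the two registrations in this CMakeLists.txt would read:

pass_library(replace_dense_fc_with_sparse_pass inference)
pass_library(replace_dense_multihead_matmul_with_sparse_pass inference)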

pass_library(generate_pass DEPS pass_desc_proto)
target_link_libraries(generate_pass pass_desc_proto)

38 changes: 38 additions & 0 deletions paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -3449,6 +3449,44 @@ PDNode *patterns::DenseFC::operator()() {
return fc_out;
}

PDNode *patterns::MultiheadMatmul::operator()() {
auto *multihead_matmul = pattern->NewNode(multihead_matmul_repr())
->assert_is_op("multihead_matmul");
// Input
auto *multihead_matmul_input =
pattern->NewNode(multihead_matmul_input_repr())
->AsInput()
->assert_is_op_input("multihead_matmul", "Input");
// Filter
auto *multihead_matmul_weights =
pattern->NewNode(multihead_matmul_weights_repr())
->AsInput()
->assert_is_op_input("multihead_matmul", "W");
// Bias
auto *multihead_matmul_bias =
pattern->NewNode(multihead_matmul_bias_repr())
->AsInput()
->assert_is_op_input("multihead_matmul", "Bias");
// BiasQK
auto *multihead_matmul_biasqk =
pattern->NewNode(multihead_matmul_biasqk_repr())
->AsInput()
->assert_is_op_input("multihead_matmul", "BiasQK");
// Output
auto *multihead_matmul_out =
pattern->NewNode(multihead_matmul_out_repr())
->AsOutput()
->assert_is_op_output("multihead_matmul", "Out")
->assert_is_only_output_of_op("multihead_matmul");

multihead_matmul
->LinksFrom({multihead_matmul_input, multihead_matmul_weights,
multihead_matmul_bias, multihead_matmul_biasqk})
.LinksTo({multihead_matmul_out});

return multihead_matmul_out;
}

} // namespace ir
} // namespace framework
} // namespace paddle
18 changes: 18 additions & 0 deletions paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1958,6 +1958,24 @@ struct DenseFC : public PatternBase {
PATTERN_DECL_NODE(fc_bias);
};

//
// \brief Pattern looking for the fused multihead_matmul op.
//
struct MultiheadMatmul : public PatternBase {
MultiheadMatmul(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "multihead_matmul") {}

PDNode* operator()();

// declare operator node's name
PATTERN_DECL_NODE(multihead_matmul);
PATTERN_DECL_NODE(multihead_matmul_out);
PATTERN_DECL_NODE(multihead_matmul_input);
PATTERN_DECL_NODE(multihead_matmul_weights);
PATTERN_DECL_NODE(multihead_matmul_bias);
PATTERN_DECL_NODE(multihead_matmul_biasqk);
};

} // namespace patterns

// Link two ir::Nodes from each other.
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -864,7 +864,7 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
auto* mul0_op_desc = mul0->Op();

// all mul op has same input.
-if (multihead_op_desc.HasAttr("Input_scale")) {
+if (mul0_op_desc->HasAttr("Input_scale")) {
multihead_op_desc.SetAttr("Input_scale",
mul0_op_desc->GetAttr("Input_scale"));
}
135 changes: 135 additions & 0 deletions paddle/fluid/framework/ir/replace_dense_multihead_matmul_with_sparse_pass.cc
@@ -0,0 +1,135 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/replace_dense_multihead_matmul_with_sparse_pass.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"

namespace paddle {
namespace framework {
namespace ir {

ReplaceDenseMultiheadMatmulWithSparsePass::
ReplaceDenseMultiheadMatmulWithSparsePass() {
AddOpCompat(OpCompat("multihead_matmul"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("W")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.End()
.AddInput("BiasQK")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End();
}

void ReplaceDenseMultiheadMatmulWithSparsePass::ApplyImpl(Graph *graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));

std::string name_scope = "replace_dense_multihead_matmul_with_sparse_pass";
FusePassBase::Init(name_scope, graph);
GraphPatternDetector gpd;

patterns::MultiheadMatmul multihead_matmul_pattern(
gpd.mutable_pattern(), "dense_multihead_matmul_replace_pass");
multihead_matmul_pattern();
int found_multihead_matmul_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(4) << "Replace dense multihead matmul with sparse multihead matmul.";

/* if (!IsCompat(subgraph, g)) {
LOG(WARNING) << "Pass in op compat failed.";
return;
}*/

GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_out, multihead_matmul_out,
multihead_matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul, multihead_matmul,
multihead_matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_input, multihead_matmul_input,
multihead_matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_weights,
multihead_matmul_weights,
multihead_matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_bias, multihead_matmul_bias,
multihead_matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_biasqk, multihead_matmul_biasqk,
multihead_matmul_pattern);

auto *multihead_matmul_op = multihead_matmul->Op();
auto w_name = multihead_matmul_op->Input("W")[0];
// Recognize sparse weights by name: 2:4-pruned weights are expected to
// carry a "sparse_2_4" marker in their variable name.
if (w_name.find("sparse_2_4") != w_name.npos) {
// Build the replacement sparse_multihead_matmul op desc.
OpDesc desc(multihead_matmul_op->Block());
desc.SetType("sparse_multihead_matmul");
desc.SetInput("Input", {multihead_matmul_input->Name()});
desc.SetInput("W", {multihead_matmul_weights->Name()});
desc.SetInput("Bias", {multihead_matmul_bias->Name()});
desc.SetInput("BiasQK", {multihead_matmul_biasqk->Name()});
desc.SetOutput("Out", {multihead_matmul_out->Name()});

// copy all attr
desc.SetAttr("alpha", multihead_matmul_op->GetAttr("alpha"));
desc.SetAttr("head_number", multihead_matmul_op->GetAttr("head_number"));
if (multihead_matmul_op->HasAttr("Input_scale")) {
desc.SetAttr("Input_scale",
multihead_matmul_op->GetAttr("Input_scale"));
}
if (multihead_matmul_op->HasAttr("fc_out_threshold")) {
desc.SetAttr("fc_out_threshold",
multihead_matmul_op->GetAttr("fc_out_threshold"));
}
if (multihead_matmul_op->HasAttr("qkv2context_plugin_int8")) {
desc.SetAttr("qkv2context_plugin_int8",
multihead_matmul_op->GetAttr("qkv2context_plugin_int8"));
}
if (multihead_matmul_op->HasAttr("dp_probs")) {
desc.SetAttr("dp_probs", multihead_matmul_op->GetAttr("dp_probs"));
}
if (multihead_matmul_op->HasAttr("out_threshold")) {
desc.SetAttr("out_threshold",
multihead_matmul_op->GetAttr("out_threshold"));
}
desc.Flush();
GraphSafeRemoveNodes(g, {multihead_matmul});
auto sparse_multihead_matmul_node = g->CreateOpNode(&desc);

IR_NODE_LINK_TO(multihead_matmul_input, sparse_multihead_matmul_node);
IR_NODE_LINK_TO(multihead_matmul_weights, sparse_multihead_matmul_node);
IR_NODE_LINK_TO(multihead_matmul_bias, sparse_multihead_matmul_node);
IR_NODE_LINK_TO(multihead_matmul_biasqk, sparse_multihead_matmul_node);
IR_NODE_LINK_TO(sparse_multihead_matmul_node, multihead_matmul_out);
found_multihead_matmul_count++;
}
};

gpd(graph, handler);
AddStatis(found_multihead_matmul_count);
}

} // namespace ir
} // namespace framework
} // namespace paddle

REGISTER_PASS(replace_dense_multihead_matmul_with_sparse_pass,
paddle::framework::ir::ReplaceDenseMultiheadMatmulWithSparsePass);
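
For context: this pass is appended to kTRTSubgraphPasses (see paddle_pass_builder.cc below), so it runs automatically when the TensorRT engine is enabled. A minimal sketch of exercising it through the C++ inference API — the model directory is a placeholder for a model whose multihead_matmul weights carry the "sparse_2_4" name marker, not something shipped with this PR:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  // Hypothetical model directory; weights must be pre-pruned to 2:4 sparsity.
  paddle_infer::Config config("./bert_sparse_2_4_model");
  config.EnableUseGpu(256 /*initial pool, MB*/, 0 /*gpu_id*/);
  // Enabling TensorRT selects kTRTSubgraphPasses, which now includes
  // replace_dense_multihead_matmul_with_sparse_pass.
  config.EnableTensorRtEngine(1 << 30 /*workspace*/, 1 /*max_batch*/,
                              3 /*min_subgraph_size*/,
                              paddle_infer::PrecisionType::kHalf,
                              false /*use_static*/, false /*use_calib_mode*/);
  auto predictor = paddle_infer::CreatePredictor(config);
  return 0;
}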
46 changes: 46 additions & 0 deletions paddle/fluid/framework/ir/replace_dense_multihead_matmul_with_sparse_pass.h
@@ -0,0 +1,46 @@
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */

#pragma once

#include <string>

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

namespace paddle {
namespace framework {
namespace ir {

/**
* Replace dense multihead_matmul op with sparse multihead_matmul op
*/
class Graph;

class ReplaceDenseMultiheadMatmulWithSparsePass : public FusePassBase {
public:
ReplaceDenseMultiheadMatmulWithSparsePass();

protected:
void ApplyImpl(ir::Graph* graph) const override;

const std::string name_scope_{
"replace_dense_multihead_matmul_with_sparse_pass"};
};

} // namespace ir
} // namespace framework
} // namespace paddle
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -1767,6 +1767,7 @@ USE_TRT_CONVERTER(roll)
USE_TRT_CONVERTER(strided_slice)
#if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000)
USE_TRT_CONVERTER(sparse_fc)
+USE_TRT_CONVERTER(sparse_multihead_matmul)
#endif
#endif

41 changes: 21 additions & 20 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -91,26 +91,27 @@ const std::vector<std::string> kTRTSubgraphPasses({
"delete_quant_dequant_linear_op_pass", //
"add_support_int8_pass", //
// "fc_fuse_pass", //
"simplify_with_basic_ops_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
"preln_embedding_eltwise_layernorm_fuse_pass", //
"multihead_matmul_fuse_pass_v2", //
"multihead_matmul_fuse_pass_v3", //
"skip_layernorm_fuse_pass", //
"preln_skip_layernorm_fuse_pass", //
"conv_bn_fuse_pass", //
"unsqueeze2_eltwise_fuse_pass", //
"trt_squeeze2_matmul_fuse_pass", //
"trt_reshape2_matmul_fuse_pass", //
"trt_flatten2_matmul_fuse_pass", //
"trt_map_matmul_v2_to_mul_pass", //
"trt_map_matmul_v2_to_matmul_pass", //
"trt_map_matmul_to_mul_pass", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"replace_dense_with_sparse_pass", //
"tensorrt_subgraph_pass", //
"conv_bn_fuse_pass", //
"simplify_with_basic_ops_pass", //
"embedding_eltwise_layernorm_fuse_pass", //
"preln_embedding_eltwise_layernorm_fuse_pass", //
"multihead_matmul_fuse_pass_v2", //
"multihead_matmul_fuse_pass_v3", //
"skip_layernorm_fuse_pass", //
"preln_skip_layernorm_fuse_pass", //
"conv_bn_fuse_pass", //
"unsqueeze2_eltwise_fuse_pass", //
"trt_squeeze2_matmul_fuse_pass", //
"trt_reshape2_matmul_fuse_pass", //
"trt_flatten2_matmul_fuse_pass", //
"trt_map_matmul_v2_to_mul_pass", //
"trt_map_matmul_v2_to_matmul_pass", //
"trt_map_matmul_to_mul_pass", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"replace_dense_with_sparse_pass", //
"replace_dense_multihead_matmul_with_sparse_pass", //
"tensorrt_subgraph_pass", //
"conv_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
// cudnn8.0 has memory leak problem in conv + eltwise + act, so we
2 changes: 1 addition & 1 deletion paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -6,6 +6,6 @@ nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPE
endif()
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
-nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
+nv_test(test_tensorrt_engine SRCS test_engine.cc test_dynamic_engine DEPS dynload_cuda tensorrt_engine tensorrt_plugin)
Owner: Missing .cc?

Author: Done, thanks.
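
The fix is presumably just the missing extension, applied in a later commit not shown in this diff:

nv_test(test_tensorrt_engine SRCS test_engine.cc test_dynamic_engine.cc DEPS dynload_cuda tensorrt_engine tensorrt_plugin)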

add_subdirectory(plugin)
add_subdirectory(convert)
2 changes: 1 addition & 1 deletion paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -10,7 +10,7 @@ conv3d_op.cc mish_op.cc nearest_interp_v2_op.cc pool3d_op.cc deformable_conv_op.
preln_skip_layernorm.cc strided_slice_op.cc roll_op.cc)

if (CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8)
-list(APPEND CONVERT_FILES sparse_fc_op.cc)
+list(APPEND CONVERT_FILES sparse_fc_op.cc sparse_multihead_matmul_op.cc)
endif()

nv_library(tensorrt_converter
Expand Down
18 changes: 15 additions & 3 deletions paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc
@@ -255,15 +255,27 @@

float* bias_data = nullptr;
int bias_num = 0;
+void* b_data = nullptr;
if (with_bias) {
auto* b_v = scope.GetVar(op_desc.Input("Bias").front());
auto* b_t = b_v->GetMutable<framework::LoDTensor>();
bias_data = engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t);
bias_num = b_t->numel();

+half* half_bias_data = nullptr;
+if (with_fp16) {
+  half_bias_data = new half[bias_num];
+  for (int i = 0; i < bias_num; i++) {
+    half_bias_data[i] = static_cast<half>(bias_data[i]);
+  }
+  b_data = static_cast<void*>(half_bias_data);
+} else {
+  b_data = static_cast<void*>(bias_data);
+}
}
-TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
-                            static_cast<void*>(bias_data),
-                            static_cast<size_t>(bias_num)};
+TensorRTEngine::Weight bias{
+    with_fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT,
+    b_data, static_cast<size_t>(bias_num)};

// Running the TRT Static Shape mode: x_num_col_dims-1
if (!engine_->with_dynamic_shape()) {
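
A note on the bias handling above: a TensorRT weight is just a tagged pointer, so a weight declared as kHALF must point at data actually stored in half precision — hence the element-wise float-to-half copy before the Weight is constructed. A standalone sketch of the same idea with raw nvinfer1 types (illustrative only; Paddle's TensorRTEngine::Weight wraps this, and TensorRT requires the buffer to outlive engine building):

#include <cuda_fp16.h>

#include <NvInfer.h>

nvinfer1::Weights MakeBiasWeight(const float* data, int n, bool with_fp16) {
  nvinfer1::Weights w;
  w.count = n;
  if (with_fp16) {
    // Convert element-wise; the caller must keep this buffer alive until
    // the TensorRT engine has been built.
    half* h = new half[n];
    for (int i = 0; i < n; ++i) h[i] = static_cast<half>(data[i]);
    w.type = nvinfer1::DataType::kHALF;
    w.values = h;
  } else {
    w.type = nvinfer1::DataType::kFLOAT;
    w.values = data;
  }
  return w;
}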