[FEATURE] Add interleaved batch_dot oneDNN fuses for new GluonNLP mod…

…els (#20312) * Add self attention fuse with oneDNN support * Make ConvertWeightBias2MKLDNN inline function * Add new fp32 ops names to amp list * Switch to forward interface
apache · Jun 29, 2021 · 1d0bdfd · 1d0bdfd
1 parent 38e1416
commit 1d0bdfd
Show file tree

Hide file tree

Showing 10 changed files with 1,706 additions and 2 deletions.
diff --git a/python/mxnet/amp/lists/symbol_fp16.py b/python/mxnet/amp/lists/symbol_fp16.py
@@ -618,6 +618,8 @@
     FP32_FUNCS.extend([
         '_sg_mkldnn_conv',
         '_sg_mkldnn_fully_connected',
+        '_sg_mkldnn_selfatt_qk',
+        '_sg_mkldnn_selfatt_valatt',
     ])
 
 # Functions that have to be cast to FP32 only for

diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py
@@ -855,7 +855,7 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full', quantize
     while True:
         try:
             network(*data_nd)
-        except TypeError as err:
+        except (ValueError, TypeError) as err:
             if logger:
                 logger.warning(err)
                 logger.warning("Deduced input data descriptors failed to run forward pass."

diff --git a/src/operator/subgraph/mkldnn/mkldnn_common.h b/src/operator/subgraph/mkldnn/mkldnn_common.h
@@ -28,6 +28,7 @@
 #define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_COMMON_H_
 #if MXNET_USE_ONEDNN == 1
 #include <vector>
+#include "../../numpy/np_matrix_op-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -86,7 +87,7 @@ static std::vector<float> GetWeightScales(const NDArray &weight, const NDArray *
   return weight_scales;
 }
 
-static void ConvertWeightBias2MKLDNN(NDArray *weight, NDArray *bias, bool has_bias,
+static inline void ConvertWeightBias2MKLDNN(NDArray *weight, NDArray *bias, bool has_bias,
                                      const mkldnn::memory::desc &weight_md,
                                      const mkldnn::memory::desc *bias_md,
                                      const int num_group, float data_scale,
@@ -131,6 +132,34 @@ static void ConvertWeightBias2MKLDNN(NDArray *weight, NDArray *bias, bool has_bi
   if (has_bias && data_scale) *bias = new_bias;
 }
 
+
+static inline bool CheckReshapeConditions(const nnvm::Node& node, const index_t out_index) {
+  const index_t split_output_index = node.inputs[0].index;
+  if (split_output_index != out_index)
+    return false;
+
+  const auto &reshape_param = nnvm::get<NumpyXReshapeParam>(node.attrs.parsed);
+  const auto newshape = reshape_param.newshape;
+
+  if (newshape.ndim() != 4 || !(newshape[0] == newshape[1] && newshape[0] == -2))
+    return false;
+
+  return true;
+}
+
+static inline bool CheckSwapAxisConditions(const nnvm::Node& node) {
+  auto params = node.attrs.dict;
+  int dim1 = 0, dim2 = 0;
+  if (params.count("dim1") && params.count("dim2")) {
+    dim1 = std::stoi(params.at("dim1"));
+    dim2 = std::stoi(params.at("dim2"));
+  } else {
+    return false;
+  }
+
+  return ((dim1 == 1 && dim2 == 2) || (dim1 == 2 && dim2 == 1));
+}
+
 }  // namespace op
 }  // namespace mxnet
 

diff --git a/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc b/src/operator/subgraph/mkldnn/mkldnn_subgraph_property.cc
@@ -26,6 +26,9 @@
 #include "mkldnn_fc_post_quantize_property.h"
 #include "mkldnn_elemwisemul_post_quantize_property.h"
 #include "mkldnn_post_quantize_align_scale_property.h"
+#include "mkldnn_transformer_qk_property.h"
+#include "mkldnn_transformer_valatt_property.h"
+#include "mkldnn_transformer_post_quantize_property.h"
 
 namespace mxnet {
 namespace op {
@@ -37,6 +40,8 @@ MXNET_REGISTER_SUBGRAPH_BACKEND(MKLDNN)
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNConvProperty);
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNFCProperty);
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNBNReLUProperty);
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNTransformerQKProperty);
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN, SgMKLDNNTransformerValAttProperty);
 
 MXNET_REGISTER_SUBGRAPH_BACKEND(MKLDNN_QUANTIZE)
 .set_attr("context", Context::CPU());
@@ -46,11 +51,15 @@ MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNConvProperty)
 
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNFCProperty)
 .set_attr("quantize", true);
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNTransformerQKProperty);
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNTransformerValAttProperty);
 
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeProperty);
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNFCPostQuantizeProperty);
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, ElemwiseMulPostQuantizeProperty);
 MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNPostQuantizeAlignScaleProperty);
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QUANTIZE, SgMKLDNNTransformerPostQuantizeProperty)
+.set_attr("quantize", true);
 
 }  // namespace op
 }  // namespace mxnet

diff --git a/src/operator/subgraph/mkldnn/mkldnn_transformer-inl.h b/src/operator/subgraph/mkldnn/mkldnn_transformer-inl.h
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_INL_H_
+#define MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_INL_H_
+
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+
+
+namespace mxnet {
+namespace op {
+
+struct MKLDNNSelfAttParam : public dmlc::Parameter<MKLDNNSelfAttParam> {
+  int heads;
+  bool quantized;
+  bool enable_float_output;
+  dmlc::optional<float> min_calib_range;  // min float value calculated from calibration dataset
+  dmlc::optional<float> max_calib_range;  // max float value calculated from calibration dataset
+  DMLC_DECLARE_PARAMETER(MKLDNNSelfAttParam) {
+    DMLC_DECLARE_FIELD(heads)
+    .describe("Set number of heads.");
+    DMLC_DECLARE_FIELD(quantized).set_default(false)
+    .describe("Whether it's a quantized self attention matmul operator.");
+    DMLC_DECLARE_FIELD(enable_float_output).set_default(false)
+    .describe("Whether to enable float32 output.");
+    DMLC_DECLARE_FIELD(min_calib_range)
+    .set_default(dmlc::optional<float>())
+    .describe("The minimum scalar value in the form of float32 obtained "
+              "through calibration. If present, it will be used to by "
+              "quantized self-attention op to calculate primitive scale.");
+    DMLC_DECLARE_FIELD(max_calib_range)
+    .set_default(dmlc::optional<float>())
+    .describe("The maximum scalar value in the form of float32 obtained "
+              "through calibration. If present, it will be used to by "
+              "quantized self-attention op to calculate primitive scale.");
+  }
+};
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_SUBGRAPH_MKLDNN_MKLDNN_TRANSFORMER_INL_H_