[External codegen] Add test cases for fused ops with manual annotation #4741

Closed · wants to merge 18 commits
1 change: 0 additions & 1 deletion python/tvm/relay/build_module.py
@@ -158,7 +158,6 @@ def optimize(self, func, target=None, params=None):

return mod, params


def _set_params(self, params):
self._set_params_func(_convert_param_map(params))

145 changes: 89 additions & 56 deletions src/relay/backend/contrib/dnnl/codegen.cc
@@ -31,6 +31,7 @@

#include <fstream>
#include <sstream>
#include <numeric>

#include "../codegen_c/codegen_c.h"

@@ -50,82 +51,109 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase {
out_.push_back({node->name_hint(), 0});
}

void VisitExpr_(const TupleGetItemNode* op) final {
// Do nothing
}

void VisitExpr_(const CallNode* call) final {
std::ostringstream decl_stream;
std::ostringstream buf_stream;
// Args: ID
std::vector<std::string> args;
struct Output {
std::string decl, buf;
int out_size = 1;
std::string out;
};

auto generate_body = [=](const CallNode* root_call, const std::string& func_name,
const std::vector<std::string>& args,
const std::vector<std::string>& fused_func_args) {
// Make function call with input buffers when visiting arguments
bool first = true;
std::ostringstream arg_stream;
arg_stream << "(";
for (size_t i = 0; i < root_call->args.size(); ++i) {
VisitExpr(root_call->args[i]);
for (auto out : out_) {
if (!first) {
arg_stream << ", ";
}
first = false;
arg_stream << out.first;
}
}

for (auto arg_name : fused_func_args) {
arg_stream << ", " << arg_name;
}

// Analyze the output buffer
auto type_node = root_call->checked_type().as<TensorTypeNode>();
CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32))
<< "Only support single output tensor with float type";

auto out_shape = GetShape(root_call->checked_type());

Output ret;
ret.out = "buf_" + std::to_string(buf_idx_++);
ret.out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, std::multiplies<int>());

this->PrintIndents();

std::ostringstream buf_stream;
buf_stream << "float* " << ret.out << " = (float*)std::malloc(4 * " << ret.out_size << ");";
ret.buf = buf_stream.str();

// Get the arguments for various DNNL kernels.
if (IsOp(call, "nn.conv2d")) {
decl_stream << "dnnl_conv2d";
args = Conv2d(call);
arg_stream << ", " << ret.out;
// Attach attribute arguments
for (size_t i = 0; i < args.size(); ++i) {
arg_stream << ", " << args[i];
}
arg_stream << ");";
ret.decl = func_name + arg_stream.str();

return ret;
};

Output ret;
if (auto conv_call = DetectFusedConv2DBiasReLU(call)) {
Member:

I am not sure we really want to handle fused ops from Relay in external codegen. This looks quite ad hoc to me; you may have countless combinations.

Member Author (@masahi, Jan 19, 2020):

The idea is for it to serve as an example of handling fused ops inside external codegen. I assume the DNNL backend itself is not meant to be used in production; the purpose is to be a more realistic example than CodegenC, so I thought why not add an example of how to handle fused ops. I never intended to cover other fusion cases.

Since we are going out of our way for new backend implementers (who might not be familiar with TVM internals) by adding convenient op-level annotation, a semi-automatic fusion mechanism, etc., I don't think it is reasonable to expect them to figure out how to handle more complicated but common cases (like fusion) and everything else on their own. Hope this makes sense.

Member Author (@masahi, Jan 19, 2020):

Another usage scenario that I think is going to be common is translation from quantized Relay models. It would be great to add an example of translating QNN subgraphs into backend implementations; without it, it is not obvious how to go about this.

Since DNNL has quantization support and everyone can use it, it would serve as a good example and test case.

Contributor:

While I agree with you that it's fine to handle fusion in this DNNL codegen, I also agree with @zhiics that the current implementation is a bit too ad hoc, even if it's only used for demo purposes for now. As you have implemented, MKL-DNN uses set_post_ops to attach the ops to be fused. I think this part could be more general. For example:

if call == "relu":
    visit(arg)
    if this->curr_layer == "conv2d":
        generate_post_ops(call)
    else:
        generate_a_layer(call)

In this way, the codegen is able to deal with all MKL-DNN-supported conv2d fusions (conv2d, conv2d+add, conv2d+add+relu). We could still put heuristic pattern annotations in the annotator and improve them gradually; I like the one you made for conv2d+bias+relu in this PR, for instance.
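
For reference, the following is a minimal, illustrative sketch (not part of this PR) of how DNNL's post_ops API can express the conv2d+add+relu chain mentioned above. It assumes the DNNL v1.x C++ API, and the helper name make_conv_add_relu_attr is made up for this example:

#include <dnnl.hpp>

using namespace dnnl;

// Build a primitive_attr that fuses an elementwise add ("sum" post-op) and a
// ReLU eltwise post-op into a single conv2d primitive. The resulting attr is
// passed to convolution_forward::primitive_desc, as dnnl_conv2d_common does
// further down in this diff.
primitive_attr make_conv_add_relu_attr() {
  post_ops ops;
  ops.append_sum(1.f);                                         // out += prior contents of the destination buffer
  ops.append_eltwise(1.f, algorithm::eltwise_relu, 0.f, 0.f);  // then apply ReLU in place
  primitive_attr attr;
  attr.set_post_ops(ops);
  return attr;
}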

Member Author:

Yeah, this is my minimal-effort way to detect only the pattern I care about. I will think about how to make it more general.

Member Author:

I can go ahead and implement this, but that would duplicate pattern-matching logic that I already have in my Python annotator. That sounds bad, and it would become a perfect example of the anti-pattern mentioned in the RFC below :)

I think I should close this one and wait for a better solution to be ready. I will wait for your input for now @comaniac @zhiics

https://discuss.tvm.ai/t/rfc-external-codegen-defining-composite-relay-operators/5470/

Member:

Yeah, I had a brief discussion with @u99127 before. I will read the discussion more carefully, and then we can probably discuss from there and try to reach consensus on a design/implementation. Sorry for being late/slow; I am on vacation.

Member Author:

I can also leave the current dumb implementation as it is, with the understanding that

  • This is a temporary solution
  • It will serve as a concrete motivation and test case for validating a more general mechanism to be introduced

Trying to be a bit more clever and duplicating the entire state-machine logic here does not seem worth it to me anymore. Either way I'm fine.

ret = generate_body(conv_call, "dnnl_fused_conv2d_bias_relu",
FusedConv2dBiasReLU(conv_call), ext_fused_func_args_);
} else if (IsOp(call, "nn.conv2d")) {
ret = generate_body(call, "dnnl_conv2d", Conv2d(call), {});
} else if (IsOp(call, "nn.dense")) {
decl_stream << "dnnl_dense";
args = Dense(call);
ret = generate_body(call, "dnnl_dense", Dense(call), {});
} else if (IsOp(call, "nn.relu")) {
decl_stream << "dnnl_relu";
args = Relu(call);
ret = generate_body(call, "dnnl_relu", Relu(call), {});
} else if (IsOp(call, "nn.batch_norm")) {
decl_stream << "dnnl_bn";
args = BatchNorm(call);
ret = generate_body(call, "dnnl_bn", BatchNorm(call), {});
} else if (IsOp(call, "add")) {
decl_stream << "dnnl_add";
args = Add(call);
ret = generate_body(call, "dnnl_add", Add(call), {});
} else {
LOG(FATAL) << "Unsupported op: " << AsText(call->op, false);
}

// Make function call with input buffers when visiting arguments
bool first = true;
decl_stream << "(";
for (size_t i = 0; i < call->args.size(); ++i) {
VisitExpr(call->args[i]);
for (auto out : out_) {
if (!first) {
decl_stream << ", ";
}
first = false;
decl_stream << out.first;
}
}

// Analyze the output buffer
auto type_node = call->checked_type().as<TensorTypeNode>();
CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32))
<< "Only support single output tensor with float type";
std::string out = "buf_" + std::to_string(buf_idx_++);
auto out_shape = GetShape(call->checked_type());
int out_size = 1;
for (size_t i = 0; i < out_shape.size(); ++i) {
out_size *= out_shape[i];
}
this->PrintIndents();
buf_stream << "float* " << out << " = (float*)std::malloc(4 * " << out_size << ");";
buf_decl_.push_back(buf_stream.str());
decl_stream << ", " << out;

// Attach attribute arguments
for (size_t i = 0; i < args.size(); ++i) {
decl_stream << ", " << args[i];
}
decl_stream << ");";
ext_func_body.push_back(decl_stream.str());
buf_decl_.push_back(ret.buf);
ext_func_body.push_back(ret.decl);

// Update output buffer
out_.clear();
out_.push_back({out, out_size});
out_.push_back({ret.out, ret.out_size});
}

std::string JIT(void) {
ext_func_args_.insert(ext_func_args_.end(), ext_fused_func_args_.begin(),
ext_fused_func_args_.end());
return JitImpl(ext_func_id_, ext_func_args_, buf_decl_, ext_func_body, out_);
}

private:
const CallNode* DetectFusedConv2DBiasReLU(const CallNode* call) {
if (!IsOp(call, "nn.relu")) return nullptr;
auto relu_arg = call->args[0];
const CallNode* add_call = relu_arg.as<CallNode>();
if (!add_call || !IsOp(add_call, "add")) return nullptr;
auto add_arg = add_call->args[0];
const CallNode* conv_call = add_arg.as<CallNode>();
if (!conv_call || !IsOp(conv_call, "nn.conv2d")) return nullptr;
auto bias_name = "dnnl_fused_input" + std::to_string(ext_fused_func_args_.size());
ext_fused_func_args_.push_back(bias_name);
return conv_call;
}

std::vector<std::string> Conv2d(const CallNode* call) {
std::vector<std::string> args;
const auto* conv2d_attr = call->attrs.as<Conv2DAttrs>();
@@ -152,6 +180,10 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase {
return args;
}

std::vector<std::string> FusedConv2dBiasReLU(const CallNode* call) {
return Conv2d(call);
}

std::vector<std::string> Dense(const CallNode* call) {
std::vector<std::string> args;
auto ishape = GetShape(call->args[0]->checked_type());
@@ -214,6 +246,7 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase {
int buf_idx_{0};
/*! \brief The arguments used by a wrapped function that calls DNNL kernels. */
std::vector<std::string> ext_func_args_;
std::vector<std::string> ext_fused_func_args_;
/*! \brief statement of the function that will be compiled using DNNL kernels. */
std::vector<std::string> ext_func_body;
/*! \brief The declaration of intermediate buffers. */
63 changes: 45 additions & 18 deletions src/runtime/contrib/dnnl/dnnl.cc
@@ -52,10 +52,10 @@ inline void read_from_dnnl_memory(void* handle, const memory& mem) {
std::copy(src, src + bytes, reinterpret_cast<uint8_t*>(handle));
}

extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_,
int p_C_, int p_H_, int p_W_, int p_O_, int p_G_,
int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_,
int p_Sh_, int p_Sw_) {
void dnnl_conv2d_common(float* data, float* weights, float* bias, float* out,
int p_N_, int p_C_, int p_H_, int p_W_,
int p_O_, int p_G_, int p_Ph_, int p_Pw_, int p_Kh_,
int p_Kw_, int p_Sh_, int p_Sw_, primitive_attr attr) {
using tag = memory::format_tag;
using dt = memory::data_type;
engine eng(engine::kind::cpu, 0);
@@ -65,32 +65,26 @@ extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_,
memory::dims conv2d_weights_tz = {p_O_, p_C_, p_Kh_, p_Kw_};
if (p_G_ > 1) conv2d_weights_tz = {p_G_, 1, p_C_ / p_G_, p_Kh_, p_Kw_};
memory::dims conv2d_bias_tz = {p_O_};
memory::dims conv2d_dst_tz = {p_N_, p_O_,
(p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_,
memory::dims conv2d_dst_tz = {p_N_, p_O_, (p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_,
(p_W_ - p_Kw_ + 2 * p_Pw_ + p_Sw_) / p_Sw_};
memory::dims conv2d_strides = {p_Sh_, p_Sw_};
memory::dims conv2d_padding = {p_Ph_, p_Pw_};

std::vector<float> conv2d_bias(p_O_, 0);

auto user_src_memory =
memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data);
auto user_weights_memory = memory(
{{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? tag::goihw : tag::oihw}, eng,
weights);
auto user_src_memory = memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data);
auto user_weights_memory =
memory({{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? tag::goihw : tag::oihw}, eng, weights);
auto conv2d_user_bias_memory =
memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, conv2d_bias.data());
memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, bias);

auto conv2d_src_md = memory::desc({conv2d_src_tz}, dt::f32, tag::any);
auto conv2d_bias_md = memory::desc({conv2d_bias_tz}, dt::f32, tag::any);
auto conv2d_weights_md = memory::desc({conv2d_weights_tz}, dt::f32, tag::any);
auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::nchw);

auto conv2d_desc = convolution_forward::desc(
prop_kind::forward_inference, algorithm::convolution_direct,
conv2d_src_md, conv2d_weights_md, conv2d_bias_md, conv2d_dst_md,
conv2d_strides, conv2d_padding, conv2d_padding);
auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, eng);
prop_kind::forward_inference, algorithm::convolution_direct, conv2d_src_md, conv2d_weights_md,
conv2d_bias_md, conv2d_dst_md, conv2d_strides, conv2d_padding, conv2d_padding);
auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, attr, eng);

auto conv2d_src_memory = user_src_memory;
auto conv2d_weights_memory = user_weights_memory;
@@ -105,6 +99,39 @@ extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_,
read_from_dnnl_memory(out, conv2d_dst_memory);
}

extern "C" void dnnl_conv2d(float* data, float* weights, float* out,
int p_N_, int p_C_, int p_H_, int p_W_,
int p_O_, int p_G_, int p_Ph_, int p_Pw_,
int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_) {
primitive_attr attr;
std::vector<float> bias(p_O_, 0);
return dnnl_conv2d_common(data, weights, bias.data(), out,
p_N_, p_C_, p_H_, p_W_, p_O_, p_G_,
p_Ph_, p_Pw_, p_Kh_, p_Kw_, p_Sh_, p_Sw_,
attr);
}

primitive_attr create_attr_with_relu_post_op() {
post_ops ops;
ops.append_eltwise(1.f, algorithm::eltwise_relu, 0.f, 0.f);

primitive_attr attr;
attr.set_post_ops(ops);

return attr;
}

extern "C" void dnnl_fused_conv2d_bias_relu(float* data, float* weights, float* bias, float* out,
int p_N_, int p_C_, int p_H_, int p_W_, int p_O_,
int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_,
int p_Sh_, int p_Sw_) {
return dnnl_conv2d_common(data, weights, bias, out,
p_N_, p_C_, p_H_, p_W_,
p_O_, p_G_, p_Ph_, p_Pw_,
p_Kh_, p_Kw_, p_Sh_, p_Sw_,
create_attr_with_relu_post_op());
}

extern "C" void dnnl_dense(float* data, float* weight, float* out, int p_B_,
int p_I_, int p_O_) {
using tag = memory::format_tag;
6 changes: 6 additions & 0 deletions src/runtime/contrib/dnnl/dnnl_kernel.h
@@ -38,6 +38,12 @@ extern "C" TVM_DLL void dnnl_conv2d(float* data, float* weights, float* out, int
int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_,
int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_);

extern "C" TVM_DLL void dnnl_fused_conv2d_bias_relu(float* data, float* weights, float* bias,
float* out, int p_N_, int p_C_, int p_H_,
int p_W_, int p_O_, int p_G_, int p_Ph_,
int p_Pw_, int p_Kh_, int p_Kw_, int p_Sh_,
int p_Sw_);

extern "C" TVM_DLL void dnnl_dense(float* data, float* weight, float* out, int p_B_, int p_I_,
int p_O_);
