From c2a0a0a7fa8668d4a0a6a8850e2f0851bd574ea3 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 11 Apr 2020 07:38:07 +0900 Subject: [PATCH 1/3] merge change from dev branch --- .../backend/contrib/codegen_c/codegen.cc | 5 +- .../backend/contrib/codegen_c/codegen_c.h | 35 -- src/relay/backend/contrib/dnnl/codegen.cc | 361 ++++++++++++------ src/relay/backend/utils.h | 72 +++- src/runtime/contrib/dnnl/dnnl.cc | 66 +++- src/runtime/contrib/dnnl/dnnl_kernel.h | 11 + .../python/relay/test_pass_partition_graph.py | 125 +++++- 7 files changed, 487 insertions(+), 188 deletions(-) diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc index 97231dfe3401..500e0dcb0c9a 100644 --- a/src/relay/backend/contrib/codegen_c/codegen.cc +++ b/src/relay/backend/contrib/codegen_c/codegen.cc @@ -19,19 +19,22 @@ #include #include #include -#include #include +#include #include #include #include +#include "../../utils.h" #include "codegen_c.h" namespace tvm { namespace relay { namespace contrib { +using namespace backend; + /*! * \brief An example codegen that is only used for quick prototyping and testing * purpose. Only several binary options are covered. Users diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h index 1db3f20ef05b..1b953f3c4467 100644 --- a/src/relay/backend/contrib/codegen_c/codegen_c.h +++ b/src/relay/backend/contrib/codegen_c/codegen_c.h @@ -169,41 +169,6 @@ class CodegenCBase { */ virtual std::string JIT() = 0; - /*! - * \brief Extract the shape from a Relay tensor type. - * - * \param type The provided type. - * - * \return The extracted shape in a list. - */ - std::vector GetShape(const Type& type) const { - const auto* ttype = type.as(); - CHECK(ttype) << "Expect TensorTypeNode"; - std::vector shape; - for (size_t i = 0; i < ttype->shape.size(); ++i) { - auto* val = ttype->shape[i].as(); - CHECK(val); - shape.push_back(val->value); - } - return shape; - } - - /*! - * \brief Check if a call has the provided name. - * - * \param call A Relay call node. - * \param op_name The name of the expected call. - * - * \return true if the call's name is equivalent to the given name. Otherwise, - * false. - */ - bool IsOp(const CallNode* call, const std::string& op_name) const { - const auto* op_node = call->op.as(); - CHECK(op_node) << "Expects a single op."; - Op op = GetRef(op_node); - return op == Op::Get(op_name); - } - /*! * \brief A common interface that is used by various external runtime to * generate the wrapper to invoke external kernels. 
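The two helpers deleted from codegen_c.h above are not dropped; they move to src/relay/backend/utils.h later in this patch so that multiple external codegens can share them. Since the angle-bracket template arguments do not survive in this rendering of the diff, a sketch of the relocated helpers with their template parameters restored (assuming the usual TVM node types) reads roughly:

    // Extract a static shape from a Relay tensor type.
    inline std::vector<int> GetShape(const Type& type) {
      const auto* ttype = type.as<TensorTypeNode>();
      CHECK(ttype) << "Expect TensorTypeNode";
      std::vector<int> shape;
      for (size_t i = 0; i < ttype->shape.size(); ++i) {
        const auto* val = ttype->shape[i].as<IntImmNode>();
        CHECK(val);
        shape.push_back(val->value);
      }
      return shape;
    }

    // Check whether a call node invokes the op with the given name.
    inline bool IsOp(const CallNode* call, const std::string& op_name) {
      const auto* op_node = call->op.as<OpNode>();
      CHECK(op_node) << "Expects a single op.";
      return GetRef<Op>(op_node) == Op::Get(op_name);
    }

The new GetRootCall helper added to utils.h further below builds on IsOp to walk from the last op of a fused pattern back to its root, e.g. from nn.relu back to nn.conv2d in relu(add(conv2d)).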
diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 73711749d9c4..4fbd4173516c 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -30,14 +30,102 @@ #include #include +#include #include +#include "../../utils.h" #include "../codegen_c/codegen_c.h" namespace tvm { namespace relay { namespace contrib { +using namespace backend; + +inline size_t GetShape1DSize(const Type& type) { + const auto shape = GetShape(type); + return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); +} + +std::vector Conv2d(const CallNode* call) { + std::vector args; + const auto* conv2d_attr = call->attrs.as(); + CHECK(conv2d_attr); + + auto ishape = GetShape(call->args[0]->checked_type()); + auto wshape = GetShape(call->args[1]->checked_type()); + + // Args: N, C, H, W + for (auto s : ishape) { + args.push_back(std::to_string(s)); + } + + // Args: O, G, Ph, Pw, Kh, Kw, Sh, Sw + args.push_back(std::to_string(wshape[0])); + args.push_back(std::to_string(conv2d_attr->groups)); + args.push_back(std::to_string(conv2d_attr->padding[0].as()->value)); + args.push_back(std::to_string(conv2d_attr->padding[1].as()->value)); + args.push_back(std::to_string(wshape[2])); + args.push_back(std::to_string(wshape[3])); + args.push_back(std::to_string(conv2d_attr->strides[0].as()->value)); + args.push_back(std::to_string(conv2d_attr->strides[1].as()->value)); + + return args; +} + +std::vector Dense(const CallNode* call) { + std::vector args; + auto ishape = GetShape(call->args[0]->checked_type()); + auto wshape = GetShape(call->args[1]->checked_type()); + + // Args: N, C, O + args.push_back(std::to_string(ishape[0])); + args.push_back(std::to_string(ishape[1])); + args.push_back(std::to_string(wshape[0])); + + return args; +} + +std::vector Relu(const CallNode* call) { + std::vector args; + auto ishape = GetShape(call->args[0]->checked_type()); + + // Args: N, C, H, W + for (auto s : ishape) { + args.push_back(std::to_string(s)); + } + + return args; +} + +std::vector BatchNorm(const CallNode* call) { + std::vector args; + const auto* bn_attr = call->attrs.as(); + auto ishape = GetShape(call->args[0]->checked_type()); + + // Args: N, C, H, W + for (auto s : ishape) { + args.push_back(std::to_string(s)); + } + + // Args: epsilon + args.push_back(std::to_string(bn_attr->epsilon)); + + return args; +} + +std::vector Add(const CallNode* call) { + std::vector args; + auto ishape = GetShape(call->args[0]->checked_type()); + + // Args: H, W + for (auto s : ishape) { + args.push_back(std::to_string(s)); + } + + return args; +} + // TODO(@zhiics, @comaniac): This is a basic implementation. We should implement // all utilities and make a base class for users to implement. class CodegenDNNL : public ExprVisitor, public CodegenCBase { @@ -53,79 +141,64 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase { } void VisitExpr_(const TupleGetItemNode* op) final { - // Do nothing + VisitExpr(op->tuple); + CHECK(out_.size() > static_cast(op->index)); + + // Only keep the item we want for the child node. + // FIXME(@comaniac): The other items should still be requried for the primary outputs. + auto item = out_[op->index]; + out_.clear(); + out_.push_back(item); } - void VisitExpr_(const CallNode* call) final { - std::ostringstream decl_stream; - std::ostringstream buf_stream; - // Args: ID - std::vector args; - - // Get the arguments for various DNNL kernels. 
- if (IsOp(call, "nn.conv2d")) { - decl_stream << "dnnl_conv2d"; - args = Conv2d(call); - } else if (IsOp(call, "nn.dense")) { - decl_stream << "dnnl_dense"; - args = Dense(call); - } else if (IsOp(call, "nn.relu")) { - decl_stream << "dnnl_relu"; - args = Relu(call); - } else if (IsOp(call, "nn.batch_norm")) { - decl_stream << "dnnl_bn"; - args = BatchNorm(call); - } else if (IsOp(call, "add")) { - decl_stream << "dnnl_add"; - args = Add(call); - } else { - LOG(FATAL) << "Unsupported op: " << AsText(call->op, false); + void VisitExpr_(const ConstantNode* cn) final { + Constant constant = GetRef(cn); + if (visited_.count(constant)) { + out_.push_back(visited_[constant]); + return; } - // Make function call with input buffers when visiting arguments - bool first = true; - decl_stream << "("; - for (size_t i = 0; i < call->args.size(); ++i) { - VisitExpr(call->args[i]); - for (auto out : out_) { - if (!first) { - decl_stream << ", "; - } - first = false; - decl_stream << out.name; - } - } + out_.clear(); + Output output; + output.name = "const_" + std::to_string(const_idx_++); + output.dtype = "float"; + out_.push_back(output); + visited_[constant] = output; + + runtime::NDArray array = cn->data; - // Analyze the output buffer - auto type_node = call->checked_type().as(); + // Get the number of elements. + int64_t num_elems = 1; + for (auto i : array.Shape()) num_elems *= i; + + const auto* type_node = cn->checked_type().as(); CHECK(type_node); - const auto& dtype = GetDtypeString(type_node); - std::string out = "buf_" + std::to_string(buf_idx_++); - auto out_shape = GetShape(call->checked_type()); - int out_size = 1; - for (size_t i = 0; i < out_shape.size(); ++i) { - out_size *= out_shape[i]; + CHECK_EQ(GetDtypeString(type_node), "float") << "Only float is supported for now."; + + std::ostringstream buf_stream; + buf_stream << "float* " << output.name << " = (float*)std::malloc(4 * " << num_elems << ");\n"; + const float* ptr = static_cast(array.ToDLPack()->dl_tensor.data); + for (int64_t i = 0; i < num_elems; i++) { + buf_stream << " " << output.name << "[" << i << "] = " << ptr[i] << ";\n"; } - this->PrintIndents(); - buf_stream << "float* " << out << " = (float*)std::malloc(4 * " << out_size << ");"; - buf_decl_.push_back(buf_stream.str()); - decl_stream << ", " << out; - // Attach attribute arguments - for (size_t i = 0; i < args.size(); ++i) { - decl_stream << ", " << args[i]; + ext_func_body.insert(ext_func_body.begin(), buf_stream.str()); + } + + void VisitExpr_(const CallNode* call) final { + GenerateBodyOutput ret; + if (const auto* func = call->op.as()) { + ret = GenerateCompositeFunctionCall(func, call); + } else { + ret = GenerateOpCall(call); } - decl_stream << ");"; - ext_func_body.push_back(decl_stream.str()); - // Update output buffer out_.clear(); - Output output; - output.name = out; - output.dtype = dtype; - output.need_copy = true; - output.size = out_size; - out_.push_back(output); + for (size_t i = 0; i < ret.outputs.size(); ++i) { + buf_decl_.push_back(ret.buffers[i]); + out_.push_back(ret.outputs[i]); + } + ext_func_body.push_back(ret.decl); } std::string JIT(void) { @@ -133,83 +206,121 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase { } private: - std::vector Conv2d(const CallNode* call) { - std::vector args; - const auto* conv2d_attr = call->attrs.as(); - CHECK(conv2d_attr); - - auto ishape = GetShape(call->args[0]->checked_type()); - auto wshape = GetShape(call->args[1]->checked_type()); - - // Args: N, C, H, W - for (auto s : ishape) { - 
args.push_back(std::to_string(s)); + struct GenerateBodyOutput { + std::string decl; + std::vector buffers; + std::vector outputs; + }; + + std::vector GetArgumentNames(const CallNode* call) { + std::vector arg_names; + for (size_t i = 0; i < call->args.size(); ++i) { + VisitExpr(call->args[i]); + for (auto out : out_) { + arg_names.push_back(out.name); + } } - - // Args: O, G, Ph, Pw, Kh, Kw, Sh, Sw - args.push_back(std::to_string(wshape[0])); - args.push_back(std::to_string(conv2d_attr->groups)); - args.push_back(std::to_string(conv2d_attr->padding[0].as()->value)); - args.push_back(std::to_string(conv2d_attr->padding[1].as()->value)); - args.push_back(std::to_string(wshape[2])); - args.push_back(std::to_string(wshape[3])); - args.push_back(std::to_string(conv2d_attr->strides[0].as()->value)); - args.push_back(std::to_string(conv2d_attr->strides[1].as()->value)); - - return args; + return arg_names; } - std::vector Dense(const CallNode* call) { - std::vector args; - auto ishape = GetShape(call->args[0]->checked_type()); - auto wshape = GetShape(call->args[1]->checked_type()); - - // Args: N, C, O - args.push_back(std::to_string(ishape[0])); - args.push_back(std::to_string(ishape[1])); - args.push_back(std::to_string(wshape[0])); + GenerateBodyOutput GenerateOpCall(const CallNode* call) { + const auto* op_node = call->op.as(); + CHECK(op_node) << "Expect OpNode, but got " << call->op->GetTypeKey(); + + using ArgFunType = std::function(const CallNode*)>; + static const std::map> op_map = { + {"nn.conv2d", {"dnnl_conv2d", Conv2d}}, + {"nn.dense", {"dnnl_dense", Dense}}, + {"nn.relu", {"dnnl_relu", Relu}}, + {"nn.batch_norm", {"dnnl_bn", BatchNorm}}, + {"add", {"dnnl_add", Add}}, + }; + + const auto op_name = GetRef(op_node)->name; + const auto iter = op_map.find(op_name); + if (iter != op_map.end()) { + return GenerateBody(call, iter->second.first, iter->second.second(call)); + } - return args; + LOG(FATAL) << "Unsupported op: " << AsText(call->op, false); + return {}; } - std::vector Relu(const CallNode* call) { - std::vector args; - auto ishape = GetShape(call->args[0]->checked_type()); - - // Args: N, C, H, W - for (auto s : ishape) { - args.push_back(std::to_string(s)); + GenerateBodyOutput GenerateCompositeFunctionCall(const FunctionNode* callee, + const CallNode* caller) { + const auto pattern_name = callee->GetAttr(attr::kComposite); + CHECK(pattern_name.defined()) << "Only functions with composite attribute supported"; + + if (pattern_name->value == "dnnl.conv2d_bias_relu") { + const auto* conv_call = + GetRootCall(callee->body.as(), 2, {"nn.conv2d", "add", "nn.relu"}); + return GenerateBody(conv_call, "dnnl_fused_conv2d_bias_relu", GetArgumentNames(caller), + Conv2d(conv_call)); + } else if (pattern_name->value == "dnnl.conv2d_relu") { + const auto* conv_call = GetRootCall(callee->body.as(), 1, {"nn.conv2d", "nn.relu"}); + return GenerateBody(conv_call, "dnnl_fused_conv2d_relu", GetArgumentNames(caller), + Conv2d(conv_call)); } - return args; + LOG(FATAL) << "Unknown composite function:" << pattern_name; + return {}; } - std::vector BatchNorm(const CallNode* call) { - std::vector args; - const auto* bn_attr = call->attrs.as(); - auto ishape = GetShape(call->args[0]->checked_type()); + GenerateBodyOutput GenerateBody(const CallNode* root_call, const std::string& func_name, + const std::vector& attribute_args) { + return GenerateBody(root_call, func_name, GetArgumentNames(root_call), attribute_args); + } - // Args: N, C, H, W - for (auto s : ishape) { - 
args.push_back(std::to_string(s)); + GenerateBodyOutput GenerateBody(const CallNode* root_call, const std::string& func_name, + const std::vector& func_args, + const std::vector& attribute_args) { + // Make function call with input buffers when visiting arguments + CHECK_GT(func_args.size(), 0); + std::ostringstream decl_stream; + decl_stream << "(" << func_args[0]; + for (size_t i = 1; i < func_args.size(); ++i) { + decl_stream << ", " << func_args[i]; } - // Args: epsilon - args.push_back(std::to_string(bn_attr->epsilon)); - - return args; - } - - std::vector Add(const CallNode* call) { - std::vector args; - auto ishape = GetShape(call->args[0]->checked_type()); + // Analyze the output buffers + std::vector out_types; + if (root_call->checked_type()->IsInstance()) { + auto type_node = root_call->checked_type().as(); + for (auto field : type_node->fields) { + CHECK(field->IsInstance()); + out_types.push_back(field); + } + } else if (root_call->checked_type()->IsInstance()) { + CHECK(root_call->checked_type()->IsInstance()); + out_types.push_back(root_call->checked_type()); + } else { + LOG(FATAL) << "Unrecognized type node: " << AsText(root_call->checked_type(), false); + } - // Args: H, W - for (auto s : ishape) { - args.push_back(std::to_string(s)); + GenerateBodyOutput ret; + for (const auto& out_type : out_types) { + this->PrintIndents(); + const std::string out = "buf_" + std::to_string(buf_idx_++); + const auto out_size = GetShape1DSize(out_type); + decl_stream << ", " << out; + + Output output; + output.name = out; + output.size = out_size; + output.dtype = GetDtypeString(out_type.as()); + output.need_copy = true; + ret.buffers.push_back("float* " + out + " = (float*)std::malloc(4 * " + + std::to_string(out_size) + ");"); + ret.outputs.push_back(output); } - return args; + // Attach attribute arguments + for (size_t i = 0; i < attribute_args.size(); ++i) { + decl_stream << ", " << attribute_args[i]; + } + decl_stream << ");"; + ret.decl = func_name + decl_stream.str(); + return ret; } /*! \brief The id of the external dnnl ext_func. */ @@ -219,6 +330,8 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase { * output to a buffer that may be consumed by other kernels. */ int buf_idx_{0}; + /*! \brief The index of global constants. */ + int const_idx_ = 0; /*! \brief The arguments used by a wrapped function that calls DNNL kernels. */ Array ext_func_args_; /*! \brief statement of the function that will be compiled using DNNL kernels. */ @@ -227,6 +340,8 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase { std::vector buf_decl_; /*! \brief The name of the the outputs. */ std::vector out_; + /*! \brief The cached expressions. */ + std::unordered_map visited_; }; /*! 
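The heart of the reworked CodegenDNNL above is GenerateOpCall, which replaces the old if/else chain over IsOp with a lookup table mapping a Relay op name to the DNNL wrapper name plus an argument-extraction function. With the template parameters spelled out (they are elided in this rendering), the table and the lookup are roughly:

    using ArgFunType = std::function<std::vector<std::string>(const CallNode*)>;
    static const std::map<std::string, std::pair<std::string, ArgFunType>> op_map = {
        {"nn.conv2d", {"dnnl_conv2d", Conv2d}},
        {"nn.dense", {"dnnl_dense", Dense}},
        {"nn.relu", {"dnnl_relu", Relu}},
        {"nn.batch_norm", {"dnnl_bn", BatchNorm}},
        {"add", {"dnnl_add", Add}},
    };

    const auto op_name = GetRef<Op>(op_node)->name;  // op_node was checked to be an OpNode above
    const auto iter = op_map.find(op_name);
    if (iter != op_map.end()) {
      // second.first is the DNNL wrapper name; second.second extracts its attribute arguments.
      return GenerateBody(call, iter->second.first, iter->second.second(call));
    }

Calls whose callee is a composite function take the other branch, GenerateCompositeFunctionCall, which maps the "dnnl.conv2d_bias_relu" and "dnnl.conv2d_relu" patterns to the fused kernels added in dnnl.cc below.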
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 7171589f79d3..a96ffe4720fc 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -25,18 +25,19 @@ #define TVM_RELAY_BACKEND_UTILS_H_ #include +#include #include -#include #include -#include +#include #include -#include #include +#include -#include #include +#include #include #include +#include namespace tvm { namespace relay { @@ -59,7 +60,7 @@ inline const PackedFunc* GetPackedFunc(const std::string& func_name) { */ template inline const runtime::TypedPackedFunc GetTypedPackedFunc(const std::string& func_name) { - auto *pf = GetPackedFunc(func_name); + auto* pf = GetPackedFunc(func_name); CHECK(pf != nullptr) << "can not find packed function"; return runtime::TypedPackedFunc(*pf); } @@ -90,9 +91,8 @@ inline std::string DType2String(const tvm::DataType dtype) { * \param params params dict * \return relay::Function */ -inline relay::Function -BindParamsByName(relay::Function func, - const std::unordered_map& params) { +inline relay::Function BindParamsByName( + relay::Function func, const std::unordered_map& params) { std::unordered_map name_dict; std::unordered_set repeat_var; for (auto arg : func->params) { @@ -122,8 +122,64 @@ BindParamsByName(relay::Function func, return ret; } +/*! + * \brief Extract the shape from a Relay tensor type. + * \param type The provided type. + * \return The extracted shape in a list. + */ +inline std::vector GetShape(const Type& type) { + const auto* ttype = type.as(); + CHECK(ttype) << "Expect TensorTypeNode"; + std::vector shape; + for (size_t i = 0; i < ttype->shape.size(); ++i) { + auto* val = ttype->shape[i].as(); + CHECK(val); + shape.push_back(val->value); + } + return shape; +} + +/*! + * \brief Check if a call has the provided name. + * \param call A Relay call node. + * \param op_name The name of the expected call. + * \return true if the call's name is equivalent to the given name. Otherwise, + * false. + */ +inline bool IsOp(const CallNode* call, const std::string& op_name) { + const auto* op_node = call->op.as(); + CHECK(op_node) << "Expects a single op."; + Op op = GetRef(op_node); + return op == Op::Get(op_name); +} + +/*! + * \brief Retrieve the "root" op nested inside a fused call, such as conv2d in relu(add(conv2d)) + * \param call A Relay call node. Typically nn.relu when called the first time. + * \param depth The number of calls before the root op, counting from current_call. + * \param expected_op_names The names of ops in this fused call. 
Example: {"nn.conv2d", "add", + * "nn.relu"} + * \return A CallNode corresponding to the root op, whose name is expected_op_names[0] + */ + +inline const CallNode* GetRootCall(const CallNode* current_call, int depth, + const std::vector& expected_op_names) { + CHECK(current_call && depth >= 0 && static_cast(depth) < expected_op_names.size() && + IsOp(current_call, expected_op_names[depth])); + + if (depth == 0) { + return current_call; + } + + CHECK_GT(current_call->args.size(), 0); + + const auto* next_call = current_call->args[0].as(); + return GetRootCall(next_call, depth - 1, expected_op_names); +} + } // namespace backend } // namespace relay } // namespace tvm + #endif // TVM_RELAY_BACKEND_UTILS_H_ diff --git a/src/runtime/contrib/dnnl/dnnl.cc b/src/runtime/contrib/dnnl/dnnl.cc index cc430b2c7c76..add92fe1df8c 100644 --- a/src/runtime/contrib/dnnl/dnnl.cc +++ b/src/runtime/contrib/dnnl/dnnl.cc @@ -52,10 +52,9 @@ inline void read_from_dnnl_memory(void* handle, const memory& mem) { std::copy(src, src + bytes, reinterpret_cast(handle)); } -extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, - int p_C_, int p_H_, int p_W_, int p_O_, int p_G_, - int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_, - int p_Sh_, int p_Sw_) { +void dnnl_conv2d_common(float* data, float* weights, float* bias, float* out, int p_N_, int p_C_, + int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, + int p_Kw_, int p_Sh_, int p_Sw_, primitive_attr attr) { using tag = memory::format_tag; using dt = memory::data_type; engine eng(engine::kind::cpu, 0); @@ -65,21 +64,15 @@ extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, memory::dims conv2d_weights_tz = {p_O_, p_C_, p_Kh_, p_Kw_}; if (p_G_ > 1) conv2d_weights_tz = {p_G_, 1, p_C_ / p_G_, p_Kh_, p_Kw_}; memory::dims conv2d_bias_tz = {p_O_}; - memory::dims conv2d_dst_tz = {p_N_, p_O_, - (p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_, + memory::dims conv2d_dst_tz = {p_N_, p_O_, (p_H_ - p_Kh_ + 2 * p_Ph_ + p_Sh_) / p_Sh_, (p_W_ - p_Kw_ + 2 * p_Pw_ + p_Sw_) / p_Sw_}; memory::dims conv2d_strides = {p_Sh_, p_Sw_}; memory::dims conv2d_padding = {p_Ph_, p_Pw_}; - std::vector conv2d_bias(p_O_, 0); - - auto user_src_memory = - memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data); - auto user_weights_memory = memory( - {{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? tag::goihw : tag::oihw}, eng, - weights); - auto conv2d_user_bias_memory = - memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, conv2d_bias.data()); + auto user_src_memory = memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data); + auto user_weights_memory = + memory({{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? 
tag::goihw : tag::oihw}, eng, weights); + auto conv2d_user_bias_memory = memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, bias); auto conv2d_src_md = memory::desc({conv2d_src_tz}, dt::f32, tag::any); auto conv2d_bias_md = memory::desc({conv2d_bias_tz}, dt::f32, tag::any); @@ -87,10 +80,9 @@ extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::nchw); auto conv2d_desc = convolution_forward::desc( - prop_kind::forward_inference, algorithm::convolution_direct, - conv2d_src_md, conv2d_weights_md, conv2d_bias_md, conv2d_dst_md, - conv2d_strides, conv2d_padding, conv2d_padding); - auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, eng); + prop_kind::forward_inference, algorithm::convolution_direct, conv2d_src_md, conv2d_weights_md, + conv2d_bias_md, conv2d_dst_md, conv2d_strides, conv2d_padding, conv2d_padding); + auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, attr, eng); auto conv2d_src_memory = user_src_memory; auto conv2d_weights_memory = user_weights_memory; @@ -105,6 +97,42 @@ extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, read_from_dnnl_memory(out, conv2d_dst_memory); } +extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, int p_C_, int p_H_, + int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, + int p_Kw_, int p_Sh_, int p_Sw_) { + primitive_attr attr; + std::vector bias(p_O_, 0); + return dnnl_conv2d_common(data, weights, bias.data(), out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, + p_Ph_, p_Pw_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, attr); +} + +primitive_attr create_attr_with_relu_post_op() { + post_ops ops; + ops.append_eltwise(1.f, algorithm::eltwise_relu, 0.f, 0.f); + + primitive_attr attr; + attr.set_post_ops(ops); + + return attr; +} + +extern "C" void dnnl_fused_conv2d_relu(float* data, float* weights, float* out, int p_N_, int p_C_, + int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, + int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_) { + std::vector bias(p_O_, 0); + return dnnl_conv2d_common(data, weights, bias.data(), out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, + p_Ph_, p_Pw_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, + create_attr_with_relu_post_op()); +} + +extern "C" void dnnl_fused_conv2d_bias_relu(float* data, float* weights, float* bias, float* out, + int p_N_, int p_C_, int p_H_, int p_W_, int p_O_, + int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_, + int p_Sh_, int p_Sw_) { + return dnnl_conv2d_common(data, weights, bias, out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, p_Ph_, + p_Pw_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, create_attr_with_relu_post_op()); +} + extern "C" void dnnl_dense(float* data, float* weight, float* out, int p_B_, int p_I_, int p_O_) { using tag = memory::format_tag; diff --git a/src/runtime/contrib/dnnl/dnnl_kernel.h b/src/runtime/contrib/dnnl/dnnl_kernel.h index 4d0b100b92ec..d9afed402147 100644 --- a/src/runtime/contrib/dnnl/dnnl_kernel.h +++ b/src/runtime/contrib/dnnl/dnnl_kernel.h @@ -38,6 +38,17 @@ extern "C" TVM_DLL void dnnl_conv2d(float* data, float* weights, float* out, int int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_); +extern "C" TVM_DLL void dnnl_fused_conv2d_relu(float* data, float* weights, float* out, int p_N_, + int p_C_, int p_H_, int p_W_, int p_O_, int p_G_, + int p_Ph_, int p_Pw_, int p_Kh_, int p_Kw_, + int p_Sh_, int p_Sw_); + +extern "C" TVM_DLL void dnnl_fused_conv2d_bias_relu(float* data, float* 
weights, float* bias, + float* out, int p_N_, int p_C_, int p_H_, + int p_W_, int p_O_, int p_G_, int p_Ph_, + int p_Pw_, int p_Kh_, int p_Kw_, int p_Sh_, + int p_Sw_); + extern "C" TVM_DLL void dnnl_dense(float* data, float* weight, float* out, int p_B_, int p_I_, int p_O_); diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index fb216822c295..a3027c14f9fc 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -27,11 +27,9 @@ from tvm import runtime from tvm.relay import transform from tvm.contrib import util -from tvm.relay import transform from tvm.relay.backend import compile_engine from tvm.relay.expr_functor import ExprMutator from tvm.relay.op.annotation import compiler_begin, compiler_end -from tvm.runtime import container # Leverage the pass manager to write a simple white list based annotator @@ -852,6 +850,128 @@ def expected(): partitioned = transform.PartitionGraph()(mod) assert tvm.ir.structural_equal(partitioned, ref_mod, map_free_vars=True) + +def test_dnnl_fuse(): + def make_pattern(with_bias=True): + data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32")) + weight = relay.var("weight") + bias = relay.var("bias") + conv = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3, 3), + channels=8, padding=(1, 1)) + if with_bias: + conv_out = relay.add(conv, bias) + else: + conv_out = conv + return relay.nn.relu(conv_out) + + conv2d_bias_relu_pat = ("dnnl.conv2d_bias_relu", make_pattern(with_bias=True)) + conv2d_relu_pat = ("dnnl.conv2d_relu", make_pattern(with_bias=False)) + dnnl_patterns = [conv2d_bias_relu_pat, conv2d_relu_pat] + + def get_blocks(prefix, data, in_channel, out_channel, + include_bn=True, include_sigmoid=False): + weight = relay.var(prefix + "weight") + bn_gamma = relay.var(prefix + "bn_gamma") + bn_beta = relay.var(prefix + "bn_beta") + bn_mmean = relay.var(prefix + "bn_mean") + bn_mvar = relay.var(prefix + "bn_var") + + layer = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3, 3), + channels=out_channel, padding=(1, 1)) + if include_bn: + bn_output = relay.nn.batch_norm(layer, bn_gamma, bn_beta, + bn_mmean, bn_mvar) + layer = bn_output[0] + if include_sigmoid: + # dummy layer to prevent pattern detection + layer = relay.sigmoid(layer) + layer = relay.nn.relu(layer) + return layer + + def get_net(include_bn=True, include_sigmoid=False): + data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32")) + block1 = get_blocks("block1_", data, 3, 8, include_bn, include_sigmoid) + # The second block is always conv + relu, to make it more interesting + block2 = get_blocks("block2_", block1, 8, 8, False, include_sigmoid) + return relay.Function(relay.analysis.free_vars(block2), block2) + + def get_partitoned_mod(mod, params, pattern_table): + # This is required for constant folding + mod["main"] = bind_params_by_name(mod["main"], params) + + remove_bn_pass = transform.Sequential([ + transform.InferType(), + transform.SimplifyInference(), + transform.FoldConstant(), + transform.FoldScaleAxis(), + ]) + composite_partition = transform.Sequential([ + remove_bn_pass, + transform.MergeComposite(pattern_table), + transform.AnnotateTarget("dnnl"), + transform.PartitionGraph() + ]) + + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + return composite_partition(mod) + + def test_detect_pattern(pattern_table, include_bn, include_sigmoid, + num_expected_partition): + net = get_net(include_bn, 
include_sigmoid) + mod, params = tvm.relay.testing.create_workload(net) + mod = get_partitoned_mod(mod, params, pattern_table) + assert(len(mod.functions) - 1 == num_expected_partition) # -1 for main + + def test_partition(): + # conv + bn + relu, conv + relu -> fused conv_bias_relu, conv, and relu + test_detect_pattern([conv2d_bias_relu_pat], True, False, 3) + # conv + bn + relu, conv + relu -> conv, bias, relu, and fused conv_relu + test_detect_pattern([conv2d_relu_pat], True, False, 4) + # conv + bn + relu, conv + relu -> fused conv_bias_relu, and fused conv_relu + test_detect_pattern([conv2d_bias_relu_pat, conv2d_relu_pat], True, False, 2) + # conv + relu, conv + relu -> two fused conv_relu + test_detect_pattern([conv2d_relu_pat], False, False, 2) + # conv + relu, conv + relu -> no fusion, 4 partition each with a single op + test_detect_pattern([conv2d_bias_relu_pat], False, False, 4) + # conv + bn + sigmoid + relu, conv + sigmoid + relu -> no fusion + test_detect_pattern([conv2d_bias_relu_pat, conv2d_relu_pat], True, True, 5) + + def test_partition_mobilenet(): + mod, params = relay.testing.mobilenet.get_workload() + mod = get_partitoned_mod(mod, params, dnnl_patterns) + # 27 fused conv + bn + relu and one dense + assert(len(mod.functions) - 1 == 28) # -1 for main + + def test_exec(mod, params, ref_mod, ref_params, out_shape): + ishape = (1, 3, 224, 224) + i_data = np.random.randn(*ishape).astype(np.float32) + ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_res = ref_ex.evaluate()(i_data, **ref_params) + compile_engine.get().clear() + + mod = get_partitoned_mod(mod, params, dnnl_patterns) + + check_result(mod, {"data": i_data}, + out_shape, ref_res.asnumpy(), tol=1e-5, params=params) + + test_partition() + test_partition_mobilenet() + + if not tvm.get_global_func("relay.ext.dnnl", True): + print("skip because DNNL codegen is not available") + return + + net = get_net() + mod, params = tvm.relay.testing.create_workload(net) + ref_mod, ref_params = tvm.relay.testing.create_workload(net) + test_exec(mod, params, ref_mod, ref_params, (1, 8, 224, 224)) + + # exec test on mobilenet is not possible due to manually inlined constants + # mod, params = relay.testing.mobilenet.get_workload() + # ref_mod, ref_params = relay.testing.mobilenet.get_workload() + # test_exec(mod, params, ref_mod, ref_params, (1, 1000)) + + if __name__ == "__main__": test_multi_node_compiler() test_extern_ccompiler_single_op() @@ -865,3 +985,4 @@ def expected(): test_constant_propagation() test_multiple_outputs() test_mixed_single_multiple_outputs() + test_dnnl_fuse() From 9befd9231c798f9979c1ba20bf27bafc3fe43b9a Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 11 Apr 2020 07:46:46 +0900 Subject: [PATCH 2/3] fix string issue --- src/relay/backend/contrib/dnnl/codegen.cc | 4 ++-- tests/python/relay/test_pass_partition_graph.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 4fbd4173516c..7f3aabf6e016 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -251,12 +251,12 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase { const auto pattern_name = callee->GetAttr(attr::kComposite); CHECK(pattern_name.defined()) << "Only functions with composite attribute supported"; - if (pattern_name->value == "dnnl.conv2d_bias_relu") { + if (pattern_name == "dnnl.conv2d_bias_relu") { const auto* conv_call = 
GetRootCall(callee->body.as(), 2, {"nn.conv2d", "add", "nn.relu"}); return GenerateBody(conv_call, "dnnl_fused_conv2d_bias_relu", GetArgumentNames(caller), Conv2d(conv_call)); - } else if (pattern_name->value == "dnnl.conv2d_relu") { + } else if (pattern_name == "dnnl.conv2d_relu") { const auto* conv_call = GetRootCall(callee->body.as(), 1, {"nn.conv2d", "nn.relu"}); return GenerateBody(conv_call, "dnnl_fused_conv2d_relu", GetArgumentNames(caller), Conv2d(conv_call)); diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index a3027c14f9fc..c7d9626931d0 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -30,6 +30,7 @@ from tvm.relay.backend import compile_engine from tvm.relay.expr_functor import ExprMutator from tvm.relay.op.annotation import compiler_begin, compiler_end +from tvm.relay.build_module import bind_params_by_name # Leverage the pass manager to write a simple white list based annotator @@ -454,7 +455,7 @@ def test_extern_dnnl_mobilenet(): mod, params = relay.testing.mobilenet.get_workload( batch_size=1, dtype='float32') - mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params) + mod["main"] = bind_params_by_name(mod["main"], params) mod = transform.AnnotateTarget(["dnnl"])(mod) mod = transform.MergeCompilerRegions()(mod) mod = transform.PartitionGraph()(mod) @@ -661,7 +662,7 @@ def expected(): add = x + y log = relay.log(add) f = relay.Function([x, y], log) - f = relay.build_module.bind_params_by_name(f, {"x": tvm.nd.array(ones)}) + f = bind_params_by_name(f, {"x": tvm.nd.array(ones)}) mod = tvm.IRModule() mod["main"] = f mod = WhiteListAnnotator(["add"], "ccompiler")(mod) From ca921baa90e45bc54b124382f4572ee2dd57002f Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 11 Apr 2020 08:09:50 +0900 Subject: [PATCH 3/3] bring comanic's change back --- python/tvm/relay/op/contrib/dnnl.py | 9 +-------- src/relay/backend/vm/compiler.cc | 13 +++++++------ src/runtime/contrib/dnnl/dnnl.cc | 4 ++-- src/runtime/contrib/dnnl/dnnl_kernel.h | 4 ++-- 4 files changed, 12 insertions(+), 18 deletions(-) diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py index 1aa71921806d..45a8c8331f72 100644 --- a/python/tvm/relay/op/contrib/dnnl.py +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -56,17 +56,10 @@ def _func_wrapper(attrs, args): return _func_wrapper +_register_external_op_helper("nn.batch_norm") _register_external_op_helper("nn.conv2d") _register_external_op_helper("nn.dense") _register_external_op_helper("nn.relu") _register_external_op_helper("add") _register_external_op_helper("subtract") _register_external_op_helper("multiply") - - -@reg.register("nn.batch_norm", "target.dnnl") -def batch_norm(attrs, args): - """Check if the external DNNL codegen should be used. - FIXME(@zhiics, @comaniac): Turn off due to not support of multiple outputs. - """ - return False diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 3e020bb27954..e2b0fffec8bd 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -924,6 +924,13 @@ IRModule VMCompiler::OptimizeModule(const IRModule& mod, const TargetsMap& targe pass_seqs.push_back(transform::LambdaLift()); pass_seqs.push_back(transform::InlinePrimitives()); + // Manifest the allocations. + pass_seqs.push_back(transform::ManifestAlloc(this->target_host_)); + // Compute away possibly introduced constant computation. 
+ pass_seqs.push_back(transform::FoldConstant()); + // Fuse the shape functions. + pass_seqs.push_back(transform::FuseOps()); + // Inline the functions that are lifted to the module scope. We perform this // pass after all other optimization passes but before the memory allocation // pass. This is because memory allocation pass will insert `invoke_tvm_op` @@ -931,12 +938,6 @@ IRModule VMCompiler::OptimizeModule(const IRModule& mod, const TargetsMap& targe // external codegen. pass_seqs.push_back(transform::Inline()); - // Manifest the allocations. - pass_seqs.push_back(transform::ManifestAlloc(this->target_host_)); - // Compute away possibly introduced constant computation. - pass_seqs.push_back(transform::FoldConstant()); - // Fuse the shape functions. - pass_seqs.push_back(transform::FuseOps()); // Manifest the allocations needed for the shape functions. pass_seqs.push_back(transform::ManifestAlloc(this->target_host_)); diff --git a/src/runtime/contrib/dnnl/dnnl.cc b/src/runtime/contrib/dnnl/dnnl.cc index add92fe1df8c..0922ac1a65df 100644 --- a/src/runtime/contrib/dnnl/dnnl.cc +++ b/src/runtime/contrib/dnnl/dnnl.cc @@ -197,8 +197,8 @@ extern "C" void dnnl_relu(float* data, float* out, int p_N_, int p_C_, int p_H_, read_from_dnnl_memory(out, dst_memory); } -extern "C" void dnnl_bn(float* data, float* gamma, float* beta, float* mean, - float* variance, float* out, int p_N_, int p_C_, +extern "C" void dnnl_bn(float* data, float* gamma, float* beta, float* mean, float* variance, + float* out, float* new_mean, float* new_variance, int p_N_, int p_C_, int p_H_, int p_W_, int p_E_) { using tag = memory::format_tag; using dt = memory::data_type; diff --git a/src/runtime/contrib/dnnl/dnnl_kernel.h b/src/runtime/contrib/dnnl/dnnl_kernel.h index d9afed402147..f92d7679aeee 100644 --- a/src/runtime/contrib/dnnl/dnnl_kernel.h +++ b/src/runtime/contrib/dnnl/dnnl_kernel.h @@ -55,8 +55,8 @@ extern "C" TVM_DLL void dnnl_dense(float* data, float* weight, float* out, int p extern "C" TVM_DLL void dnnl_relu(float* data, float* out, int p_N_, int p_C_, int p_H_, int p_W_); extern "C" TVM_DLL void dnnl_bn(float* data, float* gamma, float* beta, float* mean, - float* variance, float* out, int p_n_, int p_c_, int p_h_, int p_w_, - int p_e_); + float* variance, float* out, float* new_mean, float* new_variance, + int p_n_, int p_c_, int p_h_, int p_w_, int p_e_); extern "C" TVM_DLL void dnnl_add(float* data, float* weight, float* out, int p_n_, int p_c_, int p_h_, int p_w_);
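For reference, the fused entry points declared in dnnl_kernel.h can also be exercised outside the generated code. Below is a minimal standalone caller, assuming the shapes used by test_dnnl_fuse (1x3x224x224 input, 8 output channels, 3x3 kernel, padding 1, stride 1) and a link against the DNNL contrib runtime; the driver itself is illustrative and not part of the patch:

    #include <vector>

    // Local declaration: the symbol has C linkage, so this links against dnnl.cc
    // without needing TVM's headers (signature copied from dnnl_kernel.h above).
    extern "C" void dnnl_fused_conv2d_bias_relu(float* data, float* weights, float* bias,
                                                float* out, int p_N_, int p_C_, int p_H_,
                                                int p_W_, int p_O_, int p_G_, int p_Ph_,
                                                int p_Pw_, int p_Kh_, int p_Kw_, int p_Sh_,
                                                int p_Sw_);

    int main() {
      const int N = 1, C = 3, H = 224, W = 224, O = 8, G = 1;
      const int Ph = 1, Pw = 1, Kh = 3, Kw = 3, Sh = 1, Sw = 1;
      // Output spatial dims follow the formula used by dnnl_conv2d_common above.
      const int OH = (H - Kh + 2 * Ph + Sh) / Sh;  // 224
      const int OW = (W - Kw + 2 * Pw + Sw) / Sw;  // 224

      std::vector<float> data(N * C * H * W, 1.0f);       // NCHW input
      std::vector<float> weights(O * C * Kh * Kw, 0.1f);  // OIHW weights
      std::vector<float> bias(O, 0.5f);
      std::vector<float> out(N * O * OH * OW, 0.0f);

      // conv2d + bias add + ReLU in a single DNNL primitive (ReLU attached as a post-op).
      dnnl_fused_conv2d_bias_relu(data.data(), weights.data(), bias.data(), out.data(),
                                  N, C, H, W, O, G, Ph, Pw, Kh, Kw, Sh, Sw);
      return 0;
    }

dnnl_fused_conv2d_relu takes the same integer arguments but no bias pointer, matching the "dnnl.conv2d_relu" composite that contains no add.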