diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt index 1faa7be28bded3..0c2ad91af1b7be 100644 --- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt +++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt @@ -42,7 +42,7 @@ set(LAYERS # ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_tensoriterator_node.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_tile_node.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_mvn_node.cpp -# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_normalize_node.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_normalize_node.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_scatter_update_node.cpp ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_interpolate_node.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reduce_node.cpp diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp index 4a8eef03944f85..774d9f82a79234 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp @@ -209,3 +209,19 @@ InferenceEngine::Precision MKLDNNExtensionUtils::getMaxPrecision(std::vector firstInputDims.size()) + return false; + if (std::accumulate(secondInputDims.begin(), secondInputDims.end(), 1, std::multiplies()) == 1) + return true; + for (size_t i = 0; i < (firstInputDims.size() - secondInputDims.size()); i++) { + secondInputDims.insert(secondInputDims.begin(), 1); + } + for (size_t i = 0; i < secondInputDims.size(); i++) { + if ((i == 1 && secondInputDims[i] != firstInputDims[1]) || (i != 1 && secondInputDims[i] != 1)) + return false; + } + return true; +} diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h index 26fc09c92de36b..4223f1444723ed 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h @@ -79,6 +79,7 @@ class MKLDNNExtensionUtils { static bool initTensorsAreEqual(const InferenceEngine::TensorDesc &desc1, const InferenceEngine::TensorDesc &desc2); static std::string getReorderArgs(const InferenceEngine::TensorDesc &parentDesc, const InferenceEngine::TensorDesc &childDesc); static InferenceEngine::Precision getMaxPrecision(std::vector precisions); + static bool isPerTensorOrPerChannelBroadcastable(const InferenceEngine::SizeVector &firstInputDims, InferenceEngine::SizeVector secondInputDims); }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp index 8f33cd32a0bb7f..ceda015d1b7ca0 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp @@ -146,9 +146,9 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) { // FuseInterpolateAndSimpleOperation(graph); // graph.RemoveDroppedNodes(); - // OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseNormalizeAndSimpleOperation"); - // FuseNormalizeAndSimpleOperation(graph); - // graph.RemoveDroppedNodes(); + OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseNormalizeL2AndSimpleOperation"); + FuseNormalizeL2AndSimpleOperation(graph); + graph.RemoveDroppedNodes(); OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseEltwiseAndSimple"); FuseEltwiseAndSimple(graph); @@ -1463,70 +1463,46 @@ void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph) } } -void MKLDNNGraphOptimizer::FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph) { -// auto& graphNodes = graph.GetNodes(); -// -// auto isSutableParentNode = [](MKLDNNNodePtr node) { -// bool isSutableNormalize = node->getType() == Normalize; -// -// if (isSutableNormalize) { -// return node->getChildEdges().size() == 1; -// } else { -// return false; -// } -// }; -// -// auto isSutableChildNode = [&](MKLDNNNodePtr node) { -// if (!node->getCnnLayer()) -// return false; -// -// if (node->getType() == Quantize) { -// auto* quantizeNode = dynamic_cast(node.get()); -// if (quantizeNode == nullptr) -// IE_THROW() << "Cannot get quantize layer " << node->getName(); -// return !quantizeNode->isBinarization(); -// } else if (node->getType() == Eltwise) { -// auto *eltwiseNode = dynamic_cast(node.get()); -// if (eltwiseNode == nullptr) -// IE_THROW() << "Cannot get Eltwise node " << node->getName(); -// return IsOneOf(eltwiseNode->getOpType(), {Relu, Gelu, Elu, Logistic, BoundedRelu, Clamp, Tanh, Swish, -// Hswish, Mish, Hsigmoid, Round, Linear, Abs, Square, Sqrt}) || -// ((eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() == 2) || -// (eltwiseNode->getOpType() == Prelu)); -// } -// -// return false; -// }; -// -// auto parent = graphNodes.begin(); -// while (parent != graphNodes.end()) { -// auto parentNode = *parent; -// if (!isSutableParentNode(parentNode)) { -// parent++; -// continue; -// } -// -// auto childNode = parentNode->getChildEdgeAt(0)->getChild(); -// if (!isSutableChildNode(childNode)) { -// parent++; -// continue; -// } -// -// parentNode->fuseWith(childNode); -// -// if (childNode->getType() == Quantize || childNode->getType() == Eltwise) { -// auto parentEdges = childNode->parentEdges; -// for (auto &parentEdge : parentEdges) { -// auto p_edge = parentEdge.lock(); -// if (p_edge->getParent()->getType() == Normalize) -// continue; -// -// removeEdge(graph, p_edge); -// } -// } -// -// graph.DropNode(childNode); -// } +void MKLDNNGraphOptimizer::FuseNormalizeL2AndSimpleOperation(MKLDNNGraph &graph) { + auto& graphNodes = graph.GetNodes(); + + auto isSutableParentNode = [](MKLDNNNodePtr node) { + if (node->getType() == NormalizeL2) { + return node->getChildEdges().size() == 1; + } else { + return false; + } + }; + + auto parent = graphNodes.begin(); + while (parent != graphNodes.end()) { + auto parentNode = *parent; + if (!isSutableParentNode(parentNode)) { + parent++; + continue; + } + + auto childNode = parentNode->getChildEdgeAt(0)->getChild(); + if (!parentNode->canFuse(childNode)) { + parent++; + continue; + } + + parentNode->fuseWith(childNode); + + if (childNode->getType() == Quantize || childNode->getType() == Eltwise) { + auto parentEdges = childNode->parentEdges; + for (auto &parentEdge : parentEdges) { + auto p_edge = parentEdge.lock(); + if (p_edge->getParent()->getType() == NormalizeL2) + continue; + + removeEdge(graph, p_edge); + } + } + + graph.DropNode(childNode); + } } void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h index 01282cca50e65e..94e0a50eade228 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h @@ -33,7 +33,7 @@ class MKLDNNGraphOptimizer { void FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph); void FuseMVNAndSimpleOperation(MKLDNNGraph &graph); void FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph); - void FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph); + void FuseNormalizeL2AndSimpleOperation(MKLDNNGraph &graph); void DropDoubleReorders(MKLDNNGraph& graph); void DropConvertReorder(MKLDNNGraph& graph); diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index 7faf3da248136b..3f0fb24bab45db 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -189,7 +189,7 @@ static const InferenceEngine::details::caseless_unordered_map // { "Memory", MemoryOutput }, // for construction from layer ctor // { "Convert", Convert }, { "MVN", MVN}, -// { "Normalize", Normalize}, + { "NormalizeL2", NormalizeL2}, // { "ScatterUpdate", ScatterUpdate}, // { "ScatterElementsUpdate", ScatterElementsUpdate}, // { "ScatterNDUpdate", ScatterNDUpdate}, diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index 91af3e3a822130..1eac30438449ec 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -75,7 +75,7 @@ enum Type { TensorIterator, Convert, MVN, - Normalize, + NormalizeL2, ScatterUpdate, ScatterElementsUpdate, ScatterNDUpdate, @@ -239,8 +239,8 @@ static std::string NameFromType(Type type) { return "TensorIterator"; case Convert: return "Convert"; - case Normalize: - return "Normalize"; + case NormalizeL2: + return "NormalizeL2"; case ScatterUpdate: return "ScatterUpdate"; case ScatterElementsUpdate: @@ -623,6 +623,10 @@ class MKLDNNNode : public InferenceEngine::details::no_copy { return algorithm; } + virtual bool canFuse(const MKLDNNNodePtr& node) const { + return false; + } + protected: void setType(Type type) { this->type = type; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h index 7b9b9db1fc056d..bb01502628678e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bin_conv_node.h @@ -86,7 +86,7 @@ class MKLDNNBinaryConvolutionNode : public MKLDNNNode { return false; } void setPostOps(mkldnn::primitive_attr &attr); - bool canFuse(const MKLDNNNodePtr& node) const; + bool canFuse(const MKLDNNNodePtr& node) const override; private: bool withSum = false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index 7da2bb798400c2..b88e5b19139298 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -23,6 +23,7 @@ #include "utils/general_utils.h" #include "ngraph/ngraph.hpp" +#include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include using namespace MKLDNNPlugin; using namespace InferenceEngine; @@ -962,6 +964,17 @@ MKLDNNEltwiseNode::MKLDNNEltwiseNode(const std::shared_ptr& op, co MKLDNNNode(op, eng, cache) { if (initializers.find(op->get_type_info()) != initializers.end()) { initializers[op->get_type_info()](op, *this); + + std::shared_ptr secondIn; + const auto isConstantBroadcastbleSecondInput = [&](const std::shared_ptr& op) { + secondIn = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(1)); + return secondIn != nullptr && MKLDNNExtensionUtils::isPerTensorOrPerChannelBroadcastable(op->get_input_shape(0), op->get_input_shape(1)); + }; + if (one_of(getAlgorithm(), EltwiseMultiply, EltwiseDivide, EltwisePrelu) && isConstantBroadcastbleSecondInput(op)) { + scales = secondIn->cast_vector(); + } else if (one_of(getAlgorithm(), EltwiseAdd, EltwiseSubtract) && isConstantBroadcastbleSecondInput(op)) { + shifts = secondIn->cast_vector(); + } } else { IE_THROW(NotImplemented) << "CPU Eltwise node doesn't support ngraph operation " << op->get_type_name() << " with name " << op->get_friendly_name(); @@ -1704,59 +1717,98 @@ bool MKLDNNEltwiseNode::canBeInPlace() const { return getParentEdgesAtPort(0)[0].get()->getDims() == getChildEdgesAtPort(0)[0].get()->getDims(); } -void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops) { - switch (getMKLDNNAlgorithm()) { - case mkldnn::algorithm::eltwise_relu: - case mkldnn::algorithm::eltwise_tanh: - case mkldnn::algorithm::eltwise_elu: - case mkldnn::algorithm::eltwise_square: - case mkldnn::algorithm::eltwise_abs: - case mkldnn::algorithm::eltwise_sqrt: - case mkldnn::algorithm::eltwise_linear: - case mkldnn::algorithm::eltwise_bounded_relu: - case mkldnn::algorithm::eltwise_soft_relu: - case mkldnn::algorithm::eltwise_logistic: - case mkldnn::algorithm::eltwise_exp: - case mkldnn::algorithm::eltwise_gelu_erf: - case mkldnn::algorithm::eltwise_gelu_tanh: - case mkldnn::algorithm::eltwise_clip: - case mkldnn::algorithm::eltwise_swish: - case mkldnn::algorithm::eltwise_hswish: - case mkldnn::algorithm::eltwise_mish: - case mkldnn::algorithm::eltwise_hsigmoid: - case mkldnn::algorithm::eltwise_round_half_to_even: - case mkldnn::algorithm::eltwise_round_half_away_from_zero: - ops.append_eltwise(1.0, getMKLDNNAlgorithm(), getAlpha(), getBeta()); +void MKLDNNEltwiseNode::fillScalesAndShifts() { + const size_t bufferSize = static_cast(outDims[0][outDims[0].size() > 1 ? 1 : 0]); + const size_t bufferSizeAligned = rnd_up(bufferSize, 16); + + size_t initSize = scales.size(); + if (initSize > 0) { + scales.resize(bufferSizeAligned, 0); + if (initSize == 1) { + std::fill(scales.begin() + 1, scales.begin() + bufferSize, scales[0]); + } + } + + initSize = shifts.size(); + if (initSize > 0) { + shifts.resize(bufferSizeAligned, 0); + if (initSize == 1) { + std::fill(shifts.begin() + 1, shifts.begin() + bufferSize, shifts[0]); + } + } + + switch (getAlgorithm()) { + case EltwiseAdd: { + scales.resize(bufferSizeAligned, 1.0f); break; - case mkldnn::algorithm::depthwise_scale_shift: - case mkldnn::algorithm::depthwise_prelu: - IE_THROW() << "[NM] Not implemented"; -// if (scales.empty() && shifts.empty()) { -// size_t bufferSize = static_cast(outDims[0][outDims[0].size() > 1 ? 1 : 0]); -// size_t bufferSizeAligned = rnd_up(bufferSize, 16); -// -// Blob::Ptr scalesBlob = getCnnLayer()->blobs["weights"]; -// if (scalesBlob == nullptr) -// IE_THROW() << "Cannot get weights blob in Eltwise node with name `" << getName() << "`"; -// scales.resize(bufferSizeAligned, 0); -// const float *scalesBufferPtr = scalesBlob->buffer().as(); -// for (int i = 0; i < bufferSize; i++) { -// scales[i] = scalesBufferPtr[scalesBlob->size() == 1 ? 0 : i]; -// } -// -// Blob::Ptr shiftsBlob = getCnnLayer()->blobs["biases"]; -// if (shiftsBlob != nullptr) { -// shifts.resize(bufferSizeAligned, 0); -// const float *shiftsBufferPtr = shiftsBlob->buffer().as(); -// for (int i = 0; i < bufferSize; i++) { -// shifts[i] = shiftsBufferPtr[shiftsBlob->size() == 1 ? 0 : i]; -// } -// } -// } -// -// ops.append_depthwise(getAlgorithm(), &scales[0], shifts.empty() ? nullptr : &shifts[0]); + } + case EltwiseSubtract: { + scales.resize(bufferSizeAligned, 1.0f); + std::transform(shifts.begin(), shifts.end(), shifts.begin(), [](float shift){ return -1.0f * shift; }); break; - default: IE_THROW() << "Appending Eltwise node with name `" << getName() << "` as post operation is not supported"; + } + case EltwiseMultiply: { + shifts.resize(bufferSizeAligned, 0.0f); + break; + } + case EltwiseDivide: { + shifts.resize(bufferSizeAligned, 0.0f); + std::transform(scales.begin(), scales.end(), scales.begin(), [](float scale){ return 1.0f / scale; }); + break; + } + default: break; + } +} + +void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops) { + const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' "; + if (getMKLDNNAlgorithm() != mkldnn::algorithm::undef) { + switch (getMKLDNNAlgorithm()) { + case mkldnn::algorithm::eltwise_relu: + case mkldnn::algorithm::eltwise_tanh: + case mkldnn::algorithm::eltwise_elu: + case mkldnn::algorithm::eltwise_square: + case mkldnn::algorithm::eltwise_abs: + case mkldnn::algorithm::eltwise_sqrt: + case mkldnn::algorithm::eltwise_linear: + case mkldnn::algorithm::eltwise_bounded_relu: + case mkldnn::algorithm::eltwise_soft_relu: + case mkldnn::algorithm::eltwise_logistic: + case mkldnn::algorithm::eltwise_exp: + case mkldnn::algorithm::eltwise_gelu: + case mkldnn::algorithm::eltwise_clip: + case mkldnn::algorithm::eltwise_swish: + case mkldnn::algorithm::eltwise_hswish: + case mkldnn::algorithm::eltwise_mish: + case mkldnn::algorithm::eltwise_hsigmoid: + case mkldnn::algorithm::eltwise_round_half_to_even: + case mkldnn::algorithm::eltwise_round_half_away_from_zero: + ops.append_eltwise(1.0, getMKLDNNAlgorithm(), getAlpha(), getBeta()); + return; + case mkldnn::algorithm::depthwise_scale_shift: + IE_THROW() << "[NM] Not implemented"; + return; + default: IE_THROW() << errorPrefix << "as post operation is not supported"; + } + } else { + switch (getAlgorithm()) { + case EltwiseAdd: + case EltwiseSubtract: + if (shifts.empty()) IE_THROW() << errorPrefix << "has empty shifts"; + break; + case EltwiseMultiply: + case EltwiseDivide: + case EltwisePrelu: + if (scales.empty()) IE_THROW() << errorPrefix << "has empty scales"; + break; + default: IE_THROW() << errorPrefix << "as post operation is not supported"; + } + fillScalesAndShifts(); + if (getAlgorithm() == EltwisePrelu) { + ops.append_depthwise(mkldnn::algorithm::depthwise_prelu, &scales[0], nullptr); + } else { + ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift, &scales[0], &shifts[0]); + } } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h index 1003067f2358a8..a69d5eb31f40c2 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h @@ -73,7 +73,7 @@ class MKLDNNEltwiseNode : public MKLDNNNode { bool isWithBroadcast(); - bool canFuse(const MKLDNNNodePtr& node) const; + bool canFuse(const MKLDNNNodePtr& node) const override; size_t getOpInputsNum() const; mkldnn::algorithm getMKLDNNAlgorithm() const { return mkldnnAlgorithm; } @@ -121,6 +121,8 @@ class MKLDNNEltwiseNode : public MKLDNNNode { void offset_in_calc(std::vector& offset, std::vector& dims_in, std::vector& dims_out); static std::map&, MKLDNNEltwiseNode& node)>> initializers; + + void fillScalesAndShifts(); }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.h index 61d276210fdc4e..c82f4f858e55a7 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.h @@ -96,7 +96,7 @@ class MKLDNNInterpolateNode : public MKLDNNNode { bool canBeInPlace() const override { return false; } - bool canFuse(const MKLDNNNodePtr& node) const; + bool canFuse(const MKLDNNNodePtr& node) const override; private: // nearest neighbor diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp index fc18003ebc3f25..c004b28d218519 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp @@ -4,12 +4,13 @@ #include "mkldnn_normalize_node.h" -#include #include #include "mkldnn_quantize_node.h" #include "mkldnn_eltwise_node.h" #include "utils/bfloat16.hpp" +#include "utils/general_utils.h" +#include #include "emitters/jit_bf16_emitters.hpp" #include "mkldnn_extension_utils.h" #include @@ -19,6 +20,8 @@ #include "nodes/common/cpu_convert.h" #include +#include + using namespace mkldnn; using namespace MKLDNNPlugin; using namespace InferenceEngine; @@ -152,7 +155,7 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker } }; -// dst = src * modulo_inv * scale +// dst = src * modulo_inv template struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_normalize_kernel_f32) @@ -188,8 +191,6 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji mov(reg_src, ptr[reg_params + GET_OFF(src)]); mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - mov(reg_modulo, ptr[reg_params + GET_OFF(modulo)]); - mov(reg_weights, ptr[reg_params + GET_OFF(weights)]); mov(reg_fused_factor, ptr[reg_params + GET_OFF(fused_factor)]); mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); if (attr_.post_ops_.len() != 0) @@ -220,10 +221,8 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji Xbyak::Reg64 reg_src = r8; Xbyak::Reg64 reg_dst = r9; - Xbyak::Reg64 reg_modulo = r10; - Xbyak::Reg64 reg_weights = r11; - Xbyak::Reg64 reg_fused_factor = r12; - Xbyak::Reg64 reg_work_amount = r15; + Xbyak::Reg64 reg_fused_factor = r10; + Xbyak::Reg64 reg_work_amount = r11; Xbyak::Reg64 reg_params = abi_param1; Reg8 reg_tmp_8 = r14b; @@ -258,10 +257,6 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji inline void normalize_nchw() { if (jcp_.across_spatial) { uni_vbroadcastss(vmm_fused_factor, ptr[reg_fused_factor]); // for channel_shared: false or true. - } else { - if (!jcp_.channel_shared) { - uni_vbroadcastss(vmm_scale, ptr[reg_weights]); - } } Xbyak::Label main_loop_label; @@ -279,16 +274,9 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji if (jcp_.across_spatial) { uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); } else { - if (jcp_.channel_shared) { - uni_vmovups(vmm_fused_factor, ptr[reg_fused_factor]); - uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); - add(reg_fused_factor, vlen); - } else { - uni_vmovups(vmm_modulo, ptr[reg_modulo]); // modulo: ld dynamic - uni_vmulps(vmm_val, vmm_val, vmm_modulo); - uni_vmulps(vmm_val, vmm_val, vmm_scale); // weight: bc once - add(reg_modulo, vlen); - } + uni_vmovups(vmm_fused_factor, ptr[reg_fused_factor]); + uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); + add(reg_fused_factor, vlen); } if (attr_.post_ops_.len() != 0) { apply_post_ops(jcp_.dst_dt, 1); @@ -313,16 +301,9 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji if (jcp_.across_spatial) { uni_vmulps(xmm_val, xmm_val, xmm_fused_factor); } else { - if (jcp_.channel_shared) { - load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::data_type::f32); - uni_vmulps(xmm_val, xmm_val, xmm_fused_factor); - add(reg_fused_factor, step * sizeof(float)); - } else { - load_scalar(xmm_modulo, ptr[reg_modulo], memory::data_type::f32); - uni_vmulps(xmm_val, xmm_val, xmm_modulo); - uni_vmulps(xmm_val, xmm_val, xmm_scale); - add(reg_modulo, step * sizeof(float)); - } + load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::data_type::f32); + uni_vmulps(xmm_val, xmm_val, xmm_fused_factor); + add(reg_fused_factor, step * sizeof(float)); } if (attr_.post_ops_.len() != 0) { apply_post_ops(jcp_.dst_dt, 1); // vector and boradcast @@ -339,13 +320,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji } inline void normalize_nhwc() { - if (jcp_.channel_shared) { - uni_vbroadcastss(vmm_fused_factor, ptr[reg_fused_factor]); - } else { - if (!jcp_.across_spatial) { - uni_vbroadcastss(vmm_modulo, ptr[reg_modulo]); - } - } + uni_vbroadcastss(vmm_fused_factor, ptr[reg_fused_factor]); Xbyak::Label main_loop_label; Xbyak::Label main_loop_end_label; @@ -359,20 +334,8 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji jl(main_loop_end_label, T_NEAR); load_vector(vmm_val, ptr[reg_src], jcp_.src_dt); - if (jcp_.channel_shared) { - uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); - } else { - if (jcp_.across_spatial) { - uni_vmovups(vmm_fused_factor, ptr[reg_fused_factor]); - uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); - add(reg_fused_factor, vlen); - } else { - uni_vmovups(vmm_scale, ptr[reg_weights]); - uni_vmulps(vmm_val, vmm_val, vmm_scale); - uni_vmulps(vmm_val, vmm_val, vmm_modulo); - add(reg_weights, vlen); - } - } + uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); + if (attr_.post_ops_.len() != 0) { apply_post_ops(jcp_.dst_dt, 0); add(reg_oc_off, vlen); // out channel offset of fused ops weights in byte @@ -394,20 +357,8 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji jl(tail_loop_end_label, T_NEAR); load_scalar(xmm_val, ptr[reg_src], jcp_.src_dt); - if (jcp_.channel_shared) { - uni_vmulps(xmm_val, xmm_val, xmm_fused_factor); - } else { - if (jcp_.across_spatial) { - load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::data_type::f32); - uni_vmulps(xmm_val, xmm_val, xmm_fused_factor); - add(reg_fused_factor, step * sizeof(float)); - } else { - load_scalar(xmm_scale, ptr[reg_weights], memory::data_type::f32); - uni_vmulps(xmm_val, xmm_val, xmm_scale); - uni_vmulps(xmm_val, xmm_val, xmm_modulo); - add(reg_weights, step * sizeof(float)); - } - } + uni_vmulps(xmm_val, xmm_val, xmm_fused_factor); + if (attr_.post_ops_.len() != 0) { apply_post_ops(jcp_.dst_dt, 0); add(reg_oc_off, step * sizeof(float)); @@ -438,14 +389,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji bool is_sse42 = (isa == cpu::x64::sse41); if (jcp_.across_spatial) { - if (jcp_.channel_shared) { - uni_vbroadcastss(vmm_fused_factor, ptr[reg_fused_factor]); - } else { - uni_vmovups(vmm_fused_factor, ptr[reg_fused_factor]); - if (is_sse42) { - uni_vmovups(vmm_fused_factor2, ptr[reg_fused_factor + simd_w * sizeof(float)]); - } - } + uni_vbroadcastss(vmm_fused_factor, ptr[reg_fused_factor]); Xbyak::Label norm_loop_label; Xbyak::Label norm_loop_end_label; @@ -466,11 +410,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji if (is_sse42) { int sse42_offset = 4; load_vector(vmm_val, ptr[reg_src + sse42_offset * jcp_.src_data_size], jcp_.src_dt); - if (jcp_.channel_shared) { - uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); // bc once - } else { - uni_vmulps(vmm_val, vmm_val, vmm_fused_factor2); // ld once - } + uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); // bc once if (attr_.post_ops_.len() != 0) { add(reg_oc_off, sse42_offset * sizeof(float)); apply_post_ops(jcp_.dst_dt, 0); @@ -486,11 +426,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji } L(norm_loop_end_label); } else { // across_saptail is flase - if (jcp_.channel_shared) { - uni_vbroadcastss(vmm_fused_factor, ptr[reg_fused_factor]); - } else { - uni_vbroadcastss(vmm_modulo, ptr[reg_modulo]); - } + uni_vbroadcastss(vmm_fused_factor, ptr[reg_fused_factor]); size_t src_stride = jcp_.w * jcp_.h * blk_size * jcp_.src_data_size; size_t dst_stride = jcp_.w * jcp_.h * blk_size * jcp_.dst_data_size; @@ -503,14 +439,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji jle(norm_loop_end_label, T_NEAR); load_vector(vmm_val, ptr[reg_src], jcp_.src_dt); - if (jcp_.channel_shared) { - uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); - } else { - uni_vmovups(vmm_scale, ptr[reg_weights]); - uni_vmulps(vmm_val, vmm_val, vmm_scale); - uni_vmulps(vmm_val, vmm_val, vmm_modulo); - add(reg_weights, vlen); - } + uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); if (attr_.post_ops_.len() != 0) { apply_post_ops(jcp_.dst_dt, 0); add(reg_oc_off, vlen); // vlen is related isa @@ -520,14 +449,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji if (is_sse42) { int sse42_offset = 4; load_vector(vmm_val, ptr[reg_src + sse42_offset * jcp_.src_data_size], jcp_.src_dt); - if (jcp_.channel_shared) { - uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); // bc once - } else { - uni_vmovups(vmm_scale, ptr[reg_weights]); // ld dynamic - uni_vmulps(vmm_val, vmm_val, vmm_scale); - uni_vmulps(vmm_val, vmm_val, vmm_modulo); // bc once - add(reg_weights, vlen); // 4 * sizeof(float) - } + uni_vmulps(vmm_val, vmm_val, vmm_fused_factor); // bc once if (attr_.post_ops_.len() != 0) { apply_post_ops(jcp_.dst_dt, 0); add(reg_oc_off, vlen); // vlen is related isa @@ -721,87 +643,88 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji } }; -MKLDNNNormalizeNode::MKLDNNNormalizeNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) - : MKLDNNNode(layer, eng, cache), src_data_size(0lu), dst_data_size(0lu), weights_data_size(0lu), - input_prec(Precision::UNSPECIFIED), output_prec(Precision::UNSPECIFIED), weights_prec(Precision::UNSPECIFIED) {} - -void MKLDNNNormalizeNode::getSupportedDescriptors() { - if (!descs.empty()) - return; - - std::string errPrefix = "Normalize node with name '" + getName() + "' "; - if (getParentEdges().size() != 1) - IE_THROW() << errPrefix << " has incorrect number of input edges: " << getParentEdges().size(); - if (getChildEdges().empty()) - IE_THROW() << errPrefix << " has incorrect number of output edges: " << getChildEdges().size(); - - if (getParentEdgeAt(0)->getDims().ndims() > 4 || getParentEdgeAt(0)->getDims().ndims() < 2) { - IE_THROW() << errPrefix << "has invalid input shape. Normalize supports from 2D to 4D blobs."; +MKLDNNNormalizeL2Node::MKLDNNNormalizeL2Node(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : + MKLDNNNode(op, eng, cache), src_data_size(0lu), dst_data_size(0lu), input_prec(Precision::UNSPECIFIED), output_prec(Precision::UNSPECIFIED) { + std::string errorMessage; + if (isSupportedOperation(op, errorMessage)) { + errorPrefix = "NormalizeL2 node with name '" + getName() + "' "; + eps = std::dynamic_pointer_cast(op)->get_eps(); + across_spatial = ngraph::shape_size(op->get_input_shape(AXES)) != 1; + } else { + IE_THROW(NotImplemented) << errorMessage; } +} - auto *layer = getCnnLayer().get(); - if (layer == nullptr) - IE_THROW() << errPrefix << " has nullable CnnLayer."; - across_spatial = layer->GetParamAsBool("across_spatial", false); - channel_shared = layer->GetParamAsBool("channel_shared", false); - eps = layer->GetParamAsFloat("eps"); - - MemoryBlob::Ptr tweights = as(layer->blobs.at("weights")); - if (!tweights) { - IE_THROW() << errPrefix << "has not initialized weights or they cannot be casted to MemoryBlob."; - } +bool MKLDNNNormalizeL2Node::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto norm = std::dynamic_pointer_cast(op); + if (!norm) { + errorMessage = "CPU plug-in supports NormalizeL2 node only from opset1. Node name: " + op->get_friendly_name(); + return false; + } + std::string errorPrefix = "NormalizeL2 node with name '" + op->get_friendly_name() + "' "; + const auto dataDims = norm->get_input_shape(DATA); + if (dataDims.size() < 2 && dataDims.size() > 4) { + errorMessage = errorPrefix + "doesn't support 'data' input with rank: " + std::to_string(dataDims.size()); + return false; + } + const auto axesNode = std::dynamic_pointer_cast(norm->get_input_node_shared_ptr(AXES)); + if (!axesNode) { + errorMessage = errorPrefix + "supports only constant 'axes' input"; + return false; + } - auto inData = getCnnLayer()->insData[0].lock(); - if (inData == nullptr) { - IE_THROW() << errPrefix << "has nullable input data."; - } - const auto& inDims = inData->getDims(); - if (inDims.size() < 2) - IE_THROW() << errPrefix << "has unsupported layout: '" << inData->getLayout() << "'."; - const size_t channels = inDims[1]; - const auto weightsSize = tweights->size(); - if (weightsSize != channels) { - if (weightsSize == 1) { - channel_shared = true; - } else { - IE_THROW() << errPrefix << "has unsupported broadcast type. Channels size: " << channels << "; Weights size: " << weightsSize; + const auto isSupportedAxes = [](const std::vector &axes, const ngraph::Shape &dataDims) { + if (axes.size() == 1 && axes[0] == 1) { + return true; + } else if (axes.size() == dataDims.size() - 1) { + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] != i + 1) + return false; + } + return true; + } + return false; + }; + if (!isSupportedAxes(axesNode->cast_vector(), dataDims)) { + errorMessage = errorPrefix + "supports only per channel or per channel and per spatial reduction"; + return false; + } + if (norm->get_eps_mode() != ngraph::op::EpsMode::ADD) { + errorMessage = errorPrefix + "supports only eps_mode = add"; + return false; } + } catch (...) { + return false; } + return true; +} - weights_prec = tweights->getTensorDesc().getPrecision(); - if (weights_prec != Precision::FP32 && weights_prec != Precision::BF16) { - // Unknown non supported data type, return an error - IE_THROW() << layer->name << "Weights for layer Normalize with name '" << layer->name << - "' has unsupported data type " << tweights->getTensorDesc().getPrecision(); - } +void MKLDNNNormalizeL2Node::getSupportedDescriptors() { + if (!descs.empty()) + return; + + if (getParentEdges().size() != 2) + IE_THROW() << errorPrefix << " has incorrect number of input edges: " << getParentEdges().size(); + if (getChildEdges().empty()) + IE_THROW() << errorPrefix << " has incorrect number of output edges: " << getChildEdges().size(); - TensorDesc td(Precision::FP32, tweights->getTensorDesc().getDims(), tweights->getTensorDesc().getLayout()); - weights_blob = make_shared_blob(td); - weights_blob->allocate(); - float* dst = weights_blob->wmap(); - if (weights_prec == Precision::FP32) { - float* src = layer->blobs.at("weights")->buffer(); - cpu_memcpy(dst, src, layer->blobs.at("weights")->byteSize()); - } else if (weights_prec == Precision::BF16) { - short* bf16src = tweights->rmap().as(); - cpu_convert(bf16src, dst, Precision::BF16, Precision::FP32, weights_blob->size()); + if (getParentEdgeAt(0)->getDims().ndims() > 4 || getParentEdgeAt(0)->getDims().ndims() < 2) { + IE_THROW() << errorPrefix << "has invalid input shape. Normalize supports from 2D to 4D blobs."; } } -void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() { +void MKLDNNNormalizeL2Node::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; setPostOps(attr, true); - Precision inputPrecision = getCnnLayer()->insData[0].lock()->getPrecision(); - Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision(); + Precision inputPrecision = getOriginalInputPrecisions()[DATA]; + Precision outputPrecision = getOriginalOutputPrecisions()[DATA]; if (!fusedWith.empty()) { - auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer(); - if (lastFusedLayer) { - outputPrecision = lastFusedLayer->outData[0]->getPrecision(); - } + outputPrecision = fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisions()[0]; } if (inputPrecision == Precision::BF16 || outputPrecision == Precision::BF16) { @@ -811,53 +734,38 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() { inputPrecision = outputPrecision = Precision::BF16; } - auto isOneOf = [&](InferenceEngine::Precision precision, std::vector precisions) { - for (auto p : precisions) { - if (precision == p) { - return true; - } - } - return false; - }; - if (!isOneOf(inputPrecision, {Precision::FP32, Precision::BF16, Precision::I8, Precision::U8})) { - IE_THROW() << "Unsupported input precision. " << getName(); - } - if (!isOneOf(outputPrecision, {Precision::FP32, Precision::BF16, Precision::I8, Precision::U8})) { - IE_THROW() << "Unsupported output precision. " << getName(); + if (!one_of(inputPrecision, Precision::FP32, Precision::BF16, Precision::I8, Precision::U8)) { + IE_THROW() << errorPrefix << "has unsupported input precision. " << getName(); } - if (!isOneOf(weights_prec, {Precision::FP32, Precision::BF16})) { - IE_THROW() << "Unsupported wights precision. " << getName(); + if (!one_of(outputPrecision, Precision::FP32, Precision::BF16, Precision::I8, Precision::U8)) { + IE_THROW() << errorPrefix << "has unsupported output precision. " << getName(); } auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision); auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision); - auto weightsDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(weights_prec); input_prec = inputPrecision; output_prec = outputPrecision; src_data_size = MKLDNNExtensionUtils::sizeOfDataType(inputDataType); dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(outputDataType); - weights_data_size = MKLDNNExtensionUtils::sizeOfDataType(weightsDataType); - bool canBeInplace = src_data_size == dst_data_size && getParentEdgeAt(0)->getParent()->getChildEdges().size() == 1; + bool canBeInplace = src_data_size == dst_data_size && getParentEdgeAt(DATA)->getParent()->getChildEdges().size() == 1; - InferenceEngine::LayerConfig config; + LayerConfig config; config.dynBatchSupport = false; - config.inConfs.resize(1); + config.inConfs.resize(2); config.outConfs.resize(1); - config.inConfs[0].constant = false; - config.outConfs[0].constant = false; - config.inConfs[0].inPlace = -1; config.outConfs[0].inPlace = canBeInplace ? 0 : -1; auto pushDesc = [&](memory::format_tag format) { - config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format); - config.outConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), outputDataType, format); + config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(DATA)->getDims(), inputDataType, format); + config.inConfs[1].desc = MKLDNNMemoryDesc(getParentEdgeAt(AXES)->getDims(), memory::data_type::s32, memory::format_tag::x); + config.outConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(DATA)->getDims(), outputDataType, format); supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, format}); }; // only plain layout support when w/o sse42 - if (getParentEdgeAt(0)->getDims().ndims() == 4) { + if (getParentEdgeAt(DATA)->getDims().ndims() == 4) { if (mayiuse(cpu::x64::sse41)) { pushDesc(memory::format_tag::nhwc); if (mayiuse(cpu::x64::avx512_common)) { @@ -869,10 +777,35 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() { } if (canBeInplace) config.inConfs[0].inPlace = 0; - pushDesc(MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims())); + pushDesc(MKLDNNMemory::GetPlainFormat(getChildEdgeAt(DATA)->getDims())); } -void MKLDNNNormalizeNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) { +bool MKLDNNNormalizeL2Node::canFuse(const MKLDNNNodePtr& node) const { + auto isConvertedToScaleShift = [](MKLDNNNodePtr node) { + return one_of(node->getAlgorithm(), EltwiseAdd, EltwiseMultiply, EltwiseSubtract, EltwiseDivide, EltwisePrelu) && + node->getParentEdgeAt(1)->getParent()->isConstant() && + MKLDNNExtensionUtils::isPerTensorOrPerChannelBroadcastable(node->getParentEdgeAt(0)->getDims().ToSizeVector(), + node->getParentEdgeAt(1)->getDims().ToSizeVector()); + }; + + if (node->getType() == Quantize) { + auto* quantizeNode = dynamic_cast(node.get()); + if (quantizeNode == nullptr) + IE_THROW() << "Cannot get quantize layer " << node->getName(); + return !quantizeNode->isBinarization(); + } else if (node->getType() == Eltwise) { + return one_of(node->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseBoundedRelu, EltwiseClamp, EltwiseTanh, + EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven, + EltwiseRoundHalfAwayFromZero, EltwiseLinear, EltwiseAbs, EltwiseSquare, EltwiseSqrt) || + isConvertedToScaleShift(node); + // TODO [NM]: implemented after enabling MulAdd operation + // ((eltwiseNode->getOpType() == MulAdd && eltwiseNode->getCnnLayer()->blobs.size() == 2) + } + + return false; +} + +void MKLDNNNormalizeL2Node::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) { mkldnn::post_ops ops; for (auto &node : fusedWith) { @@ -894,15 +827,15 @@ void MKLDNNNormalizeNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeig attr.set_post_ops(ops); } -void MKLDNNNormalizeNode::createPrimitive() { - auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); - auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); +void MKLDNNNormalizeL2Node::createPrimitive() { + auto& dstMemPtr = getChildEdgeAt(DATA)->getMemoryPtr(); + auto& srcMemPtr = getParentEdgeAt(DATA)->getMemoryPtr(); if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) - IE_THROW() << "Destination memory didn't allocate."; + IE_THROW() << errorPrefix << "has didn't allocated destination memory"; if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) - IE_THROW() << "Input memory didn't allocate."; + IE_THROW() << errorPrefix << "has didn't allocated input memory"; if (getSelectedPrimitiveDescriptor() == nullptr) - IE_THROW() << "Preferable primitive descriptor is not set."; + IE_THROW() << errorPrefix << "has nullable preferable primitive descriptor"; auto selectedPD = getSelectedPrimitiveDescriptor(); jcp.src_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[0].desc.getPrecision()); @@ -920,7 +853,6 @@ void MKLDNNNormalizeNode::createPrimitive() { } jcp.across_spatial = across_spatial; - jcp.channel_shared = channel_shared; auto dims = getParentEdgeAt(0)->getDesc().getDims(); size_t dims_size = dims.size(); jcp.n = (dims_size > 0) ? dims[0] : 1lu; @@ -960,16 +892,16 @@ void MKLDNNNormalizeNode::createPrimitive() { namespace { struct NormalizeContext { - MKLDNNNormalizeNode &node; + MKLDNNNormalizeL2Node &node; const uint8_t *src; uint8_t *dst; - const InferenceEngine::SizeVector& dims; + const SizeVector& dims; }; } // namespace template -struct MKLDNNNormalizeNode::NormalizeExecute { +struct MKLDNNNormalizeL2Node::NormalizeExecute { using src_t = typename std::tuple_element<0, T>::type; using dst_t = typename std::tuple_element<1, T>::type; @@ -980,13 +912,13 @@ struct MKLDNNNormalizeNode::NormalizeExecute { } }; -void MKLDNNNormalizeNode::execute(mkldnn::stream strm) { - auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); - auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); +void MKLDNNNormalizeL2Node::execute(mkldnn::stream strm) { + auto &srcMemPtr = getParentEdgeAt(DATA)->getMemoryPtr(); + auto &dstMemPtr = getChildEdgeAt(DATA)->getMemoryPtr(); const uint8_t *src_ptr = reinterpret_cast(srcMemPtr->GetPtr()); uint8_t *dst_ptr = reinterpret_cast(dstMemPtr->GetPtr()); - auto dims = getParentEdgeAt(0)->getDesc().getDims(); + auto dims = getParentEdgeAt(DATA)->getDesc().getDims(); NormalizeContext ctx = { *this, @@ -1009,7 +941,7 @@ void MKLDNNNormalizeNode::execute(mkldnn::stream strm) { } template -void MKLDNNNormalizeNode::normalize_nchw(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) { +void MKLDNNNormalizeL2Node::normalize_nchw(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) { size_t blk_size = 1; // elt in vmm if (mayiuse(cpu::x64::avx512_common)) { blk_size = 16; @@ -1024,7 +956,6 @@ void MKLDNNNormalizeNode::normalize_nchw(const in_data_t* src_data, out_data_t* size_t H = (dims_size > 2) ? dims[2] : 1lu; size_t C = (dims_size > 1) ? dims[1] : 1lu; size_t B = (dims_size > 0) ? dims[0] : 1lu; - float *weights = weights_blob->buffer().as(); for (size_t b = 0lu; b < B; b++) { const in_data_t *src_data_b = src_data + b * C * H * W; @@ -1062,11 +993,10 @@ void MKLDNNNormalizeNode::normalize_nchw(const in_data_t* src_data, out_data_t* parallel_for(C, [&](size_t ic) { const in_data_t *src_data_bc = src_data_b + ic * H * W; out_data_t *dst_data_bc = dst_data_b + ic * H * W; - float fused_weight_modulo = channel_shared ? (weights[0] * modulo_inv) : (weights[ic] * modulo_inv); auto arg = jit_normalize_call_args(); arg.src = src_data_bc; arg.dst = dst_data_bc; - arg.fused_factor = static_cast(&fused_weight_modulo); // broadcast once + arg.fused_factor = static_cast(&modulo_inv); // broadcast once arg.oc_off = ic * sizeof(float); arg.work_amount = static_cast(W * H); (*normalize_kernel)(&arg); @@ -1097,8 +1027,6 @@ void MKLDNNNormalizeNode::normalize_nchw(const in_data_t* src_data, out_data_t* for (size_t m = 0; m < H * W; m++) { moduloM[m] = 1.0f / (std::sqrt(moduloM[m]) + eps); - if (channel_shared) - moduloM[m] = moduloM[m] * weights[0]; } // normalize @@ -1108,12 +1036,7 @@ void MKLDNNNormalizeNode::normalize_nchw(const in_data_t* src_data, out_data_t* auto arg = jit_normalize_call_args(); arg.src = src_data_bc; arg.dst = dst_data_bc; - if (channel_shared) { - arg.fused_factor = static_cast(&moduloM[0]); // ld dynamic - } else { - arg.modulo = static_cast(&moduloM[0]); // ld dynamic - arg.weights = static_cast(&weights[ic]); // bc once - } + arg.fused_factor = static_cast(&moduloM[0]); // ld dynamic arg.oc_off = ic * sizeof(float); arg.work_amount = static_cast(W * H); (*normalize_kernel)(&arg); @@ -1123,13 +1046,12 @@ void MKLDNNNormalizeNode::normalize_nchw(const in_data_t* src_data, out_data_t* } template -void MKLDNNNormalizeNode::normalize_nchw_ref(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) { +void MKLDNNNormalizeL2Node::normalize_nchw_ref(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) { size_t dims_size = dims.size(); size_t W = (dims_size > 3) ? dims[3] : 1lu; size_t H = (dims_size > 2) ? dims[2] : 1lu; size_t C = (dims_size > 1) ? dims[1] : 1lu; size_t B = (dims_size > 0) ? dims[0] : 1lu; - float *weights = weights_blob->buffer().as(); for (size_t b = 0lu; b < B; b++) { const in_data_t *src_data_b = src_data + b * C * H * W; @@ -1154,9 +1076,8 @@ void MKLDNNNormalizeNode::normalize_nchw_ref(const in_data_t* src_data, out_data parallel_for(C, [&](size_t ic) { const in_data_t *src_data_bc = src_data_b + ic * H * W; out_data_t *dst_data_bc = dst_data_b + ic * H * W; - float fused_weight_modulo = channel_shared ? (weights[0] * modulo_inv) : (weights[ic] * modulo_inv); for (size_t m = 0; m < W * H; m++) { - float dst_value = src_data_bc[m] * fused_weight_modulo; + float dst_value = src_data_bc[m] * modulo_inv; apply_post_ops_scalar(dst_value, ic); if (output_prec == Precision::U8) { dst_data_bc[m] = (dst_value >= 0) ? dst_value : 0; @@ -1181,8 +1102,6 @@ void MKLDNNNormalizeNode::normalize_nchw_ref(const in_data_t* src_data, out_data for (size_t m = 0; m < H * W; m++) { moduloM[m] = 1.0f / (std::sqrt(moduloM[m]) + eps); - if (channel_shared) - moduloM[m] = moduloM[m] * weights[0]; } // normalize @@ -1190,8 +1109,7 @@ void MKLDNNNormalizeNode::normalize_nchw_ref(const in_data_t* src_data, out_data const in_data_t *src_data_bc = src_data_b + ic * H * W; out_data_t *dst_data_bc = dst_data_b + ic * H * W; for (size_t m = 0; m < W * H; m++) { - float dst_value = channel_shared ? src_data_bc[m] * moduloM[m] : - src_data_bc[m] * moduloM[m] * weights[ic]; + float dst_value = src_data_bc[m] * moduloM[m]; apply_post_ops_scalar(dst_value, ic); if (output_prec == Precision::U8) { dst_data_bc[m] = (dst_value >= 0) ? dst_value : 0; @@ -1205,7 +1123,7 @@ void MKLDNNNormalizeNode::normalize_nchw_ref(const in_data_t* src_data, out_data } template -void MKLDNNNormalizeNode::normalize_nhwc(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) { +void MKLDNNNormalizeL2Node::normalize_nhwc(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) { size_t blk_size = 1; // elt in vmm if (mayiuse(cpu::x64::avx512_common)) { blk_size = 16; @@ -1220,7 +1138,6 @@ void MKLDNNNormalizeNode::normalize_nhwc(const in_data_t* src_data, out_data_t* size_t H = (dims_size > 2) ? dims[2] : 1lu; size_t C = (dims_size > 1) ? dims[1] : 1lu; size_t B = (dims_size > 0) ? dims[0] : 1lu; - float *weights = weights_blob->buffer().as(); for (size_t b = 0lu; b < B; b++) { const in_data_t *src_data_b = src_data + b * C * H * W; @@ -1254,36 +1171,17 @@ void MKLDNNNormalizeNode::normalize_nhwc(const in_data_t* src_data, out_data_t* float modulo_inv = 1.0f / (modulo + eps); // normalize - if (channel_shared) { - float fused_weight_modulo = weights[0] * modulo_inv; - parallel_for2d(H, W, [&](int ih, int iw) { - const in_data_t *src_data_bhw = src_data_b + ih * C * W + iw * C; - out_data_t *dst_data_bhw = dst_data_b + ih * C * W + iw * C; - auto arg = jit_normalize_call_args(); - arg.src = src_data_bhw; - arg.dst = dst_data_bhw; - arg.fused_factor = static_cast(&fused_weight_modulo); // bc static - arg.oc_off = 0; - arg.work_amount = static_cast(C); - (*normalize_kernel)(&arg); - }); - } else { // channel_shared=false - std::vector fused_weight_modulo(C); - for (size_t c = 0; c < C; c++) { - fused_weight_modulo[c] = weights[c] * modulo_inv; - } - parallel_for2d(H, W, [&](int ih, int iw) { - const in_data_t *src_data_bhw = src_data_b + ih * C * W + iw * C; - out_data_t *dst_data_bhw = dst_data_b + ih * C * W + iw * C; - auto arg = jit_normalize_call_args(); - arg.src = src_data_bhw; - arg.dst = dst_data_bhw; - arg.fused_factor = static_cast(&fused_weight_modulo[0]); // ld dynamic - arg.oc_off = 0; - arg.work_amount = static_cast(C); - (*normalize_kernel)(&arg); - }); - } + parallel_for2d(H, W, [&](int ih, int iw) { + const in_data_t *src_data_bhw = src_data_b + ih * C * W + iw * C; + out_data_t *dst_data_bhw = dst_data_b + ih * C * W + iw * C; + auto arg = jit_normalize_call_args(); + arg.src = src_data_bhw; + arg.dst = dst_data_bhw; + arg.fused_factor = static_cast(&modulo_inv); // bc static + arg.oc_off = 0; + arg.work_amount = static_cast(C); + (*normalize_kernel)(&arg); + }); } else { // for across_spatial=false parallel_for2d(H, W, [&](int ih, int iw) { // modulo @@ -1309,14 +1207,7 @@ void MKLDNNNormalizeNode::normalize_nhwc(const in_data_t* src_data, out_data_t* // normalize arg.dst = dst_data_bhw; - float fused_weight_modulo = 0; - if (channel_shared) { - fused_weight_modulo = modulo_inv * weights[0]; - arg.fused_factor = static_cast(&fused_weight_modulo); // bc static - } else { - arg.modulo = static_cast(&modulo_inv); // bc static - arg.weights = static_cast(&weights[0]); // ld dynamic - } + arg.fused_factor = static_cast(&modulo_inv); // bc static arg.work_amount = C; arg.oc_off = 0; (*normalize_kernel)(&arg); @@ -1326,7 +1217,7 @@ void MKLDNNNormalizeNode::normalize_nhwc(const in_data_t* src_data, out_data_t* } template -void MKLDNNNormalizeNode::normalize_blk(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) { +void MKLDNNNormalizeL2Node::normalize_blk(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) { size_t blk_size = 1; // channel blk for memory layout if (mayiuse(cpu::x64::avx512_common)) { blk_size = 16; @@ -1341,17 +1232,9 @@ void MKLDNNNormalizeNode::normalize_blk(const in_data_t* src_data, out_data_t* d size_t H = (dims_size > 2) ? dims[2] : 1lu; size_t C = (dims_size > 1) ? dims[1] : 1lu; size_t B = (dims_size > 0) ? dims[0] : 1lu; - float *weights = weights_blob->buffer().as(); size_t CB = div_up(C, blk_size); - // normalize for tails: data is padding, norm weight is padding, so tails as vector for normalize; - // post ops for tails: post-ops params is padding. - std::vector weights_padding(CB * blk_size); - if (!channel_shared) { - cpu_memcpy(static_cast(&weights_padding[0]), weights, C * sizeof(float)); - } - for (size_t b = 0lu; b < B; b++) { const in_data_t *src_data_b = src_data + b * CB * H * W * blk_size; out_data_t *dst_data_b = dst_data + b * CB * H * W * blk_size; @@ -1386,36 +1269,17 @@ void MKLDNNNormalizeNode::normalize_blk(const in_data_t* src_data, out_data_t* d float modulo_inv = 1.0f / (modulo + eps); // normalize - if (channel_shared) { - float fused_weight_modulo = weights[0] * modulo_inv; - parallel_for2d(CB, H, [&](size_t cb, size_t h) { - const in_data_t *src_data_b_cb_h = src_data_b + cb * H * W * blk_size + h * W * blk_size; - out_data_t *dst_data_b_cb_h = dst_data_b + cb * H * W * blk_size + h * W * blk_size; - auto arg = jit_normalize_call_args(); - arg.src = src_data_b_cb_h; - arg.dst = dst_data_b_cb_h; - arg.fused_factor = static_cast(&fused_weight_modulo); // broadcast once - arg.work_amount = static_cast(W); - arg.oc_off = cb * blk_size * sizeof(float); - (*normalize_kernel)(&arg); - }); - } else { - std::vector fused_weight_modulo(weights_padding.size(), 0); - for (size_t c = 0; c < C; c++) { - fused_weight_modulo[c] = weights_padding[c] * modulo_inv; - } - parallel_for2d(CB, H, [&](size_t cb, size_t h) { - const in_data_t *src_data_b_cb_h = src_data_b + cb * H * W * blk_size + h * W * blk_size; - out_data_t *dst_data_b_cb_h = dst_data_b + cb * H * W * blk_size + h * W * blk_size; - auto arg = jit_normalize_call_args(); - arg.src = src_data_b_cb_h; - arg.dst = dst_data_b_cb_h; - arg.fused_factor = static_cast(&fused_weight_modulo[cb * blk_size]); // load once - arg.work_amount = static_cast(W); - arg.oc_off = cb * blk_size * sizeof(float); - (*normalize_kernel)(&arg); - }); - } + parallel_for2d(CB, H, [&](size_t cb, size_t h) { + const in_data_t *src_data_b_cb_h = src_data_b + cb * H * W * blk_size + h * W * blk_size; + out_data_t *dst_data_b_cb_h = dst_data_b + cb * H * W * blk_size + h * W * blk_size; + auto arg = jit_normalize_call_args(); + arg.src = src_data_b_cb_h; + arg.dst = dst_data_b_cb_h; + arg.fused_factor = static_cast(&modulo_inv); // broadcast once + arg.work_amount = static_cast(W); + arg.oc_off = cb * blk_size * sizeof(float); + (*normalize_kernel)(&arg); + }); } else { // across_spatial: false parallel_for2d(H, W, [&](size_t ih, size_t iw) { // modulo @@ -1443,14 +1307,7 @@ void MKLDNNNormalizeNode::normalize_blk(const in_data_t* src_data, out_data_t* d // normalize arg.dst = dst_data_bhw; - float fused_weight_modulo = 0; - if (channel_shared) { - fused_weight_modulo = weights[0] * modulo_inv; - arg.fused_factor = static_cast(&fused_weight_modulo); // broadcast - } else { - arg.weights = static_cast(&weights_padding[0]); // load - arg.modulo = static_cast(&modulo_inv); // broadcast - } + arg.fused_factor = static_cast(&modulo_inv); // broadcast arg.work_amount = CB; arg.oc_off = 0; (*normalize_kernel)(&arg); @@ -1460,7 +1317,7 @@ void MKLDNNNormalizeNode::normalize_blk(const in_data_t* src_data, out_data_t* d } template -void MKLDNNNormalizeNode::normalize_function(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) { +void MKLDNNNormalizeL2Node::normalize_function(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) { if (mayiuse(cpu::x64::sse41) && normalize_modulo_kernel && normalize_kernel) { if (jcp.is_nchw) { normalize_nchw(src_data, dst_data, dims); @@ -1469,18 +1326,18 @@ void MKLDNNNormalizeNode::normalize_function(const in_data_t* src_data, out_data } else if (jcp.is_blk) { normalize_blk(src_data, dst_data, dims); } else { - IE_THROW() << "The selected layout is not supported."; + IE_THROW() << errorPrefix << "has selected layout which is not supported."; } } else { if (jcp.is_nchw) { normalize_nchw_ref(src_data, dst_data, dims); } else { - IE_THROW() << "Only support plain layout on machine w/o sse42."; + IE_THROW() << errorPrefix << "supports only plain layout on machine w/o sse42."; } } } -inline void MKLDNNNormalizeNode::apply_post_ops_scalar(float &dst_value, int index_c) { +inline void MKLDNNNormalizeL2Node::apply_post_ops_scalar(float &dst_value, int index_c) { const auto &p = (*attr.get()).post_ops_; int eltwise_inj_idx = 0; int depthwise_inj_idx = 0; @@ -1521,8 +1378,8 @@ inline void MKLDNNNormalizeNode::apply_post_ops_scalar(float &dst_value, int ind } } -bool MKLDNNNormalizeNode::created() const { - return getType() == Normalize; +bool MKLDNNNormalizeL2Node::created() const { + return getType() == NormalizeL2; } -REG_MKLDNN_PRIM_FOR(MKLDNNNormalizeNode, Normalize); +REG_MKLDNN_PRIM_FOR(MKLDNNNormalizeL2Node, NormalizeL2); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.h index 8d6f33d477754a..3875936eb85af7 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.h @@ -20,7 +20,6 @@ struct jit_normalize_config_params { bool is_nhwc; bool is_blk; bool across_spatial; - bool channel_shared; mkldnn::memory::data_type src_dt; mkldnn::memory::data_type dst_dt; int src_data_size; @@ -31,7 +30,6 @@ struct jit_normalize_config_params { struct jit_normalize_call_args { const void *src; void *dst; - const float *weights; const float *modulo; const float *fused_factor; size_t src_stride; @@ -73,10 +71,10 @@ struct jit_uni_normalize_kernel { const mkldnn_primitive_attr &attr_; }; -class MKLDNNNormalizeNode : public MKLDNNNode { +class MKLDNNNormalizeL2Node : public MKLDNNNode { public: - MKLDNNNormalizeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); - ~MKLDNNNormalizeNode() override = default; + MKLDNNNormalizeL2Node(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + ~MKLDNNNormalizeL2Node() override = default; void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -87,6 +85,9 @@ class MKLDNNNormalizeNode : public MKLDNNNode { return false; } + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool canFuse(const MKLDNNNodePtr& node) const override; + private: template struct NormalizeExecute; @@ -109,13 +110,11 @@ class MKLDNNNormalizeNode : public MKLDNNNode { template void normalize_function(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims); - MemoryBlob::Ptr weights_blob; bool across_spatial = true; - bool channel_shared = true; float eps = 1e-10f; - InferenceEngine::Precision input_prec, output_prec, weights_prec; - size_t src_data_size, dst_data_size, weights_data_size; + InferenceEngine::Precision input_prec, output_prec; + size_t src_data_size, dst_data_size; mkldnn::primitive_attr attr; @@ -128,6 +127,11 @@ class MKLDNNNormalizeNode : public MKLDNNNode { std::vector> depthwise_injectors_ref; jit_normalize_config_params jcp = {}; + + static const size_t DATA = 0; + static const size_t AXES = 1; + + std::string errorPrefix; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/convolution.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/convolution.cpp index 597b6d053b7525..2d3ea301325274 100755 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/convolution.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/convolution.cpp @@ -108,7 +108,7 @@ const std::vector fusingParamsSet{ fusingElu, fusingSigmoid, fusingClamp, - fusingPRelu, + fusingPReluPerChannel, fusingSwish, fusingHSwish, fusingMish, @@ -128,7 +128,7 @@ const std::vector fusingParamsSetBF16{ fusingElu, fusingSigmoid, fusingClamp, - fusingPRelu, + fusingPReluPerChannel, fusingSwish, // other patterns fusingReluScaleShift, diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/group_convolution.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/group_convolution.cpp index b3267e7e19947e..1a63f7025a437a 100644 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/group_convolution.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/group_convolution.cpp @@ -119,7 +119,7 @@ std::vector fusingParamsSet { fusingElu, fusingSigmoid, fusingClamp, - fusingPRelu, + fusingPReluPerChannel, fusingSwish, fusingHSwish, fusingMish, diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/normalize.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/normalize.cpp index bda32ff8c96567..2ce34a6555c21a 100755 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/normalize.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/normalize.cpp @@ -48,20 +48,18 @@ class NormalizeL2LayerCPUTest : public testing::WithParamInterface(params)); auto normalize = builder::makeNormalizeL2(paramOuts[0], axes, eps, eps_mode); function = makeNgraphFunction(netPrc, params, normalize, "Normalize"); - selectedType = "unknown_" + std::string(netPrecision.name()); + selectedType = "unknown_" + std::string(inPrc.name()); threshold = 0.015f; - checkFusingPosition = false; } }; @@ -69,31 +67,25 @@ TEST_P(NormalizeL2LayerCPUTest, CompareWithRefs) { SKIP_IF_CURRENT_TEST_IS_DISABLED() Run(); - CheckPluginRelatedResults(executableNetwork, "Normalize"); + CheckPluginRelatedResults(executableNetwork, "NormalizeL2"); } namespace { /* ============= Common params ============= */ -const auto fusingMultiplySharedChannel = fusingSpecificParams{std::make_shared(std::vector{ - {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - SizeVector secondMultInShape(1, 1); - auto secondMultInput = builder::makeConstant(ngPrc, Shape(secondMultInShape), std::vector{}, true); - return std::make_shared(inpNode, secondMultInput); - }, "Multiply(SharedChannel)"}}), {"Multiply"}}; - -const auto fusingMultiplyNoSharedChannel = fusingSpecificParams{std::make_shared(std::vector{ - {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ - SizeVector secondMultInShape(inpNode->get_shape().size(), 1); - secondMultInShape[1] = inpNode->get_shape()[1]; - auto secondMultInput = builder::makeConstant(ngPrc, Shape(secondMultInShape), std::vector{}, true); - return std::make_shared(inpNode, secondMultInput); - }, "Multiply(NoSharedChannel)"}}), {"Multiply"}}; - std::vector fusingParamsSet { emptyFusingSpec, - fusingMultiplySharedChannel, - fusingMultiplyNoSharedChannel + fusingMultiplyPerTensor, + fusingMultiplyPerChannel, + fusingAddPerTensor, + fusingAddPerChannel, + fusingSubtractPerTensor, + fusingSubtractPerChannel, + fusingDividePerTensor, + fusingDividePerChannel, + fusingPReluPerChannel, + fusingPReluPerTensor, + fusingRelu }; const float epsilon = 1e-4f; diff --git a/inference-engine/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp b/inference-engine/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp index b084dacbd169b2..93971c9508cbab 100644 --- a/inference-engine/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp +++ b/inference-engine/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp @@ -74,23 +74,48 @@ class CpuTestWithFusing : public CPUTestsBase { /* FUSING PATTERNS */ const auto emptyFusingSpec = fusingSpecificParams{nullptr, {}}; + const auto fusingRelu = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Relu); }, "Relu"}}), {"Relu"}}; + const auto fusingElu = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Elu, {}, {2.0f}); }, "Elu"}}), {"Elu"}}; + +const auto fusingGelu = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Gelu); + }, "Gelu"}}), {"Gelu"}}; + const auto fusingSigmoid = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Sigmoid); }, "Sigmoid"}}), {"Sigmoid"}}; + const auto fusingClamp = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Clamp, {}, {3.0f, 6.0f}); }, "Clamp"}}), {"Clamp"}}; -const auto fusingPRelu = fusingSpecificParams{std::make_shared(std::vector{ + +const auto fusingTanh = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Tanh); + }, "Tanh"}}), {"Tanh"}}; + +const auto fusingAbs = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Abs); + }, "Abs"}}), {"Abs"}}; + +const auto fusingSqrt = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Sqrt); + }, "Sqrt"}}), {"Sqrt"}}; + +const auto fusingPReluPerChannel = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ auto shape = inpNode->get_shape(); if (shape.size() == 1) @@ -100,6 +125,14 @@ const auto fusingPRelu = fusingSpecificParams{std::make_shared(std auto data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(newShape)); return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::LeakyRelu, newShape, data); }, "PRelu(PerChannel)"}}), {"PRelu"}}; + +const auto fusingPReluPerTensor = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + ngraph::Shape shape(1, 1); + auto data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape)); + return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::LeakyRelu, shape, data); + }, "PRelu(PerTensor)"}}), {"PRelu"}}; + const auto fusingSwish = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Swish, {}, {1.0f}); @@ -120,6 +153,22 @@ const auto fusingTanh = fusingSpecificParams{std::make_shared(std: {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Tanh, {}, {}); }, "Tanh"}}), {"Tanh"}}; + +const auto fusingHSwish = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::HSwish); + }, "HSwish"}}), {"HSwish"}}; + +const auto fusingMish = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Mish); + }, "Mish"}}), {"Mish"}}; + +const auto fusingHSigmoid = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::HSigmoid); + }, "HSigmoid"}}), {"HSigmoid"}}; + const auto fusingReluScaleShift = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Relu); @@ -142,6 +191,7 @@ const auto fusingReluScaleShift = fusingSpecificParams{std::make_shared(ngraph::element::f32, newShape, {}, true); return std::make_shared(inpNode, constNode); }, "Add(PerChannel)"}}), {"Relu", "Add"}}; + const auto fusingScaleShift = fusingSpecificParams{ std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) { auto shape = inpNode->get_shape(); @@ -161,6 +211,7 @@ const auto fusingScaleShift = fusingSpecificParams{ std::make_shared(ngraph::element::f32, newShape, {}, true); return std::make_shared(inpNode, constNode); }, "Add(PerChannel)"}}), {"Add"} }; + const auto fusingFakeQuantizePerChannel = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ auto localPrc = inpNode->get_element_type(); @@ -171,6 +222,7 @@ const auto fusingFakeQuantizePerChannel = fusingSpecificParams{std::make_shared< newShape[1] = shape[1]; return ngraph::builder::makeFakeQuantize(inpNode, localPrc, 256, newShape); }, "FakeQuantize(PerChannel)"}}), {"FakeQuantize"}}; + const auto fusingFakeQuantizePerChannelRelu = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ auto localPrc = inpNode->get_element_type(); @@ -184,6 +236,7 @@ const auto fusingFakeQuantizePerChannelRelu = fusingSpecificParams{std::make_sha {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Relu); }, "Relu"}}), {"FakeQuantize", "Relu"}}; + const auto fusingFakeQuantizePerTensorRelu = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) { auto localPrc = inpNode->get_element_type(); @@ -193,6 +246,7 @@ const auto fusingFakeQuantizePerTensorRelu = fusingSpecificParams{std::make_shar {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Relu); }, "Relu"}}), {"FakeQuantize", "Relu"}}; + const auto fusingSum = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ auto shape = inpNode->get_shape(); @@ -202,6 +256,7 @@ const auto fusingSum = fusingSpecificParams{std::make_shared(std:: ngraph::helpers::castOps2Nodes(newParams)); return std::make_shared(inpNode, newParamOuts[0]); }, "Add(Parameters)"}}), {"Add"}}; + const auto fusingSumEluFQ = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ auto shape = inpNode->get_shape(); @@ -219,4 +274,65 @@ const auto fusingSumEluFQ = fusingSpecificParams{std::make_shared( auto newShape = ngraph::Shape(inpNode->get_shape().size(), 1); return ngraph::builder::makeFakeQuantize(inpNode, localPrc, 256, newShape); }, "FakeQuantize(PerTensor)"}}), {"Add", "Elu", "FakeQuantize"}}; + +const auto fusingMultiplyPerTensor = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + ngraph::Shape secondMultInShape(1, 1); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + return std::make_shared(inpNode, secondMultInput); + }, "Multiply(PerTensor)"}}), {"Multiply"}}; + +const auto fusingMultiplyPerChannel = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + ngraph::Shape secondMultInShape(inpNode->get_shape().size(), 1); + secondMultInShape[1] = inpNode->get_shape()[1]; + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + return std::make_shared(inpNode, secondMultInput); + }, "Multiply(PerChannel)"}}), {"Multiply"}}; + +const auto fusingAddPerTensor = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + ngraph::Shape secondMultInShape(1, 1); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + return std::make_shared(inpNode, secondMultInput); + }, "Add(PerTensor)"}}), {"Add"}}; + +const auto fusingAddPerChannel = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + ngraph::Shape secondMultInShape(inpNode->get_shape().size(), 1); + secondMultInShape[1] = inpNode->get_shape()[1]; + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + return std::make_shared(inpNode, secondMultInput); + }, "Add(PerChannel)"}}), {"Add"}}; + +const auto fusingSubtractPerTensor = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + ngraph::Shape secondMultInShape(1, 1); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + return std::make_shared(inpNode, secondMultInput); + }, "Subtract(PerTensor)"}}), {"Subtract"}}; + +const auto fusingSubtractPerChannel = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + ngraph::Shape secondMultInShape(inpNode->get_shape().size(), 1); + secondMultInShape[1] = inpNode->get_shape()[1]; + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + return std::make_shared(inpNode, secondMultInput); + }, "Subtract(PerChannel)"}}), {"Subtract"}}; + +const auto fusingDividePerTensor = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + ngraph::Shape secondMultInShape(1, 1); + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + return std::make_shared(inpNode, secondMultInput); + }, "Divide(PerTensor)"}}), {"Divide"}}; + +const auto fusingDividePerChannel = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + ngraph::Shape secondMultInShape(inpNode->get_shape().size(), 1); + secondMultInShape[1] = inpNode->get_shape()[1]; + auto secondMultInput = ngraph::builder::makeConstant(ngPrc, ngraph::Shape(secondMultInShape), std::vector{}, true); + return std::make_shared(inpNode, secondMultInput); + }, "Divide(PerChannel)"}}), {"Divide"}}; + } // namespace CPUTestUtils diff --git a/inference-engine/tests/unit/CMakeLists.txt b/inference-engine/tests/unit/CMakeLists.txt index f729ae89e1b2e9..ff3570a7980969 100644 --- a/inference-engine/tests/unit/CMakeLists.txt +++ b/inference-engine/tests/unit/CMakeLists.txt @@ -12,9 +12,9 @@ endif() add_subdirectory(inference_engine) -if (ENABLE_MKL_DNN) - add_subdirectory(cpu) -endif () +# if (ENABLE_MKL_DNN) +# add_subdirectory(cpu) +# endif () if (ENABLE_GNA) add_subdirectory(gna) diff --git a/inference-engine/tests_deprecated/CMakeLists.txt b/inference-engine/tests_deprecated/CMakeLists.txt index 31f5a9f7f96add..e64f14eca43779 100644 --- a/inference-engine/tests_deprecated/CMakeLists.txt +++ b/inference-engine/tests_deprecated/CMakeLists.txt @@ -14,14 +14,14 @@ if (ENABLE_GAPI_TESTS) add_subdirectory(fluid_preproc) endif() -if (ENABLE_FUNCTIONAL_TESTS) - add_subdirectory(functional) -endif() +# if (ENABLE_FUNCTIONAL_TESTS) +# add_subdirectory(functional) +# endif() -if (ENABLE_BEH_TESTS) - add_subdirectory(behavior) -endif() +# if (ENABLE_BEH_TESTS) +# add_subdirectory(behavior) +# endif() -if(ENABLE_TESTS) - add_subdirectory(unit) -endif() +# if(ENABLE_TESTS) +# add_subdirectory(unit) +# endif()