Performance problems fixes. Part 2 (openvinotoolkit#50)

* Performance problems fixes. Part 2
* additional fixes
* dw fixes
* int8 pooling fusing fix
* moved transformation to ngraph
* [CPU] Select node migration on nGraph
* [CPU] DepthToSpace nodes migration on nGraph
* [CPU] SpaceToDepth nodes migration on nGraph
* added check that op is supported
mandrono · Apr 19, 2021 · 0849f6c · 0849f6c
1 parent 5f4ad55
commit 0849f6c
Show file tree

Hide file tree

Showing 27 changed files with 902 additions and 416 deletions.
diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -54,7 +54,7 @@ set(LAYERS
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/ctc_greedy_decoder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/ctc_greedy_decoder_seq_len.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/ctc_loss.cpp
-#    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/depth_to_space.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/depth_to_space.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/detectionoutput.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/detectionoutput_onnx.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/embedding_bag_offset_sum.cpp
@@ -83,11 +83,11 @@ set(LAYERS
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/reorg_yolo.cpp
 #    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/reverse_sequence.cpp
 #    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/roifeatureextractor_onnx.cpp
-#    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/select.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/select.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/nodes/shuffle_channels.cpp
 #    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/simplernms.cpp
 #    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/space_to_batch.cpp
-#    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/space_to_depth.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/space_to_depth.cpp
 #    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/sparse_fill_empty_rows.cpp
 #    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/sparse_segment_reduce.cpp
 #    ${CMAKE_CURRENT_SOURCE_DIR}/nodes/sparse_weighted_reduce.cpp

diff --git a/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp
@@ -5,6 +5,7 @@
 #include "jit_eltwise_emitters.hpp"
 #include <cpu/x64/jit_uni_eltwise.hpp>
 #include <ngraph/opsets/opset1.hpp>
+#include <nodes/mkldnn_eltwise_node.h>
 
 using namespace InferenceEngine;
 using namespace mkldnn::impl::utils;
@@ -1303,13 +1304,16 @@ jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_
 
     prepare_table();
 }
+
 jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
 : jit_emitter(host, host_isa, node, exec_prc) {
-    IE_THROW() << "[NM] Not implemented";
-
-//    power = powerLayer->power;
-//    scale = powerLayer->scale;
-//    shift = powerLayer->offset;
+    const MKLDNNEltwiseNode *powerNode = dynamic_cast<const MKLDNNEltwiseNode *>(node);
+    if (powerNode == nullptr) {
+        IE_THROW() << "Can't cast to MKLDNNEltwiseNode";
+    }
+    power = powerNode->getAlpha();
+    scale = powerNode->getBeta();
+    shift = powerNode->getGamma();
 
     prepare_table();
 }

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
@@ -93,9 +93,8 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
     graph.SortTopologically();
     graph.RemoveDroppedEdges();
 
-// TODO [NM]: transformation should be implemented w/o using of CNNLayer
-//    FuseConvolutionAndDWConvolution(graph);
-//    graph.RemoveDroppedNodes();
+    FuseConvolutionAndDWConvolution(graph);
+    graph.RemoveDroppedNodes();
 
     FuseBinaryConvolutionAndFakeQuantize(graph);
     graph.RemoveDroppedNodes();
@@ -777,134 +776,120 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
 }
 
 void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
-    // auto& graphNodes = graph.GetNodes();
-
-    // auto isConvolutionNode = [](MKLDNNNodePtr node) {
-    //     return node->getType() == Convolution;
-    // };
-
-    // auto is1x1Convolution = [](ConvolutionLayer* layer) {
-    //     return layer->_kernel[X_AXIS] == 1 && layer->_kernel[Y_AXIS] == 1;
-    // };
-
-    // auto isSutableParentConvolution = [&](MKLDNNNodePtr node) {
-    //     auto *layer = dynamic_cast<ConvolutionLayer *>(node->getCnnLayer().get());
-    //     if (layer == nullptr)
-    //         IE_THROW() << "Cannot get convolution layer " << node->getName();
-
-    //     auto* parentConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
-    //     if (parentConvolutionNode == nullptr)
-    //         IE_THROW() << "Cannot get convolution node " << node->getName();
-
-    //     if (!parentConvolutionNode->weightsZeroPoints.empty())
-    //         return false;
-
-        // // TODO [oneDNN]: is it still valide constrain on conv to fuse in?
-        // bool isSupportedParams = layer->_group == 1 &&
-        //         is1x1Convolution(layer) &&  // TODO [oneDNN] : fusing is permitted only with 1x1 convolutions
-        //         everyone_is(1, layer->_stride[X_AXIS], layer->_stride[Y_AXIS]) &&
-        //         everyone_is(Precision::FP32, layer->insData[0].lock()->getPrecision(), layer->outData[0].get()->getPrecision()) &&
-        //         node->getChildEdgeAt(0)->getDims().ndims() == 4;
-        // if (!isSupportedParams) return false;
-
-    //     return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild());
-    // };
+    auto& graphNodes = graph.GetNodes();
 
-    // auto isSutableChildConvolution = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
-    //     auto* childLayer = dynamic_cast<ConvolutionLayer*>(childNode->getCnnLayer().get());
-    //     if (childLayer == nullptr)
-    //         IE_THROW() << "Cannot get convolution layer " << childNode->getName();
+    auto isConvolutionNode = [](const MKLDNNNodePtr &node) {
+        return node->getType() == Convolution;
+    };
 
-    //     auto* parentLayer = dynamic_cast<ConvolutionLayer*>(parentNode->getCnnLayer().get());
-    //     if (parentLayer == nullptr)
-    //         IE_THROW() << "Cannot get convolution layer " << parentNode->getName();
+    auto is1x1Convolution = [](const std::shared_ptr<MKLDNNConvolutionNode> &conv) {
+        const auto weightRank = conv->getWeightDims().size();
+        return conv->getWeightDims()[weightRank - 1] == 1 && conv->getWeightDims()[weightRank - 2] == 1;
+    };
 
-        // if (!everyone_is(Precision::FP32, parentLayer->outData[0].get()->getPrecision(), childLayer->insData[0].lock()->getPrecision(),
-        //         childLayer->outData[0].get()->getPrecision()))
-        //     return false;
+    auto isSutableParentConvolution = [&](MKLDNNNodePtr node) {
+        const auto conv = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(node);
+        if (conv == nullptr)
+            IE_THROW() << "Cannot cast to convolution node " << node->getName();
 
-        // if (!everyone_is(Precision::FP32, parentLayer->precision, childLayer->precision))
-        //     return false;
+        if (!conv->weightsZeroPoints.empty())
+            return false;
 
-    //     auto parentOutputPrecision = !parentNode->fusedWith.empty()
-    //             ? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
-    //             : parentNode->getCnnLayer()->outData[0].get()->getPrecision();
+        const auto &strides = conv->getStride();
+        bool isSupportedParams = conv->getGroupNum() == 1 &&
+                is1x1Convolution(conv) &&  // TODO [oneDNN] : fusing is permitted only with 1x1 convolutions
+                everyone_is(1, strides[strides.size() - 1], strides[strides.size() - 2]) &&
+                everyone_is(Precision::FP32, conv->getOriginalInputPrecisionAtPort(0), conv->getOriginalOutputPrecisionAtPort(0)) &&
+                node->getChildEdgeAt(0)->getDims().ndims() == 4;
+        if (!isSupportedParams) return false;
 
-    //     auto childOutputPrecision = !childNode->fusedWith.empty()
-    //             ? childNode->fusedWith[childNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
-    //             : childNode->getCnnLayer()->outData[0].get()->getPrecision();
+        return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild());
+    };
 
-        // if (!everyone_is(Precision::FP32, parentOutputPrecision, childOutputPrecision))
-        //     return false;
+    auto isSutableChildConvolution = [&](const MKLDNNNodePtr &parentNode, const MKLDNNNodePtr &childNode) {
+        const auto convChild = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(childNode);
+        if (convChild == nullptr)
+            IE_THROW() << "Cannot cast to convolution node " << childNode->getName();
 
-    //     auto* childConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(childNode.get());
-    //     if (childConvolutionNode == nullptr)
-    //         IE_THROW() << "Cannot get convolution node " << childNode->getName();
+        const auto convParent = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(parentNode);
+        if (convParent == nullptr)
+            IE_THROW() << "Cannot cast to convolution node " << parentNode->getName();
 
-    //     if (!childConvolutionNode->inputZeroPoints.empty() || !childConvolutionNode->weightsZeroPoints.empty())
-    //         return false;
+        if (!everyone_is(Precision::FP32, convParent->getOriginalOutputPrecisionAtPort(0), convChild->getOriginalInputPrecisionAtPort(0),
+                convChild->getOriginalOutputPrecisionAtPort(0)))
+            return false;
 
-    //     bool withBias = (childLayer->_biases != nullptr && childLayer->_biases->size() != 0) ||
-    //                     childConvolutionNode->getBaseIntputsNumber() == 3;
+        auto parentOutputPrecision = !parentNode->fusedWith.empty()
+                ? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)
+                : parentNode->getOriginalOutputPrecisionAtPort(0);
 
-    //     auto allPads = getPaddings(*childLayer);
+        auto childOutputPrecision = !childNode->fusedWith.empty()
+                ? childNode->fusedWith[childNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)
+                : childNode->getOriginalOutputPrecisionAtPort(0);
 
-    //     bool isSupportedParams = childLayer->_out_depth == childLayer->_group &&
-    //                              childLayer->_out_depth != 1 &&
-    //                              everyone_is(3, childLayer->_kernel[X_AXIS], childLayer->_kernel[Y_AXIS]) &&
-    //                              everyone_is(1, allPads.begin[X_AXIS], allPads.begin[Y_AXIS]) &&
-    //                              everyone_is(1, allPads.end[X_AXIS], allPads.end[Y_AXIS]) &&
-    //                              everyone_is(1, childLayer->_dilation[X_AXIS], childLayer->_dilation[Y_AXIS]) &&
-    //                              childLayer->_stride[X_AXIS] == childLayer->_stride[Y_AXIS] &&
-    //                              withBias &&
-    //                              one_of(childLayer->_stride[X_AXIS], 1, 2) &&
-    //                              childNode->getChildEdgeAt(0)->getDims().ndims() == 4;
+        if (!everyone_is(Precision::FP32, parentOutputPrecision, childOutputPrecision))
+            return false;
 
-    //     return isSupportedParams;
-    // };
+        if (!convChild->inputZeroPoints.empty() || !convChild->weightsZeroPoints.empty())
+            return false;
 
-    // auto isFusingWorthwhile = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
-    //     auto layer = std::dynamic_pointer_cast<ConvolutionLayer>(childNode->getCnnLayer());
-    //     if (layer == nullptr)
-    //         IE_THROW() << "Cannot get convolution layer " << childNode->getName();
+        bool withBias = convChild->getOriginalInputPrecisions().size() == 3;
+
+        const auto weightRank = convChild->getWeightDims().size();
+        const auto stridesSize = convChild->getStride().size();
+        bool isSupportedParams = convChild->outDims[0][1] == convChild->getGroupNum() &&
+                                 convChild->outDims[0][1] != 1 &&
+                                 everyone_is(3, convChild->getWeightDims()[weightRank - 1], convChild->getWeightDims()[weightRank - 2]) &&
+                                 everyone_is(1, convChild->getPaddingL()[stridesSize - 1], convChild->getPaddingL()[stridesSize - 2]) &&
+                                 everyone_is(1, convChild->getPaddingR()[stridesSize - 1], convChild->getPaddingR()[stridesSize - 2]) &&
+                                 everyone_is(1, convChild->getDilation()[stridesSize - 1] + 1, convChild->getDilation()[stridesSize - 2] + 1) &&
+                                 convChild->getStride()[stridesSize - 1] == convChild->getStride()[stridesSize - 2] &&
+                                 withBias &&
+                                 one_of(convChild->getStride()[stridesSize - 1], 1, 2) &&
+                                 childNode->getChildEdgeAt(0)->getDims().ndims() == 4;
+
+        return isSupportedParams;
+    };
 
-    //     auto inDims = childNode->inDims[0];
-    //     auto outDims = childNode->outDims[0];
-    //     int elemSize = layer->precision.size();
+    auto isFusingWorthwhile = [&](const MKLDNNNodePtr &parentNode, const MKLDNNNodePtr &childNode) {
+        auto inDims = childNode->inDims[0];
+        auto outDims = childNode->outDims[0];
+        int elemSize = childNode->getOriginalOutputPrecisionAtPort(0).size();
 
-    //     int L3_cache_size = utils::get_cache_size(3, false);
-    //     int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize;
-    //     int dw_conv_output_size = outDims[0] * outDims[1]* outDims[2] * outDims[3] * elemSize;
+        int L3_cache_size = utils::get_cache_size(3, false);
+        int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize;
+        int dw_conv_output_size = outDims[0] * outDims[1]* outDims[2] * outDims[3] * elemSize;
 
-    //     auto parentConvolutionNode = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(parentNode);
-    //     if (parentConvolutionNode == nullptr)
-    //         IE_THROW() << "Cannot get convolution node " << parentNode->getName();
+        auto parentConvolutionNode = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(parentNode);
+        if (parentConvolutionNode == nullptr)
+            IE_THROW() << "Cannot get convolution node " << parentNode->getName();
 
-    //     if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common))
-    //         return false;
+        if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common))
+            return false;
 
-    //     return (dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2);
-    // };
+        return (dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2);
+    };
 
-    // for (int i = 0; i < graphNodes.size(); i++) {
-    //     if (!isConvolutionNode(graphNodes[i])) continue;
+    for (int i = 0; i < graphNodes.size(); i++) {
+        if (!isConvolutionNode(graphNodes[i])) continue;
 
-    //     auto parentConvNode = graphNodes[i];
-    //     if (!isSutableParentConvolution(parentConvNode)) continue;
+        auto parentConvNode = graphNodes[i];
+        if (!isSutableParentConvolution(parentConvNode)) continue;
 
-    //     auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild();
-    //     if (!isSutableChildConvolution(parentConvNode, childConvNode)) continue;
+        auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild();
+        if (!isSutableChildConvolution(parentConvNode, childConvNode)) continue;
 
-    //     if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue;
+        if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue;
 
-    //     parentConvNode->fuseWith(childConvNode);
+        parentConvNode->addFusedNode(childConvNode);
 
-    //     for (auto node : childConvNode->getFusedWith())
-    //         parentConvNode->fuseWith(node);
-    //     childConvNode->clearFusedWith();
+        for (auto node : childConvNode->getFusedWith()) {
+            parentConvNode->addFusedNode(node);
+        }
+        childConvNode->clearFusedWith();
 
-    //     graph.DropDWConvNode(childConvNode);
-    // }
+        graph.DropDWConvNode(childConvNode);
+    }
 }
 
 // TODO: mandrono: unite with FuseConvolutionAndSimpleOperation
@@ -1039,7 +1024,12 @@ void MKLDNNGraphOptimizer::FusePoolingAndFakeQuantize(MKLDNNGraph &graph) {
     auto& graphNodes = graph.GetNodes();
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
-        return node->getType() == Pooling && node->getChildEdges().size() == 1 && node->getAlgorithm() == Algorithm::PoolingAvg;
+        if (node->getType() == Pooling) {
+            if (!one_of(node->getOriginalInputPrecisionAtPort(0), Precision::U8, Precision::I8))
+                return false;
+            return node->getChildEdges().size() == 1 && node->getAlgorithm() == Algorithm::PoolingAvg;
+        }
+        return false;
     };
 
     auto isSutableChildNode = [](MKLDNNNodePtr node) {

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
@@ -78,6 +78,7 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
         { "Mod", Eltwise },
         { "FloorMod", Eltwise },
         { "Power", Eltwise },
+        { "PowerStatic", Eltwise },
         { "Equal", Eltwise },
         { "NotEqual", Eltwise },
         { "Greater", Eltwise },
@@ -89,6 +90,7 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
         { "LogicalXor", Eltwise },
         { "LogicalNot", Eltwise },
         { "Relu", Eltwise },
+        { "LeakyRelu", Eltwise },
         { "Gelu", Eltwise },
         { "Elu", Eltwise },
         { "Tanh", Eltwise },
@@ -222,7 +224,8 @@ MKLDNNNode::MKLDNNNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::en
     }
 
     for (size_t i = 0; i < op->get_input_size(); i++) {
-        inDims.emplace_back(op->get_input_shape(i));
+        const auto &shape = op->get_input_shape(i);
+        inDims.emplace_back(ngraph::is_scalar(shape) ? ngraph::Shape{1} : shape);
         originalInputPrecisions.emplace_back(details::convertPrecision(op->get_input_element_type(i)));
     }
 
@@ -231,7 +234,8 @@ MKLDNNNode::MKLDNNNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::en
             IE_THROW() << "Node with type '" << typeStr << "' and name '" << name << "' does not have any outputs.";
         }
         for (size_t i = 0; i < op->get_output_size(); i++) {
-            outDims.emplace_back(op->get_output_shape(i));
+            const auto &shape = op->get_output_shape(i);
+            outDims.emplace_back(ngraph::is_scalar(shape) ? ngraph::Shape{1} : shape);
             originalOutputPrecisions.emplace_back(details::convertPrecision(op->get_output_element_type(i)));
         }
     }

diff --git a/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_to_cpu_specific_opset.hpp b/inference-engine/src/mkldnn_plugin/ngraph_transformations/convert_to_cpu_specific_opset.hpp
@@ -10,6 +10,8 @@
 #include "convert_broadcast_to_tiles.hpp"
 #include "convert_tile_to_seq_tiles.hpp"
 #include "reshape_1d_ops.hpp"
+#include "convert_to_power_static.hpp"
+#include "convert_to_leaky_relu.hpp"
 
 namespace MKLDNNPlugin {
 
@@ -25,6 +27,8 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr<ngraph::Function> &nGraphF
     manager.register_pass<ConvertMatMulToGemm>();
     manager.register_pass<FullyConnectedBiasFusion>();
     manager.register_pass<ReshapeFullyConnected>();
+    manager.register_pass<ConvertToPowerStatic>();
+    manager.register_pass<ConvertToLeakyRelu>();
     if (!ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc)) {
         manager.register_pass<ReshapeFullyConnectedFusion>();
     }