Skip to content

Commit

Permalink
Performance problems fixes. Part 2 (openvinotoolkit#50)
Browse files Browse the repository at this point in the history
* Performance problems fixes. Part 2

* additional fixes

* dw fixes

* int8 pooling fusing fix

* moved transformation to ngraph

* [CPU] Select node migration on nGraph

* [CPU] DepthToSpace nodes migration on nGraph

* [CPU] SpaceToDepth nodes migration on nGraph

* added check that op is supported
  • Loading branch information
Maxim Andronov authored Apr 19, 2021
1 parent 5f4ad55 commit 0849f6c
Show file tree
Hide file tree
Showing 27 changed files with 902 additions and 416 deletions.
6 changes: 3 additions & 3 deletions inference-engine/src/mkldnn_plugin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/ctc_greedy_decoder.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/ctc_greedy_decoder_seq_len.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/ctc_loss.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/depth_to_space.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/depth_to_space.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/detectionoutput.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/detectionoutput_onnx.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/embedding_bag_offset_sum.cpp
Expand Down Expand Up @@ -83,11 +83,11 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/reorg_yolo.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/reverse_sequence.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/roifeatureextractor_onnx.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/select.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/select.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/shuffle_channels.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/simplernms.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/space_to_batch.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/space_to_depth.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/space_to_depth.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/sparse_fill_empty_rows.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/sparse_segment_reduce.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/sparse_weighted_reduce.cpp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "jit_eltwise_emitters.hpp"
#include <cpu/x64/jit_uni_eltwise.hpp>
#include <ngraph/opsets/opset1.hpp>
#include <nodes/mkldnn_eltwise_node.h>

using namespace InferenceEngine;
using namespace mkldnn::impl::utils;
Expand Down Expand Up @@ -1303,13 +1304,16 @@ jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_

prepare_table();
}

/**
 * Builds a PowerStatic emitter from a graph node.
 *
 * The node is expected to be an MKLDNNEltwiseNode: its alpha, beta and gamma
 * values are taken as the emitter's power, scale and shift respectively, after
 * which the constants table is prepared.
 *
 * Raises an exception through IE_THROW when the node cannot be cast to
 * MKLDNNEltwiseNode.
 */
jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
    : jit_emitter(host, host_isa, node, exec_prc) {
    // NOTE: the former unconditional "[NM] Not implemented" throw was removed here;
    // it made the whole initialisation below unreachable.
    const auto *powerNode = dynamic_cast<const MKLDNNEltwiseNode *>(node);
    if (powerNode == nullptr) {
        IE_THROW() << "Can't cast to MKLDNNEltwiseNode";
    }

    power = powerNode->getAlpha();
    scale = powerNode->getBeta();
    shift = powerNode->getGamma();

    prepare_table();
}
Expand Down
202 changes: 96 additions & 106 deletions inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,8 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
graph.SortTopologically();
graph.RemoveDroppedEdges();

// TODO [NM]: transformation should be implemented w/o using of CNNLayer
// FuseConvolutionAndDWConvolution(graph);
// graph.RemoveDroppedNodes();
FuseConvolutionAndDWConvolution(graph);
graph.RemoveDroppedNodes();

FuseBinaryConvolutionAndFakeQuantize(graph);
graph.RemoveDroppedNodes();
Expand Down Expand Up @@ -777,134 +776,120 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
}

/**
 * Fuses a convolution with the depthwise convolution that directly follows it.
 *
 * For every Convolution node in the graph the pass checks, in order:
 *  - the parent: no weights zero points, group number 1, 1x1 kernel (last two
 *    weight dims), unit strides in the last two dims, FP32 original input and
 *    output precision, 4D output and exactly one child that is a Convolution;
 *  - the child: FP32 precisions, no input/weights zero points, three original
 *    inputs (treated as "with bias"), output channels equal to the group number
 *    and different from 1, 3x3 kernel, paddings of 1, stored dilation of 0,
 *    equal strides of 1 or 2 and 4D output;
 *  - that fusing is worthwhile: avx2 is available, avx512_common is not, and the
 *    child's input plus output size exceeds half of the L3 cache size.
 *
 * When all checks pass, the child (and every node already fused into it) is
 * attached to the parent as a fused node and the child is dropped from the graph.
 *
 * Raises an exception through IE_THROW when a node of type Convolution cannot be
 * cast to MKLDNNConvolutionNode.
 */
void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
    auto& graphNodes = graph.GetNodes();

    auto isConvolutionNode = [](const MKLDNNNodePtr &node) {
        return node->getType() == Convolution;
    };

    // A convolution is 1x1 when the two innermost weight dimensions are both 1.
    auto is1x1Convolution = [](const std::shared_ptr<MKLDNNConvolutionNode> &conv) {
        const auto &weightDims = conv->getWeightDims();
        const auto weightRank = weightDims.size();
        return weightDims[weightRank - 1] == 1 && weightDims[weightRank - 2] == 1;
    };

    auto isSutableParentConvolution = [&](MKLDNNNodePtr node) {
        const auto conv = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(node);
        if (conv == nullptr)
            IE_THROW() << "Cannot cast to convolution node " << node->getName();

        if (!conv->weightsZeroPoints.empty())
            return false;

        const auto &strides = conv->getStride();
        const bool isSupportedParams = conv->getGroupNum() == 1 &&
                is1x1Convolution(conv) &&  // TODO [oneDNN] : fusing is permitted only with 1x1 convolutions
                everyone_is(1, strides[strides.size() - 1], strides[strides.size() - 2]) &&
                everyone_is(Precision::FP32, conv->getOriginalInputPrecisionAtPort(0), conv->getOriginalOutputPrecisionAtPort(0)) &&
                node->getChildEdgeAt(0)->getDims().ndims() == 4;
        if (!isSupportedParams)
            return false;

        // The parent must feed exactly one consumer, and that consumer must be a convolution.
        return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild());
    };

    auto isSutableChildConvolution = [&](const MKLDNNNodePtr &parentNode, const MKLDNNNodePtr &childNode) {
        const auto convChild = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(childNode);
        if (convChild == nullptr)
            IE_THROW() << "Cannot cast to convolution node " << childNode->getName();

        const auto convParent = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(parentNode);
        if (convParent == nullptr)
            IE_THROW() << "Cannot cast to convolution node " << parentNode->getName();

        if (!everyone_is(Precision::FP32, convParent->getOriginalOutputPrecisionAtPort(0), convChild->getOriginalInputPrecisionAtPort(0),
                         convChild->getOriginalOutputPrecisionAtPort(0)))
            return false;

        // The effective output precision is that of the last fused node, if there is one.
        auto parentOutputPrecision = !parentNode->fusedWith.empty()
                ? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)
                : parentNode->getOriginalOutputPrecisionAtPort(0);

        auto childOutputPrecision = !childNode->fusedWith.empty()
                ? childNode->fusedWith[childNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)
                : childNode->getOriginalOutputPrecisionAtPort(0);

        if (!everyone_is(Precision::FP32, parentOutputPrecision, childOutputPrecision))
            return false;

        if (!convChild->inputZeroPoints.empty() || !convChild->weightsZeroPoints.empty())
            return false;

        // Three original inputs are interpreted as data + weights + bias.
        const bool withBias = convChild->getOriginalInputPrecisions().size() == 3;

        const auto &weightDims = convChild->getWeightDims();
        const auto &strides = convChild->getStride();
        const auto &paddingL = convChild->getPaddingL();
        const auto &paddingR = convChild->getPaddingR();
        const auto &dilation = convChild->getDilation();
        const auto weightRank = weightDims.size();
        // Paddings and dilations are indexed with the strides' rank, as in the checks below.
        const auto stridesSize = strides.size();

        const bool isSupportedParams = convChild->outDims[0][1] == convChild->getGroupNum() &&
                convChild->outDims[0][1] != 1 &&
                everyone_is(3, weightDims[weightRank - 1], weightDims[weightRank - 2]) &&
                everyone_is(1, paddingL[stridesSize - 1], paddingL[stridesSize - 2]) &&
                everyone_is(1, paddingR[stridesSize - 1], paddingR[stridesSize - 2]) &&
                everyone_is(1, dilation[stridesSize - 1] + 1, dilation[stridesSize - 2] + 1) &&
                strides[stridesSize - 1] == strides[stridesSize - 2] &&
                withBias &&
                one_of(strides[stridesSize - 1], 1, 2) &&
                childNode->getChildEdgeAt(0)->getDims().ndims() == 4;

        return isSupportedParams;
    };

    auto isFusingWorthwhile = [&](const MKLDNNNodePtr &parentNode, const MKLDNNNodePtr &childNode) {
        auto parentConvolutionNode = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(parentNode);
        if (parentConvolutionNode == nullptr)
            IE_THROW() << "Cannot get convolution node " << parentNode->getName();

        // Fusing is only attempted on avx2 machines that do not support avx512_common.
        if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common))
            return false;

        auto inDims = childNode->inDims[0];
        auto outDims = childNode->outDims[0];
        const size_t elemSize = childNode->getOriginalOutputPrecisionAtPort(0).size();

        // Sizes are accumulated in size_t so that large tensors are not truncated
        // (the previous int arithmetic could wrap and wrongly reject the fusion).
        const size_t L3_cache_size = static_cast<size_t>(utils::get_cache_size(3, false));
        const size_t dw_conv_input_size = static_cast<size_t>(inDims[0]) * static_cast<size_t>(inDims[1]) *
                                          static_cast<size_t>(inDims[2]) * static_cast<size_t>(inDims[3]) * elemSize;
        const size_t dw_conv_output_size = static_cast<size_t>(outDims[0]) * static_cast<size_t>(outDims[1]) *
                                           static_cast<size_t>(outDims[2]) * static_cast<size_t>(outDims[3]) * elemSize;

        return dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2;
    };

    // Index-based loop on purpose: the graph is modified while it is being walked.
    for (size_t i = 0; i < graphNodes.size(); i++) {
        if (!isConvolutionNode(graphNodes[i])) continue;

        auto parentConvNode = graphNodes[i];
        if (!isSutableParentConvolution(parentConvNode)) continue;

        auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild();
        if (!isSutableChildConvolution(parentConvNode, childConvNode)) continue;

        if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue;

        parentConvNode->addFusedNode(childConvNode);

        // Everything that was already fused into the child moves to the parent as well.
        for (auto node : childConvNode->getFusedWith()) {
            parentConvNode->addFusedNode(node);
        }
        childConvNode->clearFusedWith();

        graph.DropDWConvNode(childConvNode);
    }
}

// TODO: mandrono: unite with FuseConvolutionAndSimpleOperation
Expand Down Expand Up @@ -1039,7 +1024,12 @@ void MKLDNNGraphOptimizer::FusePoolingAndFakeQuantize(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();

// A parent is suitable when it is an average Pooling node with U8 or I8 input
// precision and exactly one child edge.
// NOTE: the former unconditional return in front of these checks was removed;
// it made the input-precision check unreachable.
auto isSutableParentNode = [](MKLDNNNodePtr node) {
    if (node->getType() != Pooling)
        return false;
    if (!one_of(node->getOriginalInputPrecisionAtPort(0), Precision::U8, Precision::I8))
        return false;
    return node->getChildEdges().size() == 1 && node->getAlgorithm() == Algorithm::PoolingAvg;
};

auto isSutableChildNode = [](MKLDNNNodePtr node) {
Expand Down
8 changes: 6 additions & 2 deletions inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
{ "Mod", Eltwise },
{ "FloorMod", Eltwise },
{ "Power", Eltwise },
{ "PowerStatic", Eltwise },
{ "Equal", Eltwise },
{ "NotEqual", Eltwise },
{ "Greater", Eltwise },
Expand All @@ -89,6 +90,7 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
{ "LogicalXor", Eltwise },
{ "LogicalNot", Eltwise },
{ "Relu", Eltwise },
{ "LeakyRelu", Eltwise },
{ "Gelu", Eltwise },
{ "Elu", Eltwise },
{ "Tanh", Eltwise },
Expand Down Expand Up @@ -222,7 +224,8 @@ MKLDNNNode::MKLDNNNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::en
}

for (size_t i = 0; i < op->get_input_size(); i++) {
inDims.emplace_back(op->get_input_shape(i));
const auto &shape = op->get_input_shape(i);
inDims.emplace_back(ngraph::is_scalar(shape) ? ngraph::Shape{1} : shape);
originalInputPrecisions.emplace_back(details::convertPrecision(op->get_input_element_type(i)));
}

Expand All @@ -231,7 +234,8 @@ MKLDNNNode::MKLDNNNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::en
IE_THROW() << "Node with type '" << typeStr << "' and name '" << name << "' does not have any outputs.";
}
for (size_t i = 0; i < op->get_output_size(); i++) {
outDims.emplace_back(op->get_output_shape(i));
const auto &shape = op->get_output_shape(i);
outDims.emplace_back(ngraph::is_scalar(shape) ? ngraph::Shape{1} : shape);
originalOutputPrecisions.emplace_back(details::convertPrecision(op->get_output_element_type(i)));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include "convert_broadcast_to_tiles.hpp"
#include "convert_tile_to_seq_tiles.hpp"
#include "reshape_1d_ops.hpp"
#include "convert_to_power_static.hpp"
#include "convert_to_leaky_relu.hpp"

namespace MKLDNNPlugin {

Expand All @@ -25,6 +27,8 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr<ngraph::Function> &nGraphF
manager.register_pass<ConvertMatMulToGemm>();
manager.register_pass<FullyConnectedBiasFusion>();
manager.register_pass<ReshapeFullyConnected>();
manager.register_pass<ConvertToPowerStatic>();
manager.register_pass<ConvertToLeakyRelu>();
if (!ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(nGraphFunc)) {
manager.register_pass<ReshapeFullyConnectedFusion>();
}
Expand Down
Loading

0 comments on commit 0849f6c

Please sign in to comment.