diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 4004c806e9d28f..ec79ae4f69e045 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -2356,6 +2356,117 @@ bool GraphOptimizer::checkAscendingSummaryOrder(const VectorDims& transposeOrder
     return true;
 }
 
+// Replaces a Transpose -> [Reshape] -> Reorder chain (or, when reverseOrder is true, the mirrored
+// Reorder -> [Reshape] -> Transpose chain) with a single "_fake" Reorder; reshapeNode may be null.
+// When the input and output precisions differ, a second Reorder is appended to do the conversion.
+// Throws if no edge links the nodes surrounding the chain once it has been dropped, or if
+// transposeNode is not a Transpose node (checked in the direct order only).
+void GraphOptimizer::mergeTransposeReshapeReorder(Graph& graph,
+                                                  const NodePtr& transposeNode,
+                                                  const NodePtr& reshapeNode,
+                                                  const NodePtr& reorderNode,
+                                                  const bool reverseOrder) {
+    const auto& parentNode = reverseOrder ? reorderNode : transposeNode;
+    const auto& childNode = reverseOrder ? transposeNode : reorderNode;
+    auto nodeBeforeSequence = parentNode->getParentEdgesAtPort(0)[0]->getParent();
+    auto nodeAfterSequence = childNode->getChildEdgeAt(0)->getChild();
+
+    // Detaches the edge feeding input port idx of node; its producer is removed once it has no child edges left
+    auto removeInputEdge = [&](const NodePtr& node, const size_t idx) {
+        auto remEdge = node->getParentEdgesAtPort(idx)[0];
+        auto edgeParent = remEdge->getParent();
+        remEdge->drop();
+        auto& edges = graph.GetEdges();
+        for (auto it = edges.begin(); it != edges.end(); ++it) {
+            if ((*it) == remEdge) {
+                edges.erase(it);
+                if (edgeParent->getChildEdges().empty())
+                    edgeParent->remove();
+                break;
+            }
+        }
+    };
+
+    // The edge at input port 1 of Transpose (and of Reshape, if present) is detached before the nodes are dropped
+    removeInputEdge(transposeNode, 1);
+    if (reshapeNode)
+        removeInputEdge(reshapeNode, 1);
+
+    // to prevent inPlace conflict we must check that the memory reference is unidirectional or
+    // inPlace memory is not used
+    const auto parentInPlace = parentNode->getParentEdgeAt(0)->inPlace(Edge::LOOK_UP);
+    const auto& childEdges = childNode->getChildEdgesAtPort(0);
+    const auto childInPlace = std::any_of(childEdges.begin(), childEdges.end(), [](const EdgePtr& edge) {
+        return edge->inPlace(Edge::LOOK_DOWN);
+    });
+    bool isOptimized = !(parentInPlace && childInPlace);
+
+    graph.DropNode(transposeNode);
+    graph.DropNode(reorderNode);
+    if (reshapeNode)
+        graph.DropNode(reshapeNode);
+
+    auto inDesc = parentNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc();
+    auto outDesc = childNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc();
+
+    auto inPrec = inDesc->getPrecision();
+    auto outPrec = outDesc->getPrecision();
+
+    // The first Reorder keeps the input precision; a precision change, if needed, is done by a second Reorder below
+    auto reorderInDesc = inDesc;
+    auto reorderOutDesc = outDesc->cloneWithNewPrecision(inPrec);
+
+    // Find the edge from nodeBeforeSequence to nodeAfterSequence; the new Reorder is inserted on it
+    EdgePtr edge;
+    for (auto& childEdge : nodeBeforeSequence->getChildEdges()) {
+        auto candidate = childEdge.lock();
+        if (candidate && candidate->getChild() == nodeAfterSequence) {
+            edge = candidate;
+            break;
+        }
+    }
+    if (!edge) {
+        IE_THROW() << "Parent node '" << parentNode->getName() << "' has invalid edges.";
+    }
+
+    // Transpose supports blocked input & non-blocked output; in that case the Reorder after Transpose cannot be optimized
+    std::vector<int> srcPerm;
+    if (!reverseOrder) {
+        auto* castedTranspose = dynamic_cast<Transpose*>(transposeNode.get());
+        if (castedTranspose == nullptr) {
+            // castedTranspose is null in this branch, so the message has to be built from transposeNode
+            IE_THROW() << "[CPU] parent node of type:" << transposeNode->getTypeStr()
+                       << " with name: " << transposeNode->getName() << " is not a transpose node";
+        }
+
+        const auto& inOrder = castedTranspose->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as<BlockedMemoryDesc>()->getOrder();
+        if (inOrder.size() > reorderOutDesc->as<BlockedMemoryDesc>()->getOrder().size()) {
+            isOptimized = false;
+            // inDesc should be permuted before calling reorder: srcPerm is the inverse of the Transpose order
+            auto& ord = castedTranspose->getOrder();
+            srcPerm = std::vector<int>(ord.size());
+            for (size_t i = 0; i < ord.size(); i++) {
+                srcPerm[ord[i]] = static_cast<int>(i);
+            }
+        }
+    }
+
+    std::string reorderlayerName = nodeBeforeSequence->getName() + "_" + Reorder::getReorderArgs(*reorderInDesc, *reorderOutDesc) + "_fake";
+    auto newReorderNode = graph.InsertReorder(edge, reorderlayerName, *reorderInDesc, *reorderOutDesc, isOptimized, srcPerm);
+
+    // If precisions don't match, another reorder must be inserted to perform conversion
+    if (inPrec != outPrec) {
+        auto reorderInDesc2 = reorderOutDesc;
+        auto reorderOutDesc2 = outDesc;
+
+        std::string reorderLayerName2 = newReorderNode->getName() + "_" +
+                                        Reorder::getReorderArgs(*reorderInDesc2, *reorderOutDesc2) + "_" +
+                                        nodeAfterSequence->getName();
+
+        graph.InsertReorder(newReorderNode->getChildEdgeAt(0), reorderLayerName2, *reorderInDesc2, *reorderOutDesc2, false);
+    }
+}
+
 void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) {
     auto& graphNodes = graph.GetNodes();
 
@@ -2447,114 +2558,6 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) {
         return transformedOrder;
     };
 
-    auto removeInputEdge = [&](const NodePtr& node, const size_t idx) {
-        auto remEdge = node->getParentEdgesAtPort(idx)[0];
-        auto parentNode = remEdge->getParent();
-        remEdge->drop();
-        auto& edges = graph.GetEdges();
-        for (auto it = edges.begin(); it != edges.end(); it++) {
-            if ((*it) == remEdge) {
-                edges.erase(it);
-                if (parentNode->getChildEdges().empty())
-                    parentNode->remove();
-                break;
-            }
-        }
-    };
-
-    // Transpose and Reorder do opposite permutation to each other.
-    // Example:
-    // chain [physical layout: NCHW, logical layout: NCHW] -> Transpose(order=0312) -> [physical layout: NWCH, logical layout: NCHW] ->
-    // Reorder(nchw->nhwc) -> [physical layout: NCHW, logical layout: NHWC] can be replaced with Reorder(nchw->nhwc; isOptimized=true)
-    // which will just reinterprets layout without physical change of the memory.
-    // Two cases are possible:
-    // 1) inPrec = outPrec
-    // In this case, we replace Transpose+Reorder pattern with a new Reorder that does nothing.
-    // 2) inPrec != outPrec
-    // As in the first case, we also replace Transpose+Reorder pattern with a new Reorder.
- // Additionally, we insert another Reorder that performs the conversion from the input precision (inPrec) - // to the output precision (outPrec) - auto mergeTransposeAndReorder = [&](const NodePtr& transposeNode, const NodePtr& reshapeNode, const NodePtr& reorderNode) { - auto transposeParentNode = transposeNode->getParentEdgesAtPort(0)[0]->getParent(); - auto reorderChildNode = reorderNode->getChildEdgeAt(0)->getChild(); - - removeInputEdge(transposeNode, 1); - if (reshapeNode) - removeInputEdge(reshapeNode, 1); - - // to prevent inPlace conflict we must check that the memory reference is unidirectional or - // inPlace memory is not used - const auto parentInPlace = transposeNode->getParentEdgeAt(0)->inPlace(Edge::LOOK_UP); - const auto& childEdges = reorderNode->getChildEdgesAtPort(0); - const auto childInPlace = std::any_of(childEdges.begin(), childEdges.end(), - [](const EdgePtr& edge){ return edge->inPlace(Edge::LOOK_DOWN); }); - bool isOptimized = !(parentInPlace && childInPlace); - - graph.DropNode(transposeNode); - graph.DropNode(reorderNode); - if (reshapeNode) - graph.DropNode(reshapeNode); - - auto inDesc = transposeNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc(); - auto outDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc(); - - auto inPrec = inDesc->getPrecision(); - auto outPrec = outDesc->getPrecision(); - - auto reorderInDesc = inDesc; - auto reorderOutDesc = outDesc->cloneWithNewPrecision(inPrec); - - std::string reorderlayerName = transposeParentNode->getName() + "_" + - Reorder::getReorderArgs(*reorderInDesc, *reorderOutDesc) + "_fake"; - - EdgePtr edge; - for (auto &childEdge : transposeParentNode->getChildEdges()) { - if (childEdge.lock()->getChild() == reorderChildNode) { - edge = childEdge.lock(); - break; - } - } - if (!edge) { - IE_THROW() << "Transpose node '" << transposeNode->getName() << "' has invalid edges."; - } - - std::vector srcPerm; - auto configReorder = [&]() 
{ - // transposeNode support blocked input & non-blocked output, in the case, the reorder cannot be optimized - auto* castedTranspose = dynamic_cast(transposeNode.get()); - if (castedTranspose == nullptr) { - IE_THROW() << "[CPU] parent node of type:" << castedTranspose->getTypeStr() << " with name: " - << castedTranspose->getName() << " is not a transpose node"; - } - - const auto& inOrder = castedTranspose->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as()->getOrder(); - if (inOrder.size() > reorderOutDesc->as()->getOrder().size()) { - isOptimized = false; - // inDesc should be permuted before calling reorder - auto& ord = castedTranspose->getOrder(); - srcPerm = std::vector(ord.size()); - for (size_t i = 0; i < ord.size(); i++) { - srcPerm[ord[i]] = i; - } - } - }; - - configReorder(); - - auto newReorderNode = graph.InsertReorder(edge, reorderlayerName, *reorderInDesc, *reorderOutDesc, isOptimized, srcPerm); - - // case 2 - if (inPrec != outPrec) { - auto reorderInDesc2 = reorderOutDesc; - auto reorderOutDesc2 = outDesc; - - std::string reorderLayerName2 = newReorderNode->getName() + "_" + - Reorder::getReorderArgs(*reorderInDesc2, *reorderOutDesc2) + "_" + reorderChildNode->getName(); - - graph.InsertReorder(newReorderNode->getChildEdgeAt(0), reorderLayerName2, *reorderInDesc2, *reorderOutDesc2, false); - } - }; - for (size_t i = 0; i < graphNodes.size(); i++) { auto parentNode = graphNodes[i]; if (!isSuitableTranspose(parentNode)) { @@ -2596,7 +2588,7 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) { auto& outOrder = outBlockedDesc->getOrder(); if (checkAscendingSummaryOrder(transposeOrder, layoutOrder, inOrder, outOrder)) { - mergeTransposeAndReorder(transposeNode, reshapeNode, reorderNode); + mergeTransposeReshapeReorder(graph, transposeNode, reshapeNode, reorderNode, false); } } } @@ -2671,85 +2663,6 @@ void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { return transformedOrder; }; - auto removeInputEdge = 
[&](const NodePtr& node, const size_t idx) { - auto remEdge = node->getParentEdgesAtPort(idx)[0]; - auto parentNode = remEdge->getParent(); - remEdge->drop(); - auto& edges = graph.GetEdges(); - for (auto it = edges.begin(); it != edges.end(); it++) { - if ((*it) == remEdge) { - edges.erase(it); - if (parentNode->getChildEdges().empty()) - parentNode->remove(); - break; - } - } - }; - - // Merge Reorder and Transpose which do opposite permutation to each other. - // Two cases are possible: - // 1) inPrec = outPrec - // In this case, we replace Reorder+Transpose pattern with a new Reorder that does nothing. - // 2) inPrec != outPrec - // As in the first case, we also replace Reorder+Transpose pattern with a new Reorder. - // Additionally, we insert another Reorder that performs the conversion from the input precision (inPrec) - // to the output precision (outPrec) - auto mergeTransposeAndReorder = [&](const NodePtr& transposeNode, const NodePtr& reshapeNode, const NodePtr& reorderNode) { - auto reorderParentNode = reorderNode->getParentEdgesAtPort(0)[0]->getParent(); - auto transposeChildNode = transposeNode->getChildEdgeAt(0)->getChild(); - - removeInputEdge(transposeNode, 1); - if (reshapeNode) - removeInputEdge(reshapeNode, 1); - - // to prevent inPlace conflict we must check that the memory reference is unidirectional or - // inPlace memory is not used - const auto parentInPlace = reorderNode->getParentEdgeAt(0)->inPlace(Edge::LOOK_UP); - const auto& childEdges = transposeNode->getChildEdgesAtPort(0); - const auto childInPlace = std::any_of(childEdges.begin(), childEdges.end(), - [](const EdgePtr& edge){ return edge->inPlace(Edge::LOOK_DOWN); }); - bool isOptimized = !(parentInPlace && childInPlace); - - graph.DropNode(reorderNode); - graph.DropNode(transposeNode); - if (reshapeNode) - graph.DropNode(reshapeNode); - - auto inDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc(); - auto outDesc = 
transposeNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc(); - - auto inPrec = inDesc->getPrecision(); - auto outPrec = outDesc->getPrecision(); - - auto reorderInDesc = inDesc; - auto reorderOutDesc = outDesc->cloneWithNewPrecision(inPrec); - - std::string reorderlayerName = reorderParentNode->getName() + "_" + - Reorder::getReorderArgs(*reorderInDesc, *reorderOutDesc) + "_fake"; - - EdgePtr edge; - for (auto &childEdge : reorderParentNode->getChildEdges()) { - if (childEdge.lock()->getChild() == transposeChildNode) { - edge = childEdge.lock(); - break; - } - } - if (!edge) { - IE_THROW() << "Transpose node '" << transposeNode->getName() << "' has invalid edges."; - } - - auto newReorderNode = graph.InsertReorder(edge, reorderlayerName, *reorderInDesc, *reorderOutDesc, isOptimized); - if (inPrec != outPrec) { - auto reorderInDesc2 = reorderOutDesc; - auto reorderOutDesc2 = outDesc; - - std::string reorderLayerName2 = newReorderNode->getName() + "_" + - Reorder::getReorderArgs(*reorderInDesc2, *reorderOutDesc2) + "_" + transposeChildNode->getName(); - - graph.InsertReorder(newReorderNode->getChildEdgeAt(0), reorderLayerName2, *reorderInDesc2, *reorderOutDesc2, false); - } - }; - for (size_t i = 0; i < graphNodes.size(); i++) { auto parentNode = graphNodes[i]; if (!isSuitableReorder(parentNode)) { @@ -2791,7 +2704,7 @@ void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { auto& outOrder = outBlockedDesc->getOrder(); if (checkAscendingSummaryOrder(transposeOrder, layoutOrder, inOrder, outOrder)) { - mergeTransposeAndReorder(transposeNode, reshapeNode, reorderNode); + mergeTransposeReshapeReorder(graph, transposeNode, reshapeNode, reorderNode, true); } } } diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h index a515ae13751afa..55acfc987c3e72 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.h +++ b/src/plugins/intel_cpu/src/graph_optimizer.h @@ -52,11 +52,25 @@ class 
GraphOptimizer { void RemoveSameConvert(Graph &graph); // Method checks that after the sequential execution of Transpose and Reorder nodes, - // the order of the elements in the memory will not change. + // the order of the elements in the memory (physical layout) will not change. bool checkAscendingSummaryOrder(const VectorDims& transposeOrder, const VectorDims& layoutOrder, const VectorDims& reorderInOrder, const VectorDims& reorderOutOrder); + // Method merges Transpose -> Reshape(optional) -> Reorder sequences which do opposite permutations to each other. + // The reverse order, Reorder -> Reshape(optional) -> Transpose, is supported too. + // Reshape support has the following limitations: + // - direct order: only a reshape which splits one of the dimensions into 2 consecutive ones is supported + // - reverse order: only a reshape which merges 2 consecutive dimensions into one is supported + // Example: + // chain [physical layout: NCHW, logical layout: NCHW] -> Transpose(order=0312) -> [physical layout: NWCH, logical layout: NCHW] -> + // Reorder(nchw->nhwc) -> [physical layout: NCHW, logical layout: NHWC] can be replaced with Reorder(nchw->nhwc; isOptimized=true) + // which will just reinterpret the layout without physical change of the memory. + void mergeTransposeReshapeReorder(Graph& graph, + const NodePtr& transposeNode, + const NodePtr& reshapeNode, + const NodePtr& reorderNode, + const bool reverseOrder); }; } // namespace intel_cpu