[CPU] Extend Concat node logic to avoid fallback on slow ref implementation. (openvinotoolkit#4129)
yekruglov · Jun 7, 2021 · 2b702e4 · 2b702e4
1 parent 0ff1e47
commit 2b702e4
Show file tree

Hide file tree

Showing 7 changed files with 395 additions and 323 deletions.
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
@@ -134,6 +134,16 @@ PartialBlkDesc PartialBlkDesc::makeCBlocked(const InferenceEngine::SizeVector &d
     return res;
 }
 
+
+PartialBlkDesc PartialBlkDesc::makeTailC(const InferenceEngine::SizeVector &dims) {
+    // Begin with the descriptor makePlain() produces for these dims.
+    PartialBlkDesc desc = makePlain(dims);
+
+    // With two or fewer dims there is nothing behind the second axis to reorder.
+    if (dims.size() <= 2)
+        return desc;
+
+    // Left-rotate the order entries that follow the first one by a single position,
+    // so the second entry (the "C" of the function name, presumably channels) ends up last.
+    const auto second = desc.outer_order.begin() + 1;
+    std::rotate(second, second + 1, desc.outer_order.end());
+    return desc;
+}
+
 PartialBlkDesc PartialBlkDesc::extractFrom(const InferenceEngine::TensorDesc &desc) {
     if (desc.getLayout() == InferenceEngine::ANY)
         IE_THROW() << "Cannot extract partial blocked descriptor for `ANY` layout";

diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
@@ -59,6 +59,9 @@ class PartialBlkDesc {
     /** Construct blocked Channel PartialBlkDesc based on dims information */
     static PartialBlkDesc makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size);
 
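+    /** Construct channels-last ("tail C") PartialBlkDesc based on dims information: the second dimension is moved to the innermost position; dims of rank <= 2 keep the plain order */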
+    static PartialBlkDesc makeTailC(const InferenceEngine::SizeVector &dims);
+
     /** Compare operators. Allow to use it as key for std::map */
     bool operator == (const PartialBlkDesc& it) const;
     bool operator < (const PartialBlkDesc& it) const;

diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h
@@ -30,8 +30,10 @@ class MKLDNNConcatNode : public MKLDNNNode {
 
 private:
     size_t axis = 0;
+    bool canOptimizeNspc = false;
 
     size_t inverseOrder(const InferenceEngine::SizeVector& order, size_t axis);
+    void execNspcSpecCase();
 
     InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::FP32;
     InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;

diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shuffle_channels_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shuffle_channels_node.cpp
@@ -94,11 +94,15 @@ void MKLDNNShuffleChannelsNode::initSupportedPrimitiveDescriptors() {
         impl_type = impl_desc_type::ref;
     }
 
-    addSupportedPrimDesc({{TensorDescCreatorTypes::nspc, precision}},
-                         {{TensorDescCreatorTypes::nspc, precision}},
+    // use ncsp as default for non-quantized networks and nspc for quantized
+    auto firstCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::nspc : TensorDescCreatorTypes::ncsp;
+    auto secondCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::ncsp : TensorDescCreatorTypes::nspc;
+
+    addSupportedPrimDesc({{firstCreatorType, precision}},
+                         {{firstCreatorType, precision}},
                          impl_type, supportDynamicBatch_);
-    addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}},
-                         {{TensorDescCreatorTypes::ncsp, precision}},
+    addSupportedPrimDesc({{secondCreatorType, precision}},
+                         {{secondCreatorType, precision}},
                          impl_type, supportDynamicBatch_);
     // canUseBlocked
     if (axis_ != 1) {

diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/concat.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/concat.cpp
@@ -0,0 +1,214 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ngraph_functions/builders.hpp"
+#include "test_utils/cpu_test_utils.hpp"
+
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+
+namespace CPULayerTestsDefinitions {
+
+// Parameter tuple consumed by ConcatLayerCPUTest.
+using concatCPUTestParams = std::tuple<
+        size_t,                            // Concat axis
+        std::vector<std::vector<size_t>>,  // Input shapes
+        InferenceEngine::Precision,        // Network precision
+        std::string,                       // Device name
+        CPUSpecificParams                  // {input formats, output formats, priority, selected type}
+>;
+
+/**
+ * CPU-plugin single-layer test for Concat.
+ * Parameterized by concatCPUTestParams: concat axis, input shapes, network precision,
+ * target device name and the CPU-specific expectations (formats, priority, selected type).
+ */
+class ConcatLayerCPUTest : public testing::WithParamInterface<concatCPUTestParams>,
+                           virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
+public:
+    /** Builds the test-case name from the parameter tuple (shapes, axis, precision, device, CPU params). */
+    static std::string getTestCaseName(testing::TestParamInfo<concatCPUTestParams> obj) {
+        // size_t matches the axis element of concatCPUTestParams, so std::tie does not narrow it.
+        size_t axis;
+        std::vector<std::vector<size_t>> inputShapes;
+        InferenceEngine::Precision netPrecision;
+        std::string targetName;
+        CPUSpecificParams cpuParams;
+        std::tie(axis, inputShapes, netPrecision, targetName, cpuParams) = obj.param;
+
+        std::ostringstream result;
+        result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
+        result << "axis=" << axis << "_";
+        result << "netPRC=" << netPrecision.name() << "_";
+        result << "trgDev=" << targetName << "_";
+        result << CPUTestsBase::getTestCaseName(cpuParams);
+        return result.str();
+    }
+protected:
+    /** Unpacks the test parameters and builds a function with one Concat over all inputs. */
+    void SetUp() override {
+        // Same element type as in concatCPUTestParams; converted explicitly for Concat below.
+        size_t axis;
+        std::vector<std::vector<size_t>> inputShape;
+        InferenceEngine::Precision netPrecision;
+        CPUSpecificParams cpuParams;
+        std::tie(axis, inputShape, netPrecision, targetDevice, cpuParams) = this->GetParam();
+        inPrc = outPrc = netPrecision;
+
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
+        // The expected selected type is suffixed with the input precision name, e.g. "ref_FP32".
+        selectedType += std::string("_") + inPrc.name();
+
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+        // One Parameter per input shape; all of them feed the Concat.
+        auto params = ngraph::builder::makeParams(ngPrc, inputShape);
+        auto paramOuts = ngraph::helpers::convert2OutputVector(
+                ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
+        auto concat = std::make_shared<ngraph::opset1::Concat>(paramOuts, static_cast<int64_t>(axis));
+
+        function = makeNgraphFunction(ngPrc, params, concat, "concat");
+    }
+};
+
+// Runs the Concat function through Run() and then hands executableNetwork to
+// CheckPluginRelatedResults() for the "Concatenation" node type.
+// NOTE(review): presumably that call checks the formats / selected type taken from
+// CPUSpecificParams in SetUp() — confirm in CPUTestsBase.
+TEST_P(ConcatLayerCPUTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    Run();
+    CheckPluginRelatedResults(executableNetwork, "Concatenation");
+}
+
+namespace {
+// CPUSpecificParams presets. SetUp() unpacks the four fields as
+// {inFmts, outFmts, priority, selectedType}.
+
+// Planar layouts with "ref" given both as priority and as the selected type.
+const auto planar_4D_ref = CPUSpecificParams{{nchw}, {nchw}, {"ref"}, "ref"};
+const auto planar_5D_ref = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref"}, "ref"};
+
+// Planar layouts, no priority, selected type "unknown".
+const auto planar_4D = CPUSpecificParams{{nchw}, {nchw}, {}, "unknown"};
+const auto planar_5D = CPUSpecificParams{{ncdhw}, {ncdhw}, {}, "unknown"};
+
+// Channels-last layouts (nhwc / ndhwc), no priority, selected type "ref".
+const auto planarChannels_4D = CPUSpecificParams{{nhwc}, {nhwc}, {}, "ref"};
+const auto planarChannels_5D = CPUSpecificParams{{ndhwc}, {ndhwc}, {}, "ref"};
+
+// Layouts blocked by 8 channels, selected type "unknown".
+const auto blocked8_4D = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "unknown"};
+const auto blocked8_5D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "unknown"};
+
+// Layouts blocked by 8 channels, selected type "ref" (priority left empty).
+const auto blocked8_4D_ref = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "ref"};
+const auto blocked8_5D_ref = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "ref"};
+
+// Layouts blocked by 16 channels, selected type "unknown".
+const auto blocked16_4D = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "unknown"};
+const auto blocked16_5D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "unknown"};
+
+// Layouts blocked by 16 channels, selected type "ref" (priority left empty).
+const auto blocked16_4D_ref = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "ref"};
+const auto blocked16_5D_ref = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "ref"};
+
+// List of precisions natively supported by mkldnn.
+// Every suite below is instantiated once per entry of this list.
+const std::vector<Precision> netPrecisions = {
+        Precision::I8,
+        Precision::I32,
+        Precision::FP32,
+        Precision::BF16
+};
+
+// 4D, concat along axis 1 of {1, 8, 3, 5} and {1, 16, 3, 5} inputs for the planar,
+// channels-last and 8-blocked presets.
+// NOTE(review): this suite is prefixed "concat_" while its siblings use "smoke_" — if test
+// selection filters on the "smoke" prefix this suite is left out; confirm that is intended.
+INSTANTIATE_TEST_CASE_P(concat_Concat4D_CPU_Block8inPlace, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(1),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{1, 8,  3, 5},
+                                                                                   {1, 16, 3, 5}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(planar_4D, planarChannels_4D, blocked8_4D)),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+// 4D, concat along axes 0, 2 and 3 of two equal {2, 16, 3, 5} inputs; all three presets
+// used here carry the selected type "ref".
+INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block8, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(0, 2, 3),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
+                                                                                   {2, 16, 3, 5}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(planar_4D_ref, planarChannels_4D, blocked8_4D_ref)),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+// 4D, concat along axis 1 of {2, 16, 3, 5} and {2, 32, 3, 5} inputs in the 16-blocked
+// layout; the preset's selected type is "unknown".
+INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16inPlace, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(1),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
+                                                                                   {2, 32, 3, 5}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(blocked16_4D)),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+// 4D, concat along axes 0, 2 and 3 of two equal {2, 32, 3, 5} inputs in the 16-blocked
+// layout; the preset's selected type is "ref".
+INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(0, 2, 3),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5},
+                                                                                   {2, 32, 3, 5}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(blocked16_4D_ref)),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+// 5D, concat along axis 1 of {1, 8, 3, 5, 7} and {1, 16, 3, 5, 7} inputs for the planar,
+// channels-last and 8-blocked presets.
+// NOTE(review): this suite is prefixed "concat_" while its siblings use "smoke_" — if test
+// selection filters on the "smoke" prefix this suite is left out; confirm that is intended.
+INSTANTIATE_TEST_CASE_P(concat_Concat5D_CPU_Block8inPlace, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(1),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{1, 8,  3, 5, 7},
+                                                                                   {1, 16, 3, 5, 7}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(planar_5D, planarChannels_5D, blocked8_5D)),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+// 5D, concat along axes 0, 2, 3 and 4 of two equal {2, 16, 3, 5, 7} inputs; all three
+// presets used here carry the selected type "ref".
+INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block8, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(0, 2, 3, 4),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
+                                                                                   {2, 16, 3, 5, 7}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(planar_5D_ref, planarChannels_5D, blocked8_5D_ref)),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+// 5D, concat along axis 1 of {2, 16, 3, 5, 7} and {2, 32, 3, 5, 7} inputs in the 16-blocked
+// layout; the preset's selected type is "unknown".
+INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16inPlace, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(1),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
+                                                                                   {2, 32, 3, 5, 7}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(blocked16_5D)),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+// 5D, concat along axes 0, 2, 3 and 4 of two equal {2, 32, 3, 5, 7} inputs in the 16-blocked
+// layout; the preset's selected type is "ref".
+INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(0, 2, 3, 4),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5, 7},
+                                                                                   {2, 32, 3, 5, 7}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(blocked16_5D_ref)),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+
+// 3D and 2D inputs concatenated along axis 1; no formats or priority are requested and
+// the selected type is "unknown".
+INSTANTIATE_TEST_CASE_P(smoke_Concat_inPlace, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(1),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{2, 3, 5},
+                                                                                   {2, 4, 5}},
+                                                  std::vector<std::vector<size_t>>{{2, 3},
+                                                                                   {2, 4}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+// 3D, concat along axes 0 and 2 of two equal {2, 4, 5} inputs; no formats or priority are
+// requested and the selected type is "ref".
+INSTANTIATE_TEST_CASE_P(smoke_Concat3D, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(0, 2),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{2, 4, 5},
+                                                                                   {2, 4, 5}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+// 2D ({2, 4} + {3, 4}) and 1D ({2} + {3}) inputs concatenated along axis 0; no formats or
+// priority are requested and the selected type is "ref".
+INSTANTIATE_TEST_CASE_P(smoke_Concat_1D_2D, ConcatLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Values(0),
+                                ::testing::Values(std::vector<std::vector<size_t>>{{2, 4},
+                                                                                   {3, 4}},
+                                                  std::vector<std::vector<size_t>>{{2}, {3}}),
+                                ::testing::ValuesIn(netPrecisions),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                                ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
+                        ConcatLayerCPUTest::getTestCaseName);
+
+} // namespace
+} // namespace CPULayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_transpose_reorder.cpp b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_transpose_reorder.cpp
@@ -222,6 +222,7 @@ void FuseTransposeAndReorderTest2::CreateGraph() {
     transpose2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {});
 
     auto concat = ngraph::builder::makeConcat({transpose1, transpose2}, 1);
+    concat->get_rt_info() = makeCPUInfo({memFmt1, memFmt1}, {memFmt1}, {});
 
     ngraph::ResultVector results{std::make_shared<ngraph::opset5::Result>(concat)};
     function = std::make_shared<ngraph::Function>(results, params, "Transpose_Transpose_Concat");