Bf16 crop layer (#4)
* [IE TESTS][CPU] CPU-specific test for the Crop layer has been created.

* [IE TESTS][CPU] Deprecated Crop single layer test removed.

* [CPU BF16] Bfloat16 precision was added to the Crop layer.

* [CPU BF16] Crop layer minor code improvements.

* [IE TESTS][CPU] 2D tensor tests added to the Crop layer test.

* [IE TESTS][CPU] Crop layer test, obsolete comment removed.

* [IE TESTS][CPU] Fixed CropIE include path.

* Crop test fix for older gcc compiler.
maxnick committed Nov 10, 2020
1 parent 194519c commit 7bb0249
Showing 5 changed files with 293 additions and 293 deletions.
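
For context: bfloat16 keeps the sign bit, the full 8-bit exponent, and the top 7 mantissa bits of an IEEE-754 float32, so a float32 can be narrowed simply by dropping its low 16 bits. A minimal, self-contained sketch of the storage format this commit targets (illustration only, not part of the commit; truncation is used instead of round-to-nearest-even for brevity):

#include <cstdint>
#include <cstring>
#include <iostream>

// Truncate a float32 to bfloat16 by keeping the upper 16 bits
// (round-toward-zero; production code would round-to-nearest-even).
uint16_t f32_to_bf16(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
}

float bf16_to_f32(uint16_t h) {
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

int main() {
    float x = 3.1415926f;
    uint16_t b = f32_to_bf16(x);
    std::cout << bf16_to_f32(b) << "\n"; // prints ~3.140625, precision loss expected
}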
@@ -27,6 +27,8 @@ class INFERENCE_ENGINE_API_CLASS(CropIE) : public Op {
    void validate_and_infer_types() override;

    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;

    bool evaluate(const HostTensorVector& outputs,
                  const HostTensorVector& inputs) const override;

    std::vector<int64_t> axes, dim, offset;
};
77 changes: 77 additions & 0 deletions inference-engine/src/legacy_api/src/ngraph_ops/crop_ie.cpp
@@ -45,3 +45,80 @@ void op::CropIE::validate_and_infer_types() {

    set_output_type(0, get_input_element_type(0), PartialShape(output_shape));
}

bool op::CropIE::evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const {
    if (inputs.front()->get_element_type() != outputs.front()->get_element_type()) {
        throw ngraph_error("Input and output data types must be the same!");
    }

    auto* dst_ptr = outputs.front()->get_data_ptr<uint8_t>();

    const int ndims = dim.size();

    const size_t OFFSET_N = (ndims > 0) ? offset.at(0) : 0;
    const size_t OFFSET_C = (ndims > 1) ? offset.at(1) : 0;
    const size_t OFFSET_D = (ndims > 4) ? offset.at(ndims - 3) : 0;
    const size_t OFFSET_H = (ndims > 2) ? offset.at(ndims - 2) : 0;
    const size_t OFFSET_W = (ndims > 3) ? offset.at(ndims - 1) : 0;

    auto outputShape = get_output_partial_shape(0).get_shape();

    const size_t ON = (ndims > 0) ? outputShape.at(0) : 1;
    const size_t OC = (ndims > 1) ? outputShape.at(1) : 1;
    const size_t OD = (ndims > 4) ? outputShape.at(ndims - 3) : 1;
    const size_t OH = (ndims > 2) ? outputShape.at(ndims - 2) : 1;
    const size_t OW = (ndims > 3) ? outputShape.at(ndims - 1) : 1;

    auto inputShape = get_input_partial_shape(0).get_shape();

    const size_t IN = (ndims > 0) ? inputShape.at(0) : 1;
    const size_t IC = (ndims > 1) ? inputShape.at(1) : 1;
    const size_t ID = (ndims > 4) ? inputShape.at(ndims - 3) : 1;
    const size_t IH = (ndims > 2) ? inputShape.at(ndims - 2) : 1;
    const size_t IW = (ndims > 3) ? inputShape.at(ndims - 1) : 1;

    // Linear element offsets in the (possibly degenerate) 5D output and input tensors.
    auto dst_off = [=](size_t n, size_t c, size_t d, size_t h, size_t w) -> size_t {
        return (n * OC * OD * OH * OW + c * OD * OH * OW + d * OH * OW + h * OW + w);
    };
    auto src_off = [=](size_t n, size_t c, size_t d, size_t h, size_t w) -> size_t {
        return (n * IC * ID * IH * IW + c * ID * IH * IW + d * IH * IW + h * IW + w);
    };

    // The cropped window must fit inside the input on every axis.
    if (IN - OFFSET_N < ON) {
        throw ngraph_error("Wrong offset!");
    }
    if (IC - OFFSET_C < OC) {
        throw ngraph_error("Wrong offset!");
    }
    if (ID - OFFSET_D < OD) {
        throw ngraph_error("Wrong offset!");
    }
    if (IH - OFFSET_H < OH) {
        throw ngraph_error("Wrong offset!");
    }
    if (IW - OFFSET_W < OW) {
        throw ngraph_error("Wrong offset!");
    }

    const size_t dataSize = inputs.front()->get_element_type().size();

    // Copy element-by-element as raw bytes so any precision (FP32, BF16, ...) works.
    auto src_ptr = inputs.front()->get_data_ptr<const uint8_t>();
    for (size_t n = 0; n < ON; ++n) {
        for (size_t c = 0; c < OC; ++c) {
            for (size_t d = 0; d < OD; ++d) {
                for (size_t h = 0; h < OH; ++h) {
                    for (size_t w = 0; w < OW; ++w) {
                        memcpy(dst_ptr + dataSize * dst_off(n, c, d, h, w),
                               src_ptr + dataSize * src_off(n + OFFSET_N, c + OFFSET_C, d + OFFSET_D, h + OFFSET_H, w + OFFSET_W),
                               dataSize);
                    }
                }
            }
        }
    }

    return true;
}
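
The important property of evaluate above is that it never interprets element values: it computes offsets in elements and copies dataSize bytes per element, which is why the same code path serves FP32 and BF16 alike. A standalone sketch of the same idea for the 2D case (hypothetical crop2d helper, not the commit's code), copying whole rows per memcpy rather than single elements:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Generic 2D crop: copies an OH x OW window starting at (offH, offW)
// from an IH x IW source, elemSize bytes per element, type-agnostic.
void crop2d(const void* src, void* dst,
            size_t IH, size_t IW, size_t OH, size_t OW,
            size_t offH, size_t offW, size_t elemSize) {
    assert(offH + OH <= IH && offW + OW <= IW);
    auto* s = static_cast<const unsigned char*>(src);
    auto* d = static_cast<unsigned char*>(dst);
    for (size_t h = 0; h < OH; ++h) {
        // One contiguous row of the window moves with a single memcpy.
        std::memcpy(d + elemSize * (h * OW),
                    s + elemSize * ((h + offH) * IW + offW),
                    elemSize * OW);
    }
}

int main() {
    std::vector<uint16_t> in(4 * 4); // e.g. a bf16 payload; the values stay opaque
    for (size_t i = 0; i < in.size(); ++i) in[i] = static_cast<uint16_t>(i);
    std::vector<uint16_t> out(2 * 3);
    crop2d(in.data(), out.data(), 4, 4, 2, 3, 1, 1, sizeof(uint16_t));
    assert(out[0] == 5 && out[5] == 11); // rows 1-2, cols 1-3 of the source
}

Copying a row at a time is a small optimization over the per-element memcpy in evaluate; the reference implementation above favors simplicity since it only backs constant folding and tests.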
64 changes: 18 additions & 46 deletions inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp
@@ -58,13 +58,12 @@ void MKLDNNCropNode::initSupportedPrimitiveDescriptors() {
return;

InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-    if (precision != InferenceEngine::Precision::FP32)
-        precision = InferenceEngine::Precision::FP32;
    auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
    precision = getCnnLayer()->outData[0]->getPrecision();
-    if (precision != InferenceEngine::Precision::FP32)
-        precision = InferenceEngine::Precision::FP32;
    auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
    if (inputDataType != outputDataType) {
        outputDataType = inputDataType; // Crop doesn't convert precisions, only moves data
    }

auto& inDims = getParentEdgeAt(0)->getDims();
if (inDims.ndims() != 2 && inDims.ndims() != 4 && inDims.ndims() != 5) {
@@ -125,19 +124,19 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
if (!MKLDNNMemory::IsPlainFormat(parentMem.GetFormat())) {
m_block_size = parentMem.GetDescriptor().data.layout_desc.blocking.block_dims[1];
}
-    int m_inner_dim = dims[dims.size() - 1] * m_block_size;
    const int m_inner_dim = dims[dims.size() - 1] * m_block_size;

const memory &dst_d = getChildEdgeAt(0)->getMemory().GetPrimitive();

-    int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims;
    const int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims;

    // TODO: Rewrite for the general case: any tensor and rank,
    // without using the letters N, C, D, H, W
-    int OFFSET_N = (dst_ndims > 0) ? offsets[0] : 0;
-    int OFFSET_C = (dst_ndims > 1) ? offsets[1] : 0;
-    int OFFSET_D = (dst_ndims > 4) ? offsets[offsets.size() - 3] : 0;
-    int OFFSET_H = (dst_ndims > 2) ? offsets[offsets.size() - 2] : 0;
-    int OFFSET_W = (dst_ndims > 3) ? offsets[offsets.size() - 1] : 0;
    const int OFFSET_N = (dst_ndims > 0) ? offsets[0] : 0;
    const int OFFSET_C = (dst_ndims > 1) ? offsets[1] : 0;
    const int OFFSET_D = (dst_ndims > 4) ? offsets[offsets.size() - 3] : 0;
    const int OFFSET_H = (dst_ndims > 2) ? offsets[offsets.size() - 2] : 0;
    const int OFFSET_W = (dst_ndims > 3) ? offsets[offsets.size() - 1] : 0;

    // TODO: Check applicability of dyn_batch_lim in early steps.
    //       Cropping the batch dimension is not supported with dynamic batch.
Expand All @@ -155,42 +154,16 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
const int IH = (src_ndims > 2) ? src_dims[src_dims.size() - 2] : 1;
const int IW = (src_ndims > 3) ? src_dims[src_dims.size() - 1] : 1;

-    const auto *src_data = reinterpret_cast<const float*>(parentMem.GetData()) +
-            parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding;
-    float *dst_data = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
-            getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
    const uint8_t itemSize = MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(parentMem.GetDataType()));

    const auto *src_data = reinterpret_cast<const uint8_t *>(parentMem.GetData()) +
            itemSize * parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding;
    auto *dst_data = reinterpret_cast<uint8_t*>(getChildEdgeAt(0)->getMemory().GetData()) +
            itemSize * getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;

-#ifdef _WIN32
-    if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) {
-        for (int n = 0; n < ON; ++n) {
-            cpu_memcpy(&dst_data[n*OC], &src_data[(n+OFFSET_N)*IC + OFFSET_C], OC * sizeof(float));
-        }
-    } else {
-        for (int n = 0; n < ON; ++n) {
-            for (int c = 0; c < OC; c += m_block_size) {
-                for (int d = 0; d < OD; ++d) {
-                    for (int h = 0; h < OH; ++h) {
-                        int dst_ind =
-                                n*OC*OD*OH*OW + c*OD*OH*OW + d*OH*OW*m_block_size +
-                                h*OW*m_block_size;
-
-                        int src_ind =
-                                (n+OFFSET_N)*IC*ID*IH*IW +
-                                (c+OFFSET_C)*ID*IH*IW +
-                                (d+OFFSET_D)*IH*IW*m_block_size +
-                                (h+OFFSET_H)*IW*m_block_size +
-                                OFFSET_W*m_block_size;
-
-                        cpu_memcpy(dst_data + dst_ind, src_data + src_ind, m_inner_dim * sizeof(float));
-                    }
-                }
-            }
-        }
-    }
-#else
    if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) {
        parallel_for(ON, [&](int n) {
-            cpu_memcpy(&dst_data[n*OC], &src_data[(n+OFFSET_N)*IC + OFFSET_C], OC * sizeof(float));
            cpu_memcpy(dst_data + itemSize * n * OC, src_data + itemSize * ((n+OFFSET_N)*IC + OFFSET_C), OC * itemSize);
        });
    } else {
        parallel_for2d(ON, (OC / m_block_size), [&](int n, int c) {
@@ -201,15 +174,14 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
((d+OFFSET_D)*IH*IW + OFFSET_H*IW + OFFSET_W)*m_block_size;

for (int h = 0; h < OH; ++h) {
-                cpu_memcpy(dst_data + dst_ind, src_data + src_ind, m_inner_dim * sizeof(float));
                cpu_memcpy(dst_data + itemSize * dst_ind, src_data + itemSize * src_ind, m_inner_dim * itemSize);

src_ind += IW * m_block_size;
dst_ind += OW * m_block_size;
}
}
});
}
-#endif
}

bool MKLDNNCropNode::created() const {
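
The m_block_size factor in the kernel above reflects oneDNN blocked layouts such as nChw16c, where channels are split into groups of 16 stored innermost, so row strides scale by the block size. A hedged sketch of that index arithmetic (assumed layout math for illustration, not plugin code):

#include <cassert>
#include <cstddef>

// Linear offset of element (n, c, h, w) in an nChw16c-blocked tensor:
// channels are grouped into blocks of 16, and the intra-block channel
// index is the innermost dimension.
size_t offset_nChw16c(size_t n, size_t c, size_t h, size_t w,
                      size_t C, size_t H, size_t W) {
    const size_t block = 16;
    const size_t cBlocks = (C + block - 1) / block; // padded channel blocks
    return (((n * cBlocks + c / block) * H + h) * W + w) * block + c % block;
}

int main() {
    // In plain NCHW the same element sits at ((n*C + c)*H + h)*W + w;
    // in the blocked layout a whole 16-channel group is contiguous per (h, w),
    // which is why the Crop kernel scales dst_ind/src_ind by m_block_size.
    assert(offset_nChw16c(0, 0, 0, 0, 32, 8, 8) == 0);
    assert(offset_nChw16c(0, 1, 0, 0, 32, 8, 8) == 1);            // next channel in block
    assert(offset_nChw16c(0, 16, 0, 0, 32, 8, 8) == 8 * 8 * 16);  // next channel block
}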
@@ -0,0 +1,196 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "../src/legacy_api/include/legacy/ngraph_ops/crop_ie.hpp"
#include "ngraph_functions/builders.hpp"
#include "test_utils/cpu_test_utils.hpp"

using namespace InferenceEngine;
using namespace CPUTestUtils;

namespace CPULayerTestsDefinitions {

typedef std::tuple<
        std::vector<size_t>,  // input shape
        std::vector<int64_t>, // dims
        std::vector<int64_t>  // offset
> testCaseParams;

typedef std::tuple<
testCaseParams,
InferenceEngine::Precision, // Net precision; it alone determines input/output precisions, since the primitive is not supposed to convert precisions.
std::string, // Device name
std::map<std::string, std::string>, // Additional network configuration
CPUSpecificParams> CropLayerCPUTestParamSet;

class CropLayerCPUTest : public testing::WithParamInterface<CropLayerCPUTestParamSet>,
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
public:
static std::string getTestCaseName(testing::TestParamInfo<CropLayerCPUTestParamSet> obj) {
testCaseParams testCase;
InferenceEngine::Precision netPrc;
std::string targetName;
std::map<std::string, std::string> additionalConfig;

CPUSpecificParams cpuParams;
std::tie(testCase, netPrc, targetName, additionalConfig, cpuParams) = obj.param;

std::ostringstream result;
result << "inShape=" << CommonTestUtils::vec2str(std::get<0>(testCase)) << "_";
result << "dims=" << CommonTestUtils::vec2str(std::get<1>(testCase)) << "_";
result << "offset=" << CommonTestUtils::vec2str(std::get<2>(testCase)) << "_";
result << "netPRC=" << netPrc.name() << "_";
result << "targetDevice=" << targetName;
result << CPUTestsBase::getTestCaseName(cpuParams);

return result.str();
}
protected:
void SetUp() override {
testCaseParams testCase;
std::vector<size_t> inpShape;
std::vector<int64_t> dims;
std::vector<int64_t> offset;
InferenceEngine::Precision netPrecision;
std::map<std::string, std::string> additionalConfig;
CPUSpecificParams cpuParams;
std::tie(testCase, netPrecision, targetDevice, additionalConfig, cpuParams) = this->GetParam();
std::tie(inpShape, dims, offset) = testCase;
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
inPrc = outPrc = netPrecision;

configuration.insert(additionalConfig.begin(), additionalConfig.end());

auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto params = ngraph::builder::makeParams(ngPrc, {inpShape});
auto paramOuts = ngraph::helpers::convert2OutputVector(
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));

std::vector<int64_t> axes;
for (size_t i = 0; i < inpShape.size(); ++i) {
axes.push_back(i);
}
auto ss = std::make_shared<ngraph::op::CropIE>(paramOuts[0], axes, dims, offset);

std::string strExpectedPrc;
if (Precision::BF16 == inPrc) {
strExpectedPrc = "BF16";
} else if (Precision::FP32 == inPrc) {
strExpectedPrc = "FP32";
}

selectedType = "unknown_" + strExpectedPrc;

ss->get_rt_info() = getCPUInfo();

ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(ss)};
function = std::make_shared<ngraph::Function>(results, params, "Crop");
}
};

TEST_P(CropLayerCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()

Run();
CheckCPUImpl(executableNetwork, "Crop");
}

namespace {
// Within the test scope we don't need any implicit bf16 optimisations, so let's run the network as is.
std::map<std::string, std::string> additional_config = {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}};

std::vector<Precision> netPrc = {Precision::BF16, Precision::FP32};

std::vector<testCaseParams> testCasesPlain2D = {testCaseParams{{32, 32}, {32, 10}, {0, 20}},
testCaseParams{{32, 20}, {30, 10}, {2, 10}}};

const auto CropParamsPlain2D = ::testing::Combine(
::testing::ValuesIn(testCasesPlain2D),
::testing::ValuesIn(netPrc),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(additional_config),
::testing::Values(emptyCPUSpec));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Plain_2D, CropLayerCPUTest, CropParamsPlain2D, CropLayerCPUTest::getTestCaseName);

std::vector<testCaseParams> testCasesPlain4D = {testCaseParams{{1, 5, 32, 32}, {1, 2, 23, 23}, {0, 2, 5, 4}},
testCaseParams{{1, 5, 32, 32}, {1, 5, 5, 5}, {0, 0, 20, 20}},
testCaseParams{{1, 5, 32, 32}, {1, 5, 32, 10}, {0, 0, 0, 20}},
testCaseParams{{1, 5, 32, 20}, {1, 5, 30, 10}, {0, 0, 2, 10}}};

std::vector<CPUSpecificParams> cpuParams_4D = {
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
CPUSpecificParams({nchw}, {nchw}, {}, {})
};

const auto CropParamsPlain4D = ::testing::Combine(
::testing::ValuesIn(testCasesPlain4D),
::testing::ValuesIn(netPrc),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(additional_config),
::testing::Values(cpuParams_4D.at(1)));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Plain_4D, CropLayerCPUTest, CropParamsPlain4D, CropLayerCPUTest::getTestCaseName);

std::vector<testCaseParams> testCasesBlocked4D = {testCaseParams{{1, 16, 32, 32}, {1, 16, 5, 5}, {0, 0, 20, 20}},
testCaseParams{{1, 32, 32, 32}, {1, 16, 32, 10}, {0, 0, 0, 20}}};

const auto CropParamsBlocked4D = ::testing::Combine(
::testing::ValuesIn(testCasesBlocked4D),
::testing::ValuesIn(netPrc),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(additional_config),
::testing::Values(filterCPUSpecificParams(cpuParams_4D).front()));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Blocked_4D, CropLayerCPUTest, CropParamsBlocked4D, CropLayerCPUTest::getTestCaseName);

std::vector<testCaseParams> testCasesPlain4DynBatch = {testCaseParams{{10, 5, 32, 32}, {1, 2, 23, 23}, {0, 2, 5, 4}},
testCaseParams{{10, 5, 32, 32}, {1, 5, 5, 5}, {0, 0, 20, 20}},
testCaseParams{{10, 5, 32, 32}, {1, 5, 32, 10}, {0, 0, 0, 20}},
testCaseParams{{10, 5, 32, 20}, {1, 5, 30, 10}, {0, 0, 2, 10}}};

std::map<std::string, std::string> additional_config_dyn_batch = {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO},
{PluginConfigParams::KEY_DYN_BATCH_ENABLED, PluginConfigParams::YES}};

const auto CropParamsPlain4DynBatch = ::testing::Combine(
::testing::ValuesIn(testCasesPlain4DynBatch),
::testing::ValuesIn(netPrc),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(additional_config_dyn_batch),
::testing::Values(cpuParams_4D.at(1)));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Blocked_4DynBatch, CropLayerCPUTest, CropParamsPlain4DynBatch, CropLayerCPUTest::getTestCaseName);

std::vector<testCaseParams> testCasesPlain5D = {testCaseParams{{1, 5, 32, 20, 14}, {1, 5, 30, 10, 8}, {0, 0, 2, 10, 6}},
testCaseParams{{5, 9, 32, 20, 14}, {2, 5, 30, 10, 8}, {3, 4, 2, 10, 6}}};

std::vector<CPUSpecificParams> cpuParams_5D = {
CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}),
CPUSpecificParams({ncdhw}, {ncdhw}, {}, {})
};

const auto CropParamsPlain5D = ::testing::Combine(
::testing::ValuesIn(testCasesPlain5D),
::testing::ValuesIn(netPrc),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(additional_config),
::testing::Values(cpuParams_5D.at(1)));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Plain_5D, CropLayerCPUTest, CropParamsPlain5D, CropLayerCPUTest::getTestCaseName);

std::vector<testCaseParams> testCasesBlocked5D = {testCaseParams{{1, 32, 32, 20, 14}, {1, 16, 30, 10, 8}, {0, 0, 2, 10, 6}},
testCaseParams{{5, 32, 32, 20, 14}, {2, 32, 30, 10, 8}, {3, 0, 2, 10, 6}}};

const auto CropParamsBlocked5D = ::testing::Combine(
::testing::ValuesIn(testCasesBlocked5D),
::testing::ValuesIn(netPrc),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(additional_config),
::testing::Values(cpuParams_5D.at(0)));

INSTANTIATE_TEST_CASE_P(CompareWithRefs_Blocked_5D, CropLayerCPUTest, CropParamsBlocked5D, CropLayerCPUTest::getTestCaseName);

} // namespace
} // namespace CPULayerTestsDefinitions
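
Each testCaseParams tuple reads as (input shape, output dims, offsets), and a case is well-formed when offset[i] + dim[i] <= inputShape[i] on every axis, which is the same invariant CropIE::evaluate enforces. A quick standalone check of the 2D cases above (hypothetical cropFits helper, not test-suite code):

#include <cassert>
#include <cstdint>
#include <vector>

// A crop case is valid when the window fits: offset + dim <= input extent.
bool cropFits(const std::vector<size_t>& in,
              const std::vector<int64_t>& dim,
              const std::vector<int64_t>& off) {
    for (size_t i = 0; i < in.size(); ++i)
        if (off[i] + dim[i] > static_cast<int64_t>(in[i])) return false;
    return true;
}

int main() {
    assert(cropFits({32, 32}, {32, 10}, {0, 20})); // 0+32 <= 32, 20+10 <= 32
    assert(cropFits({32, 20}, {30, 10}, {2, 10})); // 2+30 <= 32, 10+10 <= 20
}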

