From d2003095dc5656736ba19f93b92f6a02a8e5dd6e Mon Sep 17 00:00:00 2001 From: Chenhu Wang Date: Fri, 28 May 2021 22:56:04 +0800 Subject: [PATCH] [CPU] MVN_accuracy_fix_on_avx512 (#5787) --- .../emitters/jit_load_store_emitters.cpp | 18 ++- .../mkldnn_plugin/mkldnn_graph_optimizer.cpp | 12 +- .../mkldnn_plugin/nodes/mkldnn_mvn_node.cpp | 16 +- .../plugin/cpu/single_layer_tests/mvn.cpp | 148 +++++++----------- 4 files changed, 80 insertions(+), 114 deletions(-) diff --git a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp index 276791b7d7de6d..57689d6302d70a 100644 --- a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp @@ -105,8 +105,10 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, int offset_byte, In h->uni_vcvtdq2ps(Vmm(out_vec_idx), Vmm(out_vec_idx)); break; case Precision::I32: - if ((src_prc == Precision::FP32) || (src_prc == Precision::BF16)) + if ((src_prc == Precision::FP32) || (src_prc == Precision::BF16)) { + h->uni_vroundps(Vmm(out_vec_idx), Vmm(out_vec_idx), 3); // rounding to zero h->uni_vcvtps2dq(Vmm(out_vec_idx), Vmm(out_vec_idx)); + } break; default: break; @@ -549,8 +551,10 @@ template if (src_prc != dst_prc) { switch (src_prc) { case Precision::FP32: - if ((dst_prc != Precision::FP32) && (dst_prc != Precision::BF16)) + if ((dst_prc != Precision::FP32) && (dst_prc != Precision::BF16)) { + h->uni_vroundps(Vmm(in_vec_idx), Vmm(in_vec_idx), 3); // rounding to zero h->uni_vcvtps2dq(Vmm(in_vec_idx), Vmm(in_vec_idx)); + } break; case Precision::I32: if ((dst_prc == Precision::FP32) || (dst_prc == Precision::BF16)) @@ -636,7 +640,7 @@ template mask = (mask << store_size) - mask; h->mov(Reg64(aux_gpr_idxs[0]), mask); h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); - h->vmovdqu8(addr(0) | k_mask, zmm); + h->vmovdqu8(addr(0), zmm | k_mask); } else { if (store_size == 64) { h->uni_vmovdqu(addr(0), zmm); @@ -768,10 +772,10 @@ template h->mov(Reg32(aux_gpr_idxs[0]), mask); h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); if (is_signed) { - h->vpmovsdb(addr(0) | k_mask, vmm); + h->vpmovsdb(addr(0), vmm | k_mask); } else { h->vpmaxsd(vmm, vmm, Vmm(aux_vec_idxs[0])); - h->vpmovusdb(addr(0) | k_mask, vmm); + h->vpmovusdb(addr(0), vmm | k_mask); } } } else { @@ -850,10 +854,10 @@ template h->mov(Reg32(aux_gpr_idxs[0]), mask); h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); if (is_signed) { - h->vpmovsdw(ptr[reg + offset] | k_mask, vmm); + h->vpmovsdw(ptr[reg + offset], vmm | k_mask); } else { h->vmaxsd(vmm, Vmm(aux_vec_idxs[0]), vmm); - h->vpmovusdw(ptr[reg + offset] | k_mask, vmm); + h->vpmovusdw(ptr[reg + offset], vmm | k_mask); } } } else { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp index 17c27928112609..fd0c9fc136abfc 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp @@ -1122,17 +1122,7 @@ void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) { auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](MKLDNNNodePtr node) { - bool isSutableMVN = (node->getType() == MVN); - - if (isSutableMVN) { - auto mvnNode = std::dynamic_pointer_cast(node); - if (mvnNode == nullptr) - IE_THROW() << "CPU node with name '" << node->getName() << "' is not a MVN node."; - - return mvnNode->getChildEdges().size() == 1 && !mvnNode->getAcrossChannels() && mvnNode->getNormalizeVariance(); - } else { - return false; - } + return (node->getType() == MVN) && (node->getChildEdges().size() == 1); }; auto parent = graphNodes.begin(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp index 0bac642158e09b..f92de2af2c7aa6 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp @@ -703,12 +703,6 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() { setPostOps(attr, true); Precision inputPrecision = getOriginalInputPrecisionAtPort(0); - if (getParentEdgeAt(0)->getDims().ndims() < 3 || getParentEdgeAt(0)->getDims().ndims() > 5 - || acrossChannels_ || !normalizeVariance_) { - if (!isFloatCompatible(inputPrecision)) { - inputPrecision = Precision::FP32; - } - } Precision outputPrecision = getOriginalOutputPrecisionAtPort(0); if (!mayiuse(avx512_core)) { if (outputPrecision == Precision::BF16) @@ -1409,6 +1403,16 @@ bool MKLDNNMVNNode::canFuse(const MKLDNNNodePtr& node) const { if (!mayiuse(cpu::x64::sse41)) { return false; } + // limit post ops to unary when shape transformed on channel + // 1D only fused with unary + int inputRank = getParentEdgeAt(0)->getDims().ndims(); + bool unaryEltwise = one_of(node->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh, + EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven, + EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu); + if ((inputRank == 1 && !unaryEltwise) || + (inputRank == 2 && !unaryEltwise && acrossChannels_)) { + return false; + } return canFuseSimpleOperation(node); } diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mvn.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mvn.cpp index 31ca8b3cd09e55..f89e302c539e50 100644 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mvn.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mvn.cpp @@ -91,6 +91,7 @@ const std::vector> inputShapes_2D = { const std::vector> inputShapes_3D = { {1, 32, 17}, {1, 37, 9}, + {1, 16, 4}, }; const std::vector> inputShapes_4D = { @@ -127,7 +128,8 @@ const std::vector epsilon = { 0.000000001 }; -std::vector inpOutPrc = {Precision::BF16, Precision::FP32}; +std::vector inpPrc = {Precision::I8, Precision::BF16, Precision::FP32}; +std::vector outPrc = {Precision::BF16, Precision::FP32}; std::vector cpuParams_4D = { CPUSpecificParams({nhwc}, {nhwc}, {}, {}), @@ -141,35 +143,20 @@ std::vector cpuParams_5D = { CPUSpecificParams({ncdhw}, {ncdhw}, {}, {}) }; -const auto Mvn1D = ::testing::Combine( - ::testing::Combine( - ::testing::ValuesIn(inputShapes_1D), - ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::ValuesIn(acrossChannels), - ::testing::ValuesIn(normalizeVariance), - ::testing::ValuesIn(epsilon), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ::testing::Values(emptyCPUSpec), - ::testing::Values(emptyFusingSpec), - ::testing::ValuesIn(inpOutPrc), - ::testing::ValuesIn(inpOutPrc)); - -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_1D, MvnLayerCPUTest, Mvn1D, MvnLayerCPUTest::getTestCaseName); - -const auto Mvn2D = ::testing::Combine( - ::testing::Combine( - ::testing::ValuesIn(inputShapes_2D), - ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::ValuesIn(acrossChannels), - ::testing::ValuesIn(normalizeVariance), - ::testing::ValuesIn(epsilon), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ::testing::Values(emptyCPUSpec), - ::testing::Values(emptyFusingSpec), - ::testing::ValuesIn(inpOutPrc), - ::testing::ValuesIn(inpOutPrc)); - -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_2D, MvnLayerCPUTest, Mvn2D, MvnLayerCPUTest::getTestCaseName); +std::vector fusingParamsSet { + emptyFusingSpec, + /* activations */ + fusingRelu, + fusingElu, + fusingTanh, + fusingSwish, + /* FQ */ + fusingFakeQuantizePerChannel, + fusingFakeQuantizePerChannelRelu, + fusingFakeQuantizePerTensorRelu, + /* another patterns */ + fusingScaleShift, +}; const auto Mvn3D = ::testing::Combine( ::testing::Combine( @@ -180,11 +167,11 @@ const auto Mvn3D = ::testing::Combine( ::testing::ValuesIn(epsilon), ::testing::Values(CommonTestUtils::DEVICE_CPU)), ::testing::Values(emptyCPUSpec), - ::testing::Values(emptyFusingSpec), - ::testing::ValuesIn(inpOutPrc), - ::testing::ValuesIn(inpOutPrc)); + ::testing::ValuesIn(fusingParamsSet), + ::testing::ValuesIn(inpPrc), + ::testing::ValuesIn(outPrc)); -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_3D, MvnLayerCPUTest, Mvn3D, MvnLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn3D, MvnLayerCPUTest, Mvn3D, MvnLayerCPUTest::getTestCaseName); const auto Mvn4D = ::testing::Combine( ::testing::Combine( @@ -195,11 +182,11 @@ const auto Mvn4D = ::testing::Combine( ::testing::ValuesIn(epsilon), ::testing::Values(CommonTestUtils::DEVICE_CPU)), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), - ::testing::Values(emptyFusingSpec), - ::testing::ValuesIn(inpOutPrc), - ::testing::ValuesIn(inpOutPrc)); + ::testing::ValuesIn(fusingParamsSet), + ::testing::ValuesIn(inpPrc), + ::testing::ValuesIn(outPrc)); -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D, MvnLayerCPUTest, Mvn4D, MvnLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn4D, MvnLayerCPUTest, Mvn4D, MvnLayerCPUTest::getTestCaseName); const auto Mvn5D = ::testing::Combine( ::testing::Combine( @@ -210,86 +197,67 @@ const auto Mvn5D = ::testing::Combine( ::testing::ValuesIn(epsilon), ::testing::Values(CommonTestUtils::DEVICE_CPU)), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)), - ::testing::Values(emptyFusingSpec), - ::testing::ValuesIn(inpOutPrc), - ::testing::ValuesIn(inpOutPrc)); + ::testing::ValuesIn(fusingParamsSet), + ::testing::ValuesIn(inpPrc), + ::testing::ValuesIn(outPrc)); -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D, MvnLayerCPUTest, Mvn5D, MvnLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn5D, MvnLayerCPUTest, Mvn5D, MvnLayerCPUTest::getTestCaseName); -std::vector fusingParamsSet { +// 1D 2D case +std::vector fusingUnaryEltwiseParamsSet { /* activations */ fusingRelu, fusingElu, fusingTanh, fusingSwish, - /* FQ */ - fusingFakeQuantizePerChannel, - fusingFakeQuantizePerChannelRelu, - fusingFakeQuantizePerTensorRelu, - /* another patterns */ - fusingScaleShift, }; -const auto Mvn2DFuse = ::testing::Combine( +const auto Mvn1D = ::testing::Combine( ::testing::Combine( - ::testing::ValuesIn(inputShapes_2D), + ::testing::ValuesIn(inputShapes_1D), ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::Values(false), - ::testing::Values(true), + ::testing::ValuesIn(acrossChannels), + ::testing::ValuesIn(normalizeVariance), ::testing::ValuesIn(epsilon), ::testing::Values(CommonTestUtils::DEVICE_CPU)), ::testing::Values(emptyCPUSpec), - ::testing::ValuesIn(fusingParamsSet), - ::testing::ValuesIn(inpOutPrc), - ::testing::ValuesIn(inpOutPrc)); - -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_2D_Fuse, MvnLayerCPUTest, Mvn2DFuse, MvnLayerCPUTest::getTestCaseName); - -const auto Mvn3DFuse = ::testing::Combine( - ::testing::Combine( - ::testing::ValuesIn(inputShapes_3D), - ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::Values(false), - ::testing::Values(true), - ::testing::ValuesIn(epsilon), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ::testing::Values(emptyCPUSpec), - ::testing::ValuesIn(fusingParamsSet), - ::testing::ValuesIn(inpOutPrc), - ::testing::ValuesIn(inpOutPrc)); + ::testing::ValuesIn(fusingUnaryEltwiseParamsSet), + ::testing::ValuesIn(inpPrc), + ::testing::ValuesIn(outPrc)); -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_3D_Fuse, MvnLayerCPUTest, Mvn3DFuse, MvnLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn1D, MvnLayerCPUTest, Mvn1D, MvnLayerCPUTest::getTestCaseName); -const auto Mvn4DFuse = ::testing::Combine( +// 2D no transformed +const auto Mvn2D = ::testing::Combine( ::testing::Combine( - ::testing::ValuesIn(inputShapes_4D), + ::testing::ValuesIn(inputShapes_2D), ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(false), - ::testing::Values(true), + ::testing::ValuesIn(normalizeVariance), ::testing::ValuesIn(epsilon), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyCPUSpec), ::testing::ValuesIn(fusingParamsSet), - ::testing::ValuesIn(inpOutPrc), - ::testing::ValuesIn(inpOutPrc)); + ::testing::ValuesIn(inpPrc), + ::testing::ValuesIn(outPrc)); -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_Fuse, MvnLayerCPUTest, Mvn4DFuse, MvnLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn2D, MvnLayerCPUTest, Mvn2D, MvnLayerCPUTest::getTestCaseName); -const auto Mvn5DFuse = ::testing::Combine( +// 2d transformed +const auto Mvn2DTrans = ::testing::Combine( ::testing::Combine( - ::testing::ValuesIn(inputShapes_5D), + ::testing::ValuesIn(inputShapes_2D), ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::Values(false), ::testing::Values(true), + ::testing::ValuesIn(normalizeVariance), ::testing::ValuesIn(epsilon), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), - ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)), - ::testing::ValuesIn(fusingParamsSet), - ::testing::ValuesIn(inpOutPrc), - ::testing::ValuesIn(inpOutPrc)); - -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_Fuse, MvnLayerCPUTest, Mvn5DFuse, MvnLayerCPUTest::getTestCaseName); + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyCPUSpec), + ::testing::ValuesIn(fusingUnaryEltwiseParamsSet), + ::testing::ValuesIn(inpPrc), + ::testing::ValuesIn(outPrc)); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_MVN2DTrans, MvnLayerCPUTest, Mvn2DTrans, MvnLayerCPUTest::getTestCaseName); } // namespace } // namespace CPULayerTestsDefinitions \ No newline at end of file