Skip to content

Commit

Permalink
[CPU] MVN_accuracy_fix_on_avx512 (openvinotoolkit#5787)
Browse files Browse the repository at this point in the history
  • Loading branch information
chenhu-wang authored and rnugmanx committed Aug 26, 2021
1 parent 86942da commit 1b0fea4
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 114 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,10 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 &reg_src, int offset_byte, In
h->uni_vcvtdq2ps(Vmm(out_vec_idx), Vmm(out_vec_idx));
break;
case Precision::I32:
if ((src_prc == Precision::FP32) || (src_prc == Precision::BF16))
if ((src_prc == Precision::FP32) || (src_prc == Precision::BF16)) {
h->uni_vroundps(Vmm(out_vec_idx), Vmm(out_vec_idx), 3); // rounding to zero
h->uni_vcvtps2dq(Vmm(out_vec_idx), Vmm(out_vec_idx));
}
break;
default:
break;
Expand Down Expand Up @@ -549,8 +551,10 @@ template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
if (src_prc != dst_prc) {
switch (src_prc) {
case Precision::FP32:
if ((dst_prc != Precision::FP32) && (dst_prc != Precision::BF16))
if ((dst_prc != Precision::FP32) && (dst_prc != Precision::BF16)) {
h->uni_vroundps(Vmm(in_vec_idx), Vmm(in_vec_idx), 3); // rounding to zero
h->uni_vcvtps2dq(Vmm(in_vec_idx), Vmm(in_vec_idx));
}
break;
case Precision::I32:
if ((dst_prc == Precision::FP32) || (dst_prc == Precision::BF16))
Expand Down Expand Up @@ -636,7 +640,7 @@ template <typename Vmm>
mask = (mask << store_size) - mask;
h->mov(Reg64(aux_gpr_idxs[0]), mask);
h->kmovq(k_mask, Reg64(aux_gpr_idxs[0]));
h->vmovdqu8(addr(0) | k_mask, zmm);
h->vmovdqu8(addr(0), zmm | k_mask);
} else {
if (store_size == 64) {
h->uni_vmovdqu(addr(0), zmm);
Expand Down Expand Up @@ -768,10 +772,10 @@ template <typename Vmm>
h->mov(Reg32(aux_gpr_idxs[0]), mask);
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
if (is_signed) {
h->vpmovsdb(addr(0) | k_mask, vmm);
h->vpmovsdb(addr(0), vmm | k_mask);
} else {
h->vpmaxsd(vmm, vmm, Vmm(aux_vec_idxs[0]));
h->vpmovusdb(addr(0) | k_mask, vmm);
h->vpmovusdb(addr(0), vmm | k_mask);
}
}
} else {
Expand Down Expand Up @@ -850,10 +854,10 @@ template <typename Vmm>
h->mov(Reg32(aux_gpr_idxs[0]), mask);
h->kmovw(k_mask, Reg32(aux_gpr_idxs[0]));
if (is_signed) {
h->vpmovsdw(ptr[reg + offset] | k_mask, vmm);
h->vpmovsdw(ptr[reg + offset], vmm | k_mask);
} else {
h->vmaxsd(vmm, Vmm(aux_vec_idxs[0]), vmm);
h->vpmovusdw(ptr[reg + offset] | k_mask, vmm);
h->vpmovusdw(ptr[reg + offset], vmm | k_mask);
}
}
} else {
Expand Down
12 changes: 1 addition & 11 deletions inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1122,17 +1122,7 @@ void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();

auto isSutableParentNode = [](MKLDNNNodePtr node) {
bool isSutableMVN = (node->getType() == MVN);

if (isSutableMVN) {
auto mvnNode = std::dynamic_pointer_cast<MKLDNNMVNNode>(node);
if (mvnNode == nullptr)
IE_THROW() << "CPU node with name '" << node->getName() << "' is not a MVN node.";

return mvnNode->getChildEdges().size() == 1 && !mvnNode->getAcrossChannels() && mvnNode->getNormalizeVariance();
} else {
return false;
}
return (node->getType() == MVN) && (node->getChildEdges().size() == 1);
};

auto parent = graphNodes.begin();
Expand Down
16 changes: 10 additions & 6 deletions inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -703,12 +703,6 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() {
setPostOps(attr, true);

Precision inputPrecision = getOriginalInputPrecisionAtPort(0);
if (getParentEdgeAt(0)->getDims().ndims() < 3 || getParentEdgeAt(0)->getDims().ndims() > 5
|| acrossChannels_ || !normalizeVariance_) {
if (!isFloatCompatible(inputPrecision)) {
inputPrecision = Precision::FP32;
}
}
Precision outputPrecision = getOriginalOutputPrecisionAtPort(0);
if (!mayiuse(avx512_core)) {
if (outputPrecision == Precision::BF16)
Expand Down Expand Up @@ -1409,6 +1403,16 @@ bool MKLDNNMVNNode::canFuse(const MKLDNNNodePtr& node) const {
if (!mayiuse(cpu::x64::sse41)) {
return false;
}
// Limit post-ops to unary eltwise when the shape is transformed onto the channel dimension:
// 1D input, and 2D input with acrossChannels_ set, may only be fused with unary eltwise ops.
int inputRank = getParentEdgeAt(0)->getDims().ndims();
bool unaryEltwise = one_of(node->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh,
EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu);
if ((inputRank == 1 && !unaryEltwise) ||
(inputRank == 2 && !unaryEltwise && acrossChannels_)) {
return false;
}

return canFuseSimpleOperation(node);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ const std::vector<std::vector<size_t>> inputShapes_2D = {
const std::vector<std::vector<size_t>> inputShapes_3D = {
{1, 32, 17},
{1, 37, 9},
{1, 16, 4},
};

const std::vector<std::vector<size_t>> inputShapes_4D = {
Expand Down Expand Up @@ -127,7 +128,8 @@ const std::vector<double> epsilon = {
0.000000001
};

std::vector<Precision> inpOutPrc = {Precision::BF16, Precision::FP32};
std::vector<Precision> inpPrc = {Precision::I8, Precision::BF16, Precision::FP32};
std::vector<Precision> outPrc = {Precision::BF16, Precision::FP32};

std::vector<CPUSpecificParams> cpuParams_4D = {
CPUSpecificParams({nhwc}, {nhwc}, {}, {}),
Expand All @@ -141,35 +143,20 @@ std::vector<CPUSpecificParams> cpuParams_5D = {
CPUSpecificParams({ncdhw}, {ncdhw}, {}, {})
};

const auto Mvn1D = ::testing::Combine(
::testing::Combine(
::testing::ValuesIn(inputShapes_1D),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::ValuesIn(acrossChannels),
::testing::ValuesIn(normalizeVariance),
::testing::ValuesIn(epsilon),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(emptyCPUSpec),
::testing::Values(emptyFusingSpec),
::testing::ValuesIn(inpOutPrc),
::testing::ValuesIn(inpOutPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_1D, MvnLayerCPUTest, Mvn1D, MvnLayerCPUTest::getTestCaseName);

const auto Mvn2D = ::testing::Combine(
::testing::Combine(
::testing::ValuesIn(inputShapes_2D),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::ValuesIn(acrossChannels),
::testing::ValuesIn(normalizeVariance),
::testing::ValuesIn(epsilon),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(emptyCPUSpec),
::testing::Values(emptyFusingSpec),
::testing::ValuesIn(inpOutPrc),
::testing::ValuesIn(inpOutPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_2D, MvnLayerCPUTest, Mvn2D, MvnLayerCPUTest::getTestCaseName);
std::vector<fusingSpecificParams> fusingParamsSet {
emptyFusingSpec,
/* activations */
fusingRelu,
fusingElu,
fusingTanh,
fusingSwish,
/* FQ */
fusingFakeQuantizePerChannel,
fusingFakeQuantizePerChannelRelu,
fusingFakeQuantizePerTensorRelu,
/* other patterns */
fusingScaleShift,
};

const auto Mvn3D = ::testing::Combine(
::testing::Combine(
Expand All @@ -180,11 +167,11 @@ const auto Mvn3D = ::testing::Combine(
::testing::ValuesIn(epsilon),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(emptyCPUSpec),
::testing::Values(emptyFusingSpec),
::testing::ValuesIn(inpOutPrc),
::testing::ValuesIn(inpOutPrc));
::testing::ValuesIn(fusingParamsSet),
::testing::ValuesIn(inpPrc),
::testing::ValuesIn(outPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_3D, MvnLayerCPUTest, Mvn3D, MvnLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn3D, MvnLayerCPUTest, Mvn3D, MvnLayerCPUTest::getTestCaseName);

const auto Mvn4D = ::testing::Combine(
::testing::Combine(
Expand All @@ -195,11 +182,11 @@ const auto Mvn4D = ::testing::Combine(
::testing::ValuesIn(epsilon),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)),
::testing::Values(emptyFusingSpec),
::testing::ValuesIn(inpOutPrc),
::testing::ValuesIn(inpOutPrc));
::testing::ValuesIn(fusingParamsSet),
::testing::ValuesIn(inpPrc),
::testing::ValuesIn(outPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D, MvnLayerCPUTest, Mvn4D, MvnLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn4D, MvnLayerCPUTest, Mvn4D, MvnLayerCPUTest::getTestCaseName);

const auto Mvn5D = ::testing::Combine(
::testing::Combine(
Expand All @@ -210,86 +197,67 @@ const auto Mvn5D = ::testing::Combine(
::testing::ValuesIn(epsilon),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)),
::testing::Values(emptyFusingSpec),
::testing::ValuesIn(inpOutPrc),
::testing::ValuesIn(inpOutPrc));
::testing::ValuesIn(fusingParamsSet),
::testing::ValuesIn(inpPrc),
::testing::ValuesIn(outPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D, MvnLayerCPUTest, Mvn5D, MvnLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn5D, MvnLayerCPUTest, Mvn5D, MvnLayerCPUTest::getTestCaseName);

std::vector<fusingSpecificParams> fusingParamsSet {
// 1D and 2D cases
std::vector<fusingSpecificParams> fusingUnaryEltwiseParamsSet {
/* activations */
fusingRelu,
fusingElu,
fusingTanh,
fusingSwish,
/* FQ */
fusingFakeQuantizePerChannel,
fusingFakeQuantizePerChannelRelu,
fusingFakeQuantizePerTensorRelu,
/* other patterns */
fusingScaleShift,
};

const auto Mvn2DFuse = ::testing::Combine(
const auto Mvn1D = ::testing::Combine(
::testing::Combine(
::testing::ValuesIn(inputShapes_2D),
::testing::ValuesIn(inputShapes_1D),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::Values(false),
::testing::Values(true),
::testing::ValuesIn(acrossChannels),
::testing::ValuesIn(normalizeVariance),
::testing::ValuesIn(epsilon),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(emptyCPUSpec),
::testing::ValuesIn(fusingParamsSet),
::testing::ValuesIn(inpOutPrc),
::testing::ValuesIn(inpOutPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_2D_Fuse, MvnLayerCPUTest, Mvn2DFuse, MvnLayerCPUTest::getTestCaseName);

const auto Mvn3DFuse = ::testing::Combine(
::testing::Combine(
::testing::ValuesIn(inputShapes_3D),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::Values(false),
::testing::Values(true),
::testing::ValuesIn(epsilon),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(emptyCPUSpec),
::testing::ValuesIn(fusingParamsSet),
::testing::ValuesIn(inpOutPrc),
::testing::ValuesIn(inpOutPrc));
::testing::ValuesIn(fusingUnaryEltwiseParamsSet),
::testing::ValuesIn(inpPrc),
::testing::ValuesIn(outPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_3D_Fuse, MvnLayerCPUTest, Mvn3DFuse, MvnLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn1D, MvnLayerCPUTest, Mvn1D, MvnLayerCPUTest::getTestCaseName);

const auto Mvn4DFuse = ::testing::Combine(
// 2D, shape not transformed
const auto Mvn2D = ::testing::Combine(
::testing::Combine(
::testing::ValuesIn(inputShapes_4D),
::testing::ValuesIn(inputShapes_2D),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::Values(false),
::testing::Values(true),
::testing::ValuesIn(normalizeVariance),
::testing::ValuesIn(epsilon),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(emptyCPUSpec),
::testing::ValuesIn(fusingParamsSet),
::testing::ValuesIn(inpOutPrc),
::testing::ValuesIn(inpOutPrc));
::testing::ValuesIn(inpPrc),
::testing::ValuesIn(outPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_Fuse, MvnLayerCPUTest, Mvn4DFuse, MvnLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Mvn2D, MvnLayerCPUTest, Mvn2D, MvnLayerCPUTest::getTestCaseName);

const auto Mvn5DFuse = ::testing::Combine(
// 2D, shape transformed
const auto Mvn2DTrans = ::testing::Combine(
::testing::Combine(
::testing::ValuesIn(inputShapes_5D),
::testing::ValuesIn(inputShapes_2D),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::Values(false),
::testing::Values(true),
::testing::ValuesIn(normalizeVariance),
::testing::ValuesIn(epsilon),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)),
::testing::ValuesIn(fusingParamsSet),
::testing::ValuesIn(inpOutPrc),
::testing::ValuesIn(inpOutPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_Fuse, MvnLayerCPUTest, Mvn5DFuse, MvnLayerCPUTest::getTestCaseName);
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(emptyCPUSpec),
::testing::ValuesIn(fusingUnaryEltwiseParamsSet),
::testing::ValuesIn(inpPrc),
::testing::ValuesIn(outPrc));

INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_MVN2DTrans, MvnLayerCPUTest, Mvn2DTrans, MvnLayerCPUTest::getTestCaseName);

} // namespace
} // namespace CPULayerTestsDefinitions

0 comments on commit 1b0fea4

Please sign in to comment.