From 8eab81f3e510c2d941cbca0c25fbfd0defc4afbb Mon Sep 17 00:00:00 2001
From: Chen Xu
Date: Wed, 26 Jun 2024 14:47:20 +0800
Subject: [PATCH] [CPU] Round towards zero for ReduceMean (#25217)

### Details:
 - *The vdivps instruction rounds to the nearest value. Here we append a vroundps instruction so that the result is rounded towards zero, which aligns the behavior with other plugins/frameworks (see the illustrative sketch after the diff).*
 - *Add a test case that reproduces the issue (it fails without the fix above).*

### Tickets:
 - *[issue#20815](https://github.com/openvinotoolkit/openvino/issues/20815)*
---
 src/plugins/intel_cpu/src/nodes/reduce.cpp    | 36 +++++++++++++------
 .../instances/x64/reduce.cpp                  | 25 +++++++++++++
 2 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp
index 814b1a0a6eeab9..63675b364d4425 100644
--- a/src/plugins/intel_cpu/src/nodes/reduce.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -319,8 +319,14 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
             mov(reg_divisor, ptr[reg_params + GET_OFF(divisor)]);
             uni_vbroadcastss(vmm_aux, ptr[reg_divisor]);
             uni_vdivps(vmm_dst, vmm_dst, vmm_aux);
+            if (!isFloatCompatible(jcp_.dst_dt)) {
+                uni_vroundps(vmm_dst, vmm_dst, 3);  // rounding to zero
+            }
             if (isa == cpu::x64::sse41) {
                 uni_vdivps(vmm_dst_aux, vmm_dst_aux, vmm_aux);
+                if (!isFloatCompatible(jcp_.dst_dt)) {
+                    uni_vroundps(vmm_dst_aux, vmm_dst_aux, 3);
+                }
             }
         }
         L(reduce_divide_end_label);
@@ -1374,14 +1380,14 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
             jl(reduce_loop_end_label, T_NEAR);

             wrap_load_vector(vmm_dst, 0);
-            reduce_map_kernel(vmm_dst);
+            reduce_map_kernel(vmm_dst, jcp_.dst_dt);
             if (post_ops_fusing)
                 apply_post_ops(jcp_.dst_dt, jcp_.fuse_broadcast);
             store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt);

             if (isa == cpu::x64::sse41) {
                 wrap_load_vector(vmm_dst, 4);
-                reduce_map_kernel(vmm_dst);
+                reduce_map_kernel(vmm_dst, jcp_.dst_dt);
                 if (post_ops_fusing) {
                     if (jcp_.layout != ReduceLayoutType::reduce_ncsp)
                         add(reg_oc_off, 4 * sizeof(float));
@@ -1462,7 +1468,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
             wrap_load_scalar(xmm_dst, 0);

             // reduce
-            reduce_map_kernel_scalar(xmm_dst);
+            reduce_map_kernel_scalar(xmm_dst, jcp_.dst_dt);

             // store
             if (post_ops_fusing)
@@ -1554,22 +1560,30 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
         }
     }

-    inline void reduce_map_kernel(Vmm vmm_dst) {
-        if (jcp_.reduce_mode == Algorithm::ReduceMean)
+    inline void reduce_map_kernel(Vmm vmm_dst, memory::data_type dst_dt) {
+        if (jcp_.reduce_mode == Algorithm::ReduceMean) {
             uni_vdivps(vmm_dst, vmm_dst, vmm_aux);
-        else if (jcp_.reduce_mode == Algorithm::ReduceL2)
+            if (!isFloatCompatible(dst_dt)) {
+                uni_vroundps(vmm_dst, vmm_dst, 3);
+            }
+        } else if (jcp_.reduce_mode == Algorithm::ReduceL2) {
             uni_vsqrtps(vmm_dst, vmm_dst);
-        else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp)
+        } else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) {
             log_injector->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1);
+        }
     }

-    inline void reduce_map_kernel_scalar(Xmm xmm_dst) {
-        if (jcp_.reduce_mode == Algorithm::ReduceMean)
+    inline void reduce_map_kernel_scalar(Xmm xmm_dst, memory::data_type dst_dt) {
+        if (jcp_.reduce_mode == Algorithm::ReduceMean) {
             uni_vdivps(xmm_dst, xmm_dst, xmm_aux);
-        else if (jcp_.reduce_mode == Algorithm::ReduceL2)
+            if (!isFloatCompatible(dst_dt)) {
+                uni_vroundps(xmm_dst, xmm_dst, 3);
+            }
+        } else if (jcp_.reduce_mode == Algorithm::ReduceL2) {
             uni_vsqrtps(xmm_dst, xmm_dst);
-        else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp)
+        } else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) {
             log_injector->compute_vector_range(xmm_dst.getIdx(), xmm_dst.getIdx() + 1);
+        }
     }

     inline void wrap_load_vector(Vmm vmm_val, size_t offset) {
diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/reduce.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/reduce.cpp
index 87b301a5d78f66..1a18dbbb015ede 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/reduce.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/reduce.cpp
@@ -43,6 +43,10 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_NativeInt32Gather_dyn
     {{{{1, 5}, 6, {1, 5}, {1, 10}}, {{1, 6, 4, 3}, {1, 6, 4, 4}}}},
 };

+std::vector<std::vector<ov::test::InputShape>> inputShapes_Rounding_dyn = {
+    {{{{1, 5}, 3, {1, 5}, {1, 5}}, {{1, 3, 3, 1}, {1, 3, 3, 3}}}},
+};
+
 std::vector<std::vector<ov::test::InputShape>> inputShapes_SmallChannel_dyn = {
     {{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}},
 };
@@ -295,6 +299,20 @@ const auto params_NativeInt32Gather = testing::Combine(
     testing::Values(emptyFusingSpec),
     testing::ValuesIn(additionalConfigFP32()));

+const auto params_Rounding = testing::Combine(
+    testing::Combine(
+        testing::Values(axesND()[3]),
+        testing::Values(ov::test::utils::OpType::VECTOR),
+        testing::Values(keepDims()[1]),
+        testing::Values(reductionTypes()[0]),
+        testing::Values(ElementType::i32),
+        testing::Values(ElementType::undefined),
+        testing::Values(ElementType::undefined),
+        testing::ValuesIn(inputShapes_Rounding_dyn)),
+    testing::Values(emptyCPUSpec),
+    testing::Values(emptyFusingSpec),
+    testing::ValuesIn(additionalConfigFP32()));
+
 const auto params_NHWC_SmallChannel = testing::Combine(
     testing::Combine(
         testing::ValuesIn(axesHW),
@@ -386,6 +404,13 @@ INSTANTIATE_TEST_SUITE_P(
     ReduceCPULayerTest::getTestCaseName
 );

+INSTANTIATE_TEST_SUITE_P(
+    smoke_Reduce_Rounding_CPU,
+    ReduceCPULayerTest,
+    params_Rounding,
+    ReduceCPULayerTest::getTestCaseName
+);
+
 INSTANTIATE_TEST_SUITE_P(
     smoke_Reduce_NHWC_SmallChannel_CPU,
     ReduceCPULayerTest,
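
For illustration, here is a minimal standalone sketch of the behavior described under Details. It is not part of the patch: it uses plain SSE4.1 intrinsics (`_mm_div_ps`, `_mm_round_ps`, `_mm_cvtps_epi32`) instead of the plugin's `uni_*` JIT emitters, and the sum/divisor values (7 and 2) are assumed purely for the example. The immediate 3 passed to the round instruction selects round-toward-zero, the same immediate used by the `uni_vroundps(..., 3)` calls added above.

```cpp
// Minimal sketch, assuming SSE4.1 and illustrative inputs (sum = 7, count = 2).
// It mirrors the vdivps + vroundps(imm = 3) sequence with raw intrinsics,
// not the actual JIT code emitted by the CPU plugin.
#include <cstdio>
#include <smmintrin.h>  // SSE4.1: _mm_round_ps

int main() {
    __m128 sum     = _mm_set1_ps(7.0f);   // accumulated ReduceMean sum (example value)
    __m128 divisor = _mm_set1_ps(2.0f);   // number of reduced elements (example value)

    __m128 mean = _mm_div_ps(sum, divisor);  // vdivps -> 3.5f

    // Without the fix: converting straight to i32 rounds to nearest -> 4.
    int nearest = _mm_cvtsi128_si32(_mm_cvtps_epi32(mean));

    // With the fix: vroundps with immediate 3 (round toward zero) first -> 3.0f,
    // so the subsequent conversion yields the truncated integer mean.
    __m128 trunc    = _mm_round_ps(mean, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    int toward_zero = _mm_cvtsi128_si32(_mm_cvtps_epi32(trunc));

    std::printf("round-to-nearest: %d, round-toward-zero: %d\n", nearest, toward_zero);
    return 0;
}
```

Compiled with `-msse4.1`, this prints `round-to-nearest: 4, round-toward-zero: 3`; only the truncated path matches the integer mean that the Details section says other plugins/frameworks produce.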