From 8eab81f3e510c2d941cbca0c25fbfd0defc4afbb Mon Sep 17 00:00:00 2001
From: Chen Xu
Date: Wed, 26 Jun 2024 14:47:20 +0800
Subject: [PATCH] [CPU] Round towards zero for ReduceMean (#25217)

### Details:
 - *The vdivps instruction rounds to the nearest value. Here we append a vroundps instruction so that the result is rounded towards zero, which aligns the behavior with other plugins/frameworks (see the illustrative sketch after the diff).*
 - *Add a test case that reproduces the issue (it fails without the fix above).*

### Tickets:
 - *[issue#20815](https://github.com/openvinotoolkit/openvino/issues/20815)*
---
 src/plugins/intel_cpu/src/nodes/reduce.cpp    | 36 +++++++++++++------
 .../instances/x64/reduce.cpp                  | 25 +++++++++++++
 2 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp
index 814b1a0a6eeab9..63675b364d4425 100644
--- a/src/plugins/intel_cpu/src/nodes/reduce.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -319,8 +319,14 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
             mov(reg_divisor, ptr[reg_params + GET_OFF(divisor)]);
             uni_vbroadcastss(vmm_aux, ptr[reg_divisor]);
             uni_vdivps(vmm_dst, vmm_dst, vmm_aux);
+            if (!isFloatCompatible(jcp_.dst_dt)) {
+                uni_vroundps(vmm_dst, vmm_dst, 3);  // rounding to zero
+            }
             if (isa == cpu::x64::sse41) {
                 uni_vdivps(vmm_dst_aux, vmm_dst_aux, vmm_aux);
+                if (!isFloatCompatible(jcp_.dst_dt)) {
+                    uni_vroundps(vmm_dst_aux, vmm_dst_aux, 3);
+                }
             }
         }
         L(reduce_divide_end_label);
@@ -1374,14 +1380,14 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
             jl(reduce_loop_end_label, T_NEAR);

             wrap_load_vector(vmm_dst, 0);
-            reduce_map_kernel(vmm_dst);
+            reduce_map_kernel(vmm_dst, jcp_.dst_dt);
             if (post_ops_fusing)
                 apply_post_ops(jcp_.dst_dt, jcp_.fuse_broadcast);
             store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt);

             if (isa == cpu::x64::sse41) {
                 wrap_load_vector(vmm_dst, 4);
-                reduce_map_kernel(vmm_dst);
+                reduce_map_kernel(vmm_dst, jcp_.dst_dt);
                 if (post_ops_fusing) {
                     if (jcp_.layout != ReduceLayoutType::reduce_ncsp)
                         add(reg_oc_off, 4 * sizeof(float));
@@ -1462,7 +1468,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
             wrap_load_scalar(xmm_dst, 0);

             // reduce
-            reduce_map_kernel_scalar(xmm_dst);
+            reduce_map_kernel_scalar(xmm_dst, jcp_.dst_dt);

             // store
             if (post_ops_fusing)
@@ -1554,22 +1560,30 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
         }
     }

-    inline void reduce_map_kernel(Vmm vmm_dst) {
-        if (jcp_.reduce_mode == Algorithm::ReduceMean)
+    inline void reduce_map_kernel(Vmm vmm_dst, memory::data_type dst_dt) {
+        if (jcp_.reduce_mode == Algorithm::ReduceMean) {
             uni_vdivps(vmm_dst, vmm_dst, vmm_aux);
-        else if (jcp_.reduce_mode == Algorithm::ReduceL2)
+            if (!isFloatCompatible(dst_dt)) {
+                uni_vroundps(vmm_dst, vmm_dst, 3);
+            }
+        } else if (jcp_.reduce_mode == Algorithm::ReduceL2) {
             uni_vsqrtps(vmm_dst, vmm_dst);
-        else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp)
+        } else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) {
             log_injector->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1);
+        }
     }

-    inline void reduce_map_kernel_scalar(Xmm xmm_dst) {
-        if (jcp_.reduce_mode == Algorithm::ReduceMean)
+    inline void reduce_map_kernel_scalar(Xmm xmm_dst, memory::data_type dst_dt) {
+        if (jcp_.reduce_mode == Algorithm::ReduceMean) {
             uni_vdivps(xmm_dst, xmm_dst, xmm_aux);
-        else if (jcp_.reduce_mode == Algorithm::ReduceL2)
+            if (!isFloatCompatible(dst_dt)) {
+                uni_vroundps(xmm_dst, xmm_dst, 3);
+            }
+        } else if (jcp_.reduce_mode == Algorithm::ReduceL2) {
             uni_vsqrtps(xmm_dst, xmm_dst);
-        else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp)
+        } else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) {
             log_injector->compute_vector_range(xmm_dst.getIdx(), xmm_dst.getIdx() + 1);
+        }
     }

     inline void wrap_load_vector(Vmm vmm_val, size_t offset) {
diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/reduce.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/reduce.cpp
index 87b301a5d78f66..1a18dbbb015ede 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/reduce.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/reduce.cpp
@@ -43,6 +43,10 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_NativeInt32Gather_dyn
     {{{{1, 5}, 6, {1, 5}, {1, 10}}, {{1, 6, 4, 3}, {1, 6, 4, 4}}}},
 };

+std::vector<std::vector<ov::test::InputShape>> inputShapes_Rounding_dyn = {
+    {{{{1, 5}, 3, {1, 5}, {1, 5}}, {{1, 3, 3, 1}, {1, 3, 3, 3}}}},
+};
+
 std::vector<std::vector<ov::test::InputShape>> inputShapes_SmallChannel_dyn = {
     {{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}},
 };
@@ -295,6 +299,20 @@ const auto params_NativeInt32Gather = testing::Combine(
     testing::Values(emptyFusingSpec),
     testing::ValuesIn(additionalConfigFP32()));

+const auto params_Rounding = testing::Combine(
+    testing::Combine(
+        testing::Values(axesND()[3]),
+        testing::Values(ov::test::utils::OpType::VECTOR),
+        testing::Values(keepDims()[1]),
+        testing::Values(reductionTypes()[0]),
+        testing::Values(ElementType::i32),
+        testing::Values(ElementType::undefined),
+        testing::Values(ElementType::undefined),
+        testing::ValuesIn(inputShapes_Rounding_dyn)),
+    testing::Values(emptyCPUSpec),
+    testing::Values(emptyFusingSpec),
+    testing::ValuesIn(additionalConfigFP32()));
+
 const auto params_NHWC_SmallChannel = testing::Combine(
     testing::Combine(
         testing::ValuesIn(axesHW),
@@ -386,6 +404,13 @@ INSTANTIATE_TEST_SUITE_P(
     ReduceCPULayerTest::getTestCaseName
 );

+INSTANTIATE_TEST_SUITE_P(
+    smoke_Reduce_Rounding_CPU,
+    ReduceCPULayerTest,
+    params_Rounding,
+    ReduceCPULayerTest::getTestCaseName
+);
+
 INSTANTIATE_TEST_SUITE_P(
     smoke_Reduce_NHWC_SmallChannel_CPU,
     ReduceCPULayerTest,
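
For illustration, here is a minimal standalone sketch of the behavior described under Details. It is not part of the patch: it uses plain SSE4.1 intrinsics (`_mm_div_ps`, `_mm_round_ps`, `_mm_cvtps_epi32`) instead of the plugin's `uni_*` JIT emitters, and the sum/divisor values (7 and 2) are assumed purely for the example. The immediate 3 passed to the round instruction selects round-toward-zero, the same immediate used by the `uni_vroundps(..., 3)` calls added above.

```cpp
// Minimal sketch, assuming SSE4.1 and illustrative inputs (sum = 7, count = 2).
// It mirrors the vdivps + vroundps(imm = 3) sequence with raw intrinsics,
// not the actual JIT code emitted by the CPU plugin.
#include <cstdio>
#include <smmintrin.h>  // SSE4.1: _mm_round_ps

int main() {
    __m128 sum     = _mm_set1_ps(7.0f);   // accumulated ReduceMean sum (example value)
    __m128 divisor = _mm_set1_ps(2.0f);   // number of reduced elements (example value)

    __m128 mean = _mm_div_ps(sum, divisor);  // vdivps -> 3.5f

    // Without the fix: converting straight to i32 rounds to nearest -> 4.
    int nearest = _mm_cvtsi128_si32(_mm_cvtps_epi32(mean));

    // With the fix: vroundps with immediate 3 (round toward zero) first -> 3.0f,
    // so the subsequent conversion yields the truncated integer mean.
    __m128 trunc    = _mm_round_ps(mean, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    int toward_zero = _mm_cvtsi128_si32(_mm_cvtps_epi32(trunc));

    std::printf("round-to-nearest: %d, round-toward-zero: %d\n", nearest, toward_zero);
    return 0;
}
```

Compiled with `-msse4.1`, this prints `round-to-nearest: 4, round-toward-zero: 3`; only the truncated path matches the integer mean that the Details section says other plugins/frameworks produce.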