Skip to content

Commit

Permalink
[CPU] Round towards zero for ReduceMean (openvinotoolkit#25217)
Browse files Browse the repository at this point in the history
### Details:
- *The `vdivps` instruction rounds to the nearest value. Here we append a
`vroundps` instruction (when the destination type is not float-compatible) so
that the result is rounded towards zero, aligning the behavior with other Plugins/Frameworks.*
 - *Add a test case that reproduces the issue as it occurred before this fix.*

### Tickets:
- *[issue#20815](openvinotoolkit#20815)*
  • Loading branch information
xuchen-intel authored and allnes committed Jun 26, 2024
1 parent 5c07762 commit 8eab81f
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 11 deletions.
36 changes: 25 additions & 11 deletions src/plugins/intel_cpu/src/nodes/reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,14 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
mov(reg_divisor, ptr[reg_params + GET_OFF(divisor)]);
uni_vbroadcastss(vmm_aux, ptr[reg_divisor]);
uni_vdivps(vmm_dst, vmm_dst, vmm_aux);
if (!isFloatCompatible(jcp_.dst_dt)) {
uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero
}
if (isa == cpu::x64::sse41) {
uni_vdivps(vmm_dst_aux, vmm_dst_aux, vmm_aux);
if (!isFloatCompatible(jcp_.dst_dt)) {
uni_vroundps(vmm_dst_aux, vmm_dst_aux, 3);
}
}
}
L(reduce_divide_end_label);
Expand Down Expand Up @@ -1374,14 +1380,14 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
jl(reduce_loop_end_label, T_NEAR);

wrap_load_vector(vmm_dst, 0);
reduce_map_kernel(vmm_dst);
reduce_map_kernel(vmm_dst, jcp_.dst_dt);
if (post_ops_fusing)
apply_post_ops(jcp_.dst_dt, jcp_.fuse_broadcast);
store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt);

if (isa == cpu::x64::sse41) {
wrap_load_vector(vmm_dst, 4);
reduce_map_kernel(vmm_dst);
reduce_map_kernel(vmm_dst, jcp_.dst_dt);
if (post_ops_fusing) {
if (jcp_.layout != ReduceLayoutType::reduce_ncsp)
add(reg_oc_off, 4 * sizeof(float));
Expand Down Expand Up @@ -1462,7 +1468,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
wrap_load_scalar(xmm_dst, 0);

// reduce
reduce_map_kernel_scalar(xmm_dst);
reduce_map_kernel_scalar(xmm_dst, jcp_.dst_dt);

// store
if (post_ops_fusing)
Expand Down Expand Up @@ -1554,22 +1560,30 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
}
}

// Issues the post-reduction instruction(s) for the configured reduce mode, in place on vmm_dst:
//  - ReduceMean: divides by vmm_aux (presumably the broadcast divisor -- confirm against the caller);
//    when dst_dt is not float-compatible, the quotient is additionally passed through
//    vroundps with immediate 3 (round toward zero).
//  - ReduceL2: takes the square root.
//  - ReduceLogSum / ReduceLogSumExp: runs log_injector over register indices
//    [getIdx(), getIdx() + 1), i.e. presumably just this one register.
// Any other reduce mode issues nothing here.
inline void reduce_map_kernel(Vmm vmm_dst) {
if (jcp_.reduce_mode == Algorithm::ReduceMean)
inline void reduce_map_kernel(Vmm vmm_dst, memory::data_type dst_dt) {
if (jcp_.reduce_mode == Algorithm::ReduceMean) {
uni_vdivps(vmm_dst, vmm_dst, vmm_aux);
else if (jcp_.reduce_mode == Algorithm::ReduceL2)
if (!isFloatCompatible(dst_dt)) {
// Immediate 3 selects round-toward-zero, so the value is already truncated before it is stored.
uni_vroundps(vmm_dst, vmm_dst, 3);
}
} else if (jcp_.reduce_mode == Algorithm::ReduceL2) {
uni_vsqrtps(vmm_dst, vmm_dst);
else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp)
} else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) {
log_injector->compute_vector_range(vmm_dst.getIdx(), vmm_dst.getIdx() + 1);
}
}

// Xmm counterpart of reduce_map_kernel: issues the post-reduction instruction(s)
// for the configured reduce mode, in place on xmm_dst:
//  - ReduceMean: divides by xmm_aux (presumably the divisor -- confirm against the caller);
//    when dst_dt is not float-compatible, the quotient is additionally passed through
//    vroundps with immediate 3 (round toward zero).
//  - ReduceL2: takes the square root.
//  - ReduceLogSum / ReduceLogSumExp: runs log_injector over register indices
//    [getIdx(), getIdx() + 1), i.e. presumably just this one register.
// Any other reduce mode issues nothing here.
inline void reduce_map_kernel_scalar(Xmm xmm_dst) {
if (jcp_.reduce_mode == Algorithm::ReduceMean)
inline void reduce_map_kernel_scalar(Xmm xmm_dst, memory::data_type dst_dt) {
if (jcp_.reduce_mode == Algorithm::ReduceMean) {
uni_vdivps(xmm_dst, xmm_dst, xmm_aux);
else if (jcp_.reduce_mode == Algorithm::ReduceL2)
if (!isFloatCompatible(dst_dt)) {
// Immediate 3 selects round-toward-zero, matching the vector path.
uni_vroundps(xmm_dst, xmm_dst, 3);
}
} else if (jcp_.reduce_mode == Algorithm::ReduceL2) {
uni_vsqrtps(xmm_dst, xmm_dst);
else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp)
} else if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) {
log_injector->compute_vector_range(xmm_dst.getIdx(), xmm_dst.getIdx() + 1);
}
}

inline void wrap_load_vector(Vmm vmm_val, size_t offset) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_NativeInt32Gather_dyn
{{{{1, 5}, 6, {1, 5}, {1, 10}}, {{1, 6, 4, 3}, {1, 6, 4, 4}}}},
};

// Input shapes consumed by params_Rounding (the rounding regression test).
// A single 4-D input: presumably the first element is the dynamic shape with
// per-dimension ranges ({1..5}, 3, {1..5}, {1..5}) and the second lists the
// concrete shapes to run ({1,3,3,1} and {1,3,3,3}) -- confirm against ov::test::InputShape.
std::vector<std::vector<ov::test::InputShape>> inputShapes_Rounding_dyn = {
{{{{1, 5}, 3, {1, 5}, {1, 5}}, {{1, 3, 3, 1}, {1, 3, 3, 3}}}},
};

std::vector<std::vector<ov::test::InputShape>> inputShapes_SmallChannel_dyn = {
{{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}},
};
Expand Down Expand Up @@ -295,6 +299,20 @@ const auto params_NativeInt32Gather = testing::Combine(
testing::Values(emptyFusingSpec),
testing::ValuesIn(additionalConfigFP32()));

// Parameter set for the smoke_Reduce_Rounding_CPU instantiation.
// Every field is pinned to a single value, so the only dimension that can
// multiply test cases is additionalConfigFP32(). The inner Combine uses
// ElementType::i32 together with inputShapes_Rounding_dyn.
// NOTE(review): axesND()[3], keepDims()[1] and reductionTypes()[0] index into
// helpers defined outside this view; presumably reductionTypes()[0] is the
// Mean reduction this test targets -- confirm.
const auto params_Rounding = testing::Combine(
testing::Combine(
testing::Values(axesND()[3]),
testing::Values(ov::test::utils::OpType::VECTOR),
testing::Values(keepDims()[1]),
testing::Values(reductionTypes()[0]),
testing::Values(ElementType::i32),
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_Rounding_dyn)),
testing::Values(emptyCPUSpec),
testing::Values(emptyFusingSpec),
testing::ValuesIn(additionalConfigFP32()));

const auto params_NHWC_SmallChannel = testing::Combine(
testing::Combine(
testing::ValuesIn(axesHW),
Expand Down Expand Up @@ -386,6 +404,13 @@ INSTANTIATE_TEST_SUITE_P(
ReduceCPULayerTest::getTestCaseName
);

// Instantiates ReduceCPULayerTest with params_Rounding under the
// smoke_Reduce_Rounding_CPU prefix; test names come from getTestCaseName.
INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_Rounding_CPU,
ReduceCPULayerTest,
params_Rounding,
ReduceCPULayerTest::getTestCaseName
);

INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_NHWC_SmallChannel_CPU,
ReduceCPULayerTest,
Expand Down

0 comments on commit 8eab81f

Please sign in to comment.