From 4f7e38ffd5d0d5f9f1d5ab7c3505b548dee5454f Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Fri, 23 Feb 2024 14:35:56 +0400 Subject: [PATCH 1/2] [CPU] Updated aux GPR count for Load/Store Emitters --- .../plugin/x64/jit_load_store_emitters.cpp | 2 +- .../intel_cpu/tests/unit/jit_kernel_test.cpp | 35 ++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp index 283396a0818c00..e25120515988c9 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp @@ -62,7 +62,7 @@ size_t store_emitter_params::hash() const { } static int get_aux_regs_as_temp(const size_t byte_size, const bool is_fill = false) { - if (!one_of(byte_size, 64u, 32u, 16u)) + if (one_of(byte_size % 16, 1u, 2u, 3u)) return 1; if (mayiuse(cpu::x64::avx512_core) && is_fill) return 1; diff --git a/src/plugins/intel_cpu/tests/unit/jit_kernel_test.cpp b/src/plugins/intel_cpu/tests/unit/jit_kernel_test.cpp index 608ca1b86b041f..a4de792a305227 100644 --- a/src/plugins/intel_cpu/tests/unit/jit_kernel_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/jit_kernel_test.cpp @@ -324,13 +324,23 @@ TEST(JitKernel, variable_load_and_store) { { jit_variable_load_store_test_kernel kernel; if (mayiuse(cpu_isa_t::avx512_core)) { - kernel.test<16, 11, false>(); + kernel.test<16, 16, false>(); + kernel.test<16, 15, false>(); + kernel.test<16, 10, false>(); + kernel.test<16, 1, false>(); } if (mayiuse(cpu_isa_t::avx2)) { + kernel.test<8, 8, false>(); + kernel.test<8, 7, false>(); + kernel.test<8, 6, false>(); kernel.test<8, 5, false>(); + kernel.test<8, 4, false>(); } if (mayiuse(cpu_isa_t::sse41)) { + kernel.test<4, 4, false>(); kernel.test<4, 3, false>(); + kernel.test<4, 2, false>(); + kernel.test<4, 1, false>(); } } @@ -360,6 +370,29 @@ 
TEST(JitKernel, variable_load_and_store) { } } + { + jit_variable_load_store_test_kernel kernel; + if (mayiuse(cpu_isa_t::avx512_core)) { + kernel.test<16, 16, true>(); + kernel.test<16, 10, true>(); + kernel.test<16, 2, true>(); + kernel.test<16, 1, true>(); + } + if (mayiuse(cpu_isa_t::avx2)) { + kernel.test<8, 8, true>(); + kernel.test<8, 7, true>(); + kernel.test<8, 6, true>(); + kernel.test<8, 5, true>(); + kernel.test<8, 4, true>(); + } + if (mayiuse(cpu_isa_t::sse41)) { + kernel.test<4, 4, true>(); + kernel.test<4, 3, true>(); + kernel.test<4, 2, true>(); + kernel.test<4, 1, true>(); + } + } + { jit_variable_load_store_test_kernel kernel; if (mayiuse(cpu_isa_t::avx512_core)) { From 186c1e24e1f081a426f3c733009627d416d2bb36 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 26 Feb 2024 10:53:29 +0400 Subject: [PATCH 2/2] [CPU] Applied Chenhu comment --- .../plugin/x64/jit_load_store_emitters.cpp | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp index e25120515988c9..5f662fa4d971c8 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp @@ -61,11 +61,16 @@ size_t store_emitter_params::hash() const { return seed; } -static int get_aux_regs_as_temp(const size_t byte_size, const bool is_fill = false) { - if (one_of(byte_size % 16, 1u, 2u, 3u)) - return 1; +static int get_aux_regs_as_temp(const int elem_count, const int data_size, bool is_pure_move, + const int avx512_threshold_for_mask = 0, const bool is_fill = false) { if (mayiuse(cpu::x64::avx512_core) && is_fill) return 1; + + const int byte_size = elem_count * data_size; + if ((is_pure_move && one_of(byte_size, 16, 32, 64)) || (!is_pure_move && one_of(elem_count, 4, 8, 16))) + return 0; + if 
((mayiuse(cpu::x64::avx512_core) && (byte_size > avx512_threshold_for_mask)) || (one_of(byte_size % 16, 1, 2, 3))) + return 1; return 0; } @@ -84,7 +89,10 @@ size_t jit_load_emitter::get_inputs_num() const { return 1; } size_t jit_load_emitter::aux_gprs_count() const { // 0 for temp reg for mask load in avx512 if needed - int count = get_aux_regs_as_temp(load_num_ * dst_prc_.size(), is_fill_); + const auto is_pure_load = (src_prc_ == dst_prc_) || + (one_of(src_prc_, ov::element::f32, ov::element::i32) && + one_of(dst_prc_, ov::element::f32, ov::element::i32)); + int count = get_aux_regs_as_temp(load_num_, static_cast<int>(src_prc_.size()), is_pure_load, threshold_for_mask_emu_load, is_fill_); // 1 for table address if (is_fill_) @@ -619,7 +627,10 @@ inline bool jit_store_emitter::is_truncation_emulation() const { size_t jit_store_emitter::aux_gprs_count() const { // for temp reg for store(mask version or special number cases) - int count = get_aux_regs_as_temp(store_num_ * src_prc_.size()); + const auto is_pure_store = (src_prc_ == dst_prc_) || + (one_of(src_prc_, ov::element::f32, ov::element::i32) && + one_of(dst_prc_, ov::element::f32, ov::element::i32)); + int count = get_aux_regs_as_temp(store_num_, static_cast<int>(dst_prc_.size()), is_pure_store, threshold_for_mask_emu_store); // for table value in truncation arithmetic mode if (is_truncation_emulation())