From 11db1e187e281e8cd6bf8b7389485c8a79c71afa Mon Sep 17 00:00:00 2001 From: "ayraa.ai" <141430616+geeky33@users.noreply.github.com> Date: Fri, 29 Nov 2024 15:54:24 +0530 Subject: [PATCH] [CPU][ARM] JIT Floor Mod Operation (#27706) ### Details: - Added JIT emitter for Eltwise Floor Mod operation on ARM64 SIMD - Implemented fp32 optimization replacing C++ Math implementation - Modified ARM64 executor to support new JIT emitter - Updated kernel files to include Floor Mod in Eltwise operations - Added test coverage for JIT implementation verification - Transitioned operation type from Math to Eltwise for better performance @a-sidorova can you please review the code ? :) ### Tickets: - #27501 ![image](https://github.com/user-attachments/assets/b7501b8f-1c67-493d-9d18-5175d5de090d) --- .../plugin/aarch64/jit_eltwise_emitters.cpp | 45 +++++++++++++++++++ .../plugin/aarch64/jit_eltwise_emitters.hpp | 21 +++++++++ .../nodes/executors/aarch64/jit_eltwise.cpp | 1 + .../aarch64/jit_uni_eltwise_generic.cpp | 2 + .../single_layer_tests/classes/eltwise.cpp | 3 ++ 5 files changed, 72 insertions(+) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index 83cdd252f9bc6f..4aec56d98873fa 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -516,6 +516,51 @@ std::set> jit_floor_emitter::get_supported_precisions return {{element::f32}}; } +/// FLOOR_MOD /// +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { +} + +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc): jit_emitter(host, host_isa, exec_prc) { +} + +size_t jit_floor_mod_emitter::get_inputs_count() const { return 2; } + +size_t jit_floor_mod_emitter::get_aux_vecs_count() const { return 1; } + +void jit_floor_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + + TReg dividend = TReg(in_vec_idxs[0]); + TReg divisor = TReg(in_vec_idxs[1]); + TReg r = TReg(out_vec_idxs[0]); + TReg aux = TReg(aux_vec_idxs[0]); + + h->fdiv(aux.s, dividend.s, divisor.s); + h->frintm(aux.s, aux.s); + h->fmul(aux.s, aux.s, divisor.s); + h->fsub(r.s, dividend.s, aux.s); +} + +std::set> jit_floor_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + /// CEILING /// //Initialization of the emitter, taking node as input jit_ceiling_emitter::jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index fa4f4141c388e4..2cb7e6928ade3e 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -213,7 +213,28 @@ class jit_floor_emitter : public jit_emitter { template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; }; +class jit_floor_mod_emitter : public jit_emitter { +public: + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + size_t get_aux_vecs_count() const override; + + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; class jit_ceiling_emitter : public jit_emitter { public: // Constructor with explicit precision diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index 660db85cd61529..5f63904fbb9342 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -26,6 +26,7 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwiseEqual, Algorithm::EltwiseExp, Algorithm::EltwiseFloor, + Algorithm::EltwiseFloorMod, Algorithm::EltwiseCeiling, Algorithm::EltwiseGeluErf, Algorithm::EltwiseGeluTanh, diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index 7ac3b603353541..9a1662e2c5dab5 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -648,6 +648,7 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwiseEqual, ov::intel_cpu::aarch64::jit_equal_emitter), OV_CASE(Algorithm::EltwiseExp, ov::intel_cpu::aarch64::jit_exp_emitter), OV_CASE(Algorithm::EltwiseFloor, ov::intel_cpu::aarch64::jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, ov::intel_cpu::aarch64::jit_floor_mod_emitter), OV_CASE(Algorithm::EltwiseCeiling, ov::intel_cpu::aarch64::jit_ceiling_emitter), OV_CASE(Algorithm::EltwiseHswish, ov::intel_cpu::aarch64::jit_hswish_emitter), OV_CASE(Algorithm::EltwiseIsFinite, ov::intel_cpu::aarch64::jit_is_finite_emitter), @@ -830,6 +831,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), OV_CASE(Algorithm::EltwiseGeluErf, jit_gelu_erf_emitter), OV_CASE(Algorithm::EltwiseGeluTanh, jit_gelu_tanh_emitter), diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp index d7cfe80d22f617..1696f35fc1bc4a 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp @@ -259,6 +259,7 @@ std::string EltwiseLayerCPUTest::getPrimitiveType(const utils::EltwiseTypes& elt (eltwise_type == utils::EltwiseTypes::MULTIPLY) || (eltwise_type == utils::EltwiseTypes::SUBTRACT) || (eltwise_type == utils::EltwiseTypes::DIVIDE) || + (eltwise_type == utils::EltwiseTypes::FLOOR_MOD) || (eltwise_type == utils::EltwiseTypes::MOD)) { return "jit"; } @@ -317,6 +318,8 @@ const std::vector& eltwiseOpTypesBinInp() { utils::EltwiseTypes::SUBTRACT, // TODO: Fix CVS-105430 utils::EltwiseTypes::DIVIDE, // TODO: Fix CVS-105430 utils::EltwiseTypes::FLOOR_MOD, // TODO: Fix CVS-111875 +#elif defined(OPENVINO_ARCH_ARM64) + utils::EltwiseTypes::FLOOR_MOD, #endif utils::EltwiseTypes::SQUARED_DIFF, utils::EltwiseTypes::MOD,