diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt
index 72784b793277c8..7da1110f953df4 100644
--- a/src/plugins/intel_cpu/CMakeLists.txt
+++ b/src/plugins/intel_cpu/CMakeLists.txt
@@ -87,7 +87,10 @@ if(NOT X86_64)
 endif()
 
 if(NOT (AARCH64 OR ARM))
-    list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*)
+    list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*
+                              ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/aarch64/*
+                              ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/aarch64/*
+                              ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/aarch64/*)
 endif()
 
 if (NOT ENABLE_MLAS_FOR_CPU)
diff --git a/src/plugins/intel_cpu/src/emitters/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/aarch64/jit_eltwise_emitters.cpp
new file mode 100644
index 00000000000000..d22deceefc86fc
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/aarch64/jit_eltwise_emitters.cpp
@@ -0,0 +1,464 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "jit_eltwise_emitters.hpp"
+
+#include <memory>  // NOTE(review): the header name was lost in extraction; <memory> is assumed (std::shared_ptr is used below) - confirm
+#include "ie_ngraph_utils.hpp"
+#include "common/utils.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace aarch64 {
+
+using namespace InferenceEngine;
+using namespace dnnl::impl::utils;
+using namespace dnnl::impl::cpu;
+using namespace Xbyak_aarch64;
+// File-local helper: converts the element type feeding each input of `n` to an InferenceEngine::Precision and returns the first one.
+namespace {
+InferenceEngine::Precision get_arithmetic_binary_exec_precision(const std::shared_ptr<ov::Node>& n) {
+    std::vector<InferenceEngine::Precision> input_precisions;
+    for (const auto& input : n->inputs()) {
+        input_precisions.push_back(
+            InferenceEngine::details::convertPrecision(input.get_source_output().get_element_type()));
+    }
+    // Debug-only check that all inputs share one precision; it disappears when assert is compiled out.
+    assert(std::all_of(
+        input_precisions.begin(),
+        input_precisions.end(),
+        [&input_precisions](const InferenceEngine::Precision& precision) {return precision == input_precisions[0]; }));
+    // NOTE(review): element 0 is read unconditionally, so the node is assumed to have at least one input - confirm with callers.
+    return input_precisions[0];
+}
+} // namespace
+
+/// ADD ///
+// NOTE(review): template arguments (<ov::Node>, <size_t>, <isa>, <element::Type>) were stripped from this listing and are restored here - confirm against the repository.
+// Node-based constructor: the execution precision is derived from the node via get_arithmetic_binary_exec_precision.
+jit_add_emitter::jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
+                                 dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                                 const std::shared_ptr<ov::Node>& node,
+                                 const float alpha)
+    : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node), alpha) {}
+// Precision-based constructor: the caller passes the execution precision directly.
+jit_add_emitter::jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
+                                 dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                                 const Precision exec_prc,
+                                 const float alpha) : jit_emitter(host, host_isa, exec_prc, alpha) {}
+// Add takes two input operands.
+size_t jit_add_emitter::get_inputs_count() const { return 2; }
+// Selects the ISA-specific implementation: only ASIMD is handled, any other host ISA throws.
+void jit_add_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) {
+        emit_isa<dnnl::impl::cpu::aarch64::asimd>(in_vec_idxs, out_vec_idxs);
+    } else {
+        IE_THROW() << "Can't create jit eltwise kernel";
+    }
+}
+// Emits uni_fadd from the two input registers into the output register, on the .h view for FP16 or the .s view for FP32; throws for any other precision.
+template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
+void jit_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    if ((exec_prc_ != Precision::FP16) && (exec_prc_ != Precision::FP32)) {
+        IE_THROW() << "unsupported precision: " << exec_prc_;
+    }
+    // The register type comes from the ISA traits; the index vectors pick the concrete registers.
+    using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
+    TReg src0 = TReg(in_vec_idxs[0]);
+    TReg src1 = TReg(in_vec_idxs[1]);
+    TReg dst = TReg(out_vec_idxs[0]);
+    // exec_prc_ was validated above, so the default branch only guards against future edits.
+    switch (exec_prc_) {
+        case Precision::FP16: {
+            h->uni_fadd(dst.h, src0.h, src1.h);
+            break;
+        }
+        case Precision::FP32: {
+            h->uni_fadd(dst.s, src0.s, src1.s);
+            break;
+        }
+        default: {
+            assert(!"unsupported precision");
+        }
+    }
+}
+// Reports {f32, f32} as the only supported input precision combination; `node` is not consulted.
+std::set<std::vector<element::Type>> jit_add_emitter::get_supported_precisions(const std::shared_ptr<ov::Node>& node) {
+    return {{element::f32, element::f32}};
+}
+
+/// MUL_ADD ///
+// Node-based constructor: the execution precision is derived from the node via get_arithmetic_binary_exec_precision.
+jit_mul_add_emitter::jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
+                                         dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                                         const std::shared_ptr<ov::Node>& node,
+                                         const float alpha)
+    : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node), alpha) {}
+
+jit_mul_add_emitter::jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const Precision exec_prc, + const float alpha) + : jit_emitter(host, host_isa, exec_prc, alpha) { +} + +size_t jit_mul_add_emitter::get_inputs_count() const { return 3; } + +void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + IE_THROW() << "Can't create jit eltwise kernel"; + } +} + +template +void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if ((exec_prc_ != Precision::FP16) && (exec_prc_ != Precision::FP32)) { + IE_THROW() << "unsupported precision: " << exec_prc_; + } + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + TReg src0 = TReg(in_vec_idxs[0]); + TReg src1 = TReg(in_vec_idxs[1]); + TReg src2 = TReg(in_vec_idxs[2]); + TReg dst = TReg(out_vec_idxs[0]); + + // uni_fmad implementation + switch (exec_prc_) { + case Precision::FP16: { + h->fmul(dst.h, src0.h, src1.h); + h->fadd(dst.h, dst.h, src2.h); + break; + } + case Precision::FP32: { + h->fmul(dst.s, src0.s, src1.s); + h->fadd(dst.s, dst.s, src2.s); + break; + } + default: { + assert(!"unsupported precision"); + } + } +} + +std::set> jit_mul_add_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32, element::f32}}; +} + +/// MULTIPLY /// +jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node, + const float alpha) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node), alpha) {} + +jit_multiply_emitter::jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const Precision 
exec_prc, + const float alpha) + : jit_emitter(host, host_isa, exec_prc, alpha) {} + +size_t jit_multiply_emitter::get_inputs_count() const { return 2; } + +void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + IE_THROW() << "Can't create jit eltwise kernel"; + } +} + +template +void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if ((exec_prc_ != Precision::FP16) && (exec_prc_ != Precision::FP32)) { + IE_THROW() << "unsupported precision: " << exec_prc_; + } + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + TReg src0 = TReg(in_vec_idxs[0]); + TReg src1 = TReg(in_vec_idxs[1]); + TReg dst = TReg(out_vec_idxs[0]); + + switch (exec_prc_) { + case Precision::FP16: { + h->uni_fmul(dst.h, src0.h, src1.h); + break; + } + case Precision::FP32: { + h->uni_fmul(dst.s, src0.s, src1.s); + break; + } + default: { + assert(!"unsupported precision"); + } + } +} + +std::set> jit_multiply_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + +/// POWER /// +jit_power_emitter::jit_power_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const float power, + const float scale, + const float shift, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)), power(power), scale(scale), shift(shift) { + auto powerStaticNode = ov::as_type_ptr(node); + if (powerStaticNode == nullptr) { + IE_THROW() << "Can't cast to snippets::op::PowerStatic"; + } + + prepare_table(); +} + +jit_power_emitter::jit_power_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const float power, + const float scale, + const float shift, + const Precision 
exec_prc) + : jit_emitter(host, host_isa, exec_prc), power(power), scale(scale), shift(shift) { + prepare_table(); +} + +size_t jit_power_emitter::get_inputs_count() const { return 1; } + +size_t jit_power_emitter::get_aux_vecs_count() const { return 2; } + +size_t jit_power_emitter::get_aux_gprs_count() const { return 1; } + +void jit_power_emitter::register_table_entries() { + push_arg_entry_of("power", dnnl::impl::float2int(power), true); + push_arg_entry_of("scale", dnnl::impl::float2int(scale), true); + push_arg_entry_of("shift", dnnl::impl::float2int(shift), true); + // push_arg_entry_of("one", float2int(1.f), true); +} + +std::set> jit_power_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + +void jit_power_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + IE_THROW() << "Can't create jit eltwise kernel"; + } +} + +namespace { +extern "C" float pow_f32(float v1, float v2); +float pow_f32(float v1, float v2) { + return pow(v1, v2); +} +} // namespace + +template +void jit_power_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if ((exec_prc_ != Precision::FP16) && (exec_prc_ != Precision::FP32)) { + IE_THROW() << "unsupported precision: " << exec_prc_; + } + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + TReg src = TReg(in_vec_idxs[0]); + TReg dst = TReg(out_vec_idxs[0]); + TReg aux = TReg(aux_vec_idxs[0]); + + + std::cout << "power=" << power << ", scale=" << scale << ", shift=" << shift << std::endl; + + if (scale != 1.f) { + auto adr = table_val2("scale"); + h->ld1r(aux.s, adr); + //h->fmov(aux.s, -1.); + h->fmul(src.s, src.s, aux.s); + } + + if (shift != 0.f) { + auto adr = table_val2("shift"); + h->ld1r(aux.s, adr); + h->fadd(src.s, src.s, aux.s); + } + + if (power == 0.f) { + 
h->fmov(dst.s, 1.); + return; + } + + if (power == 1.f) { + if (src.getIdx() != dst.getIdx()) { + h->uni_orr(dst, src, src); + } + return; + } + + if (std::floor(power) == power && power > 0) { + h->fmov(dst.s, 1.); + + auto current_power = static_cast(power); + while (current_power > 0) { + if (current_power & 1) { + h->fmul(dst.s, dst.s, src.s); + } + if (current_power > 1) { + h->fmul(src.s, src.s, src.s); + } + current_power = current_power >> 1; + } + } else { + auto pow_f32_addr = reinterpret_cast(pow_f32); + + // TODO: debug: hardcode + Xbyak_aarch64::XReg func_reg(15); + h->mov(func_reg, pow_f32_addr); + + Xbyak_aarch64::SReg s0(0); + Xbyak_aarch64::SReg s1(1); + + for (auto i = 0; i < 4; i++) { + h->mov(s0, src.s[i]); + + // TODO: debug: only + //const float power2 = 1.f; + //h->fmov(s1, power2); + h->ldr(s1, table_val("power")); + + // X29: The register x29 represents the base pointer (also known as the frame pointer or FP) + // X30: In A64 systems, the return address is stored in register x30 (also known as LR) + + h->stp(h->x29, h->x30, pre_ptr(h->sp, -16)); + //h->sub(h->sp, h->sp, 16); + // h->stp(h->x0, h->x1, pre_ptr(h->sp, -16)); + // //h->sub(h->sp, h->sp, 16); + // h->stp(h->x9, h->x10, pre_ptr(h->sp, -16)); + // //h->sub(h->sp, h->sp, 16); + + static constexpr Xbyak_aarch64::Operand::Code save_gpr_regs[] = { + Xbyak_aarch64::Operand::X0, Xbyak_aarch64::Operand::X1, + Xbyak_aarch64::Operand::X2, Xbyak_aarch64::Operand::X3, + Xbyak_aarch64::Operand::X4, Xbyak_aarch64::Operand::X5, + Xbyak_aarch64::Operand::X6, Xbyak_aarch64::Operand::X7, + Xbyak_aarch64::Operand::X8, Xbyak_aarch64::Operand::X9, // 9 + Xbyak_aarch64::Operand::X10, Xbyak_aarch64::Operand::X11, + Xbyak_aarch64::Operand::X12, Xbyak_aarch64::Operand::X13, + Xbyak_aarch64::Operand::X14, Xbyak_aarch64::Operand::X15, + Xbyak_aarch64::Operand::X16, Xbyak_aarch64::Operand::X17, + Xbyak_aarch64::Operand::X18, Xbyak_aarch64::Operand::X19, + Xbyak_aarch64::Operand::X20, 
Xbyak_aarch64::Operand::X21, + Xbyak_aarch64::Operand::X22, Xbyak_aarch64::Operand::X23, + Xbyak_aarch64::Operand::X24, Xbyak_aarch64::Operand::X25, + Xbyak_aarch64::Operand::X26, Xbyak_aarch64::Operand::X27, + Xbyak_aarch64::Operand::X28, Xbyak_aarch64::Operand::X29, // 29 + }; + + + static constexpr size_t save_gpr_regs_size = sizeof(save_gpr_regs) / sizeof(save_gpr_regs[0]); + const int32_t xreg_len = 8; + //const size_t preserved_stack_size = xreg_len * (2 + save_gpr_regs_size); + + //h->sub(h->sp, h->sp, static_cast(preserved_stack_size) - 16); + //h->mov(h->x9, h->sp); + for (size_t i = 0; i < save_gpr_regs_size; i += 2) { + h->stp( + Xbyak_aarch64::XReg(save_gpr_regs[i]), + Xbyak_aarch64::XReg(save_gpr_regs[i + 1]), + pre_ptr(h->sp, -xreg_len * 2)); + } + + h->blr(func_reg); + + // //h->add(h->sp, h->sp, 16); + // h->ldp(h->x9, h->x10, post_ptr(h->sp, 16)); + // //h->add(h->sp, h->sp, 16); + // h->ldp(h->x0, h->x1, post_ptr(h->sp, 16)); + + //h->mov(h->x9, h->sp); + for (size_t i = 0; i < save_gpr_regs_size; i += 2) { + h->ldp( + Xbyak_aarch64::XReg(save_gpr_regs[save_gpr_regs_size - 1 - (i + 1)]), + Xbyak_aarch64::XReg(save_gpr_regs[save_gpr_regs_size - 1 - i]), + post_ptr(h->sp, xreg_len * 2)); + } + + //h->add(h->sp, h->sp, 16); + h->ldp(h->x29, h->x30, post_ptr(h->sp, 16)); + + Xbyak_aarch64::WReg w0(0); + h->fmov(w0, s0); + h->mov(dst.s[i], w0); + } + } +} + +/// RELU /// +jit_relu_emitter::jit_relu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node, + const float alpha) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node), alpha) { +} + +jit_relu_emitter::jit_relu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const Precision exec_prc, + const float alpha) + : jit_emitter(host, host_isa, exec_prc, alpha) { +} + +size_t jit_relu_emitter::get_inputs_count() const { return 1; } + +size_t 
jit_relu_emitter::get_aux_vecs_count() const { return 1; } + +std::set> jit_relu_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32}}; +} + +void jit_relu_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + IE_THROW() << "Can't create jit eltwise kernel"; + } +} + +template +void jit_relu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if ((exec_prc_ != Precision::FP16) && (exec_prc_ != Precision::FP32)) { + IE_THROW() << "unsupported precision: " << exec_prc_; + } + + if (alpha != 0.f) { + IE_THROW() << "not zero alpha is not supported"; + } + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + + TReg tmp = TReg(aux_vec_idxs[0]); + TReg src = TReg(in_vec_idxs[0]); + TReg dst = TReg(out_vec_idxs[0]); + + switch (exec_prc_) { + case Precision::FP16: { + h->movi(tmp.h, 0); + h->fmaxnm(dst.h, src.h, tmp.h); + break; + } + case Precision::FP32: { + h->movi(tmp.s, 0); + h->fmaxnm(dst.s, src.s, tmp.s); + break; + } + default: { + assert(!"unsupported precision"); + } + } +} + +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/aarch64/jit_eltwise_emitters.hpp new file mode 100644 index 00000000000000..12df3ffcfbe09b --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/aarch64/jit_eltwise_emitters.hpp @@ -0,0 +1,146 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "jit_emitter.hpp" + +namespace ov { +namespace intel_cpu { +namespace aarch64 { + +class jit_add_emitter : public jit_emitter { +public: + jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const 
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, + const float alpha = 0.f); + + jit_add_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node, + const float alpha = 0.f); + + size_t get_inputs_count() const override; + + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + + +class jit_mul_add_emitter : public jit_emitter { +public: + jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, + const float alpha = 0.f); + + jit_mul_add_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node, + const float alpha = 0.f); + + size_t get_inputs_count() const override; + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + + +class jit_multiply_emitter : public jit_emitter { +public: + jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, + const float alpha = 0.f); + + jit_multiply_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node, + const float alpha = 0.f); + + size_t get_inputs_count() const override; + static std::set> get_supported_precisions(const 
std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + +// TODO: jit_power_emitter => jit_power_static_emitter +class jit_power_emitter : public jit_emitter { +public: + jit_power_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const float power, + const float scale, + const float shift, + const InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + + jit_power_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const float power, + const float scale, + const float shift, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + size_t get_aux_vecs_count() const override; + + size_t get_aux_gprs_count() const override; + + void register_table_entries() override; + + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + +private: + const float power; + const float scale; + const float shift; + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + +class jit_relu_emitter : public jit_emitter { +public: + jit_relu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, + const float alpha = 0.f); + + jit_relu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node, + const float alpha = 0.f); + + size_t get_inputs_count() const override; + + size_t get_aux_vecs_count() const override; + + static std::set> get_supported_precisions(const 
std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/aarch64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/aarch64/jit_emitter.cpp new file mode 100644 index 00000000000000..d5b48a8966197c --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/aarch64/jit_emitter.cpp @@ -0,0 +1,116 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_emitter.hpp" +#include +#include "utils/general_utils.h" + +using namespace dnnl::impl::cpu; +using namespace dnnl::impl; + +namespace ov { +namespace intel_cpu { +namespace aarch64 { + +void jit_emitter::emit_code(const std::vector &in_idxs, + const std::vector &out_idxs, + const std::vector &pool_vec_idxs, + const std::vector &pool_gpr_idxs) const { + emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); + + emit_impl(in_idxs, out_idxs); + + emitter_postamble(); +} + +void jit_emitter::emit_data() const { + h->align(64); + h->L(*l_table.get()); + + // Assumption: entries can be inserted with dd, so they should be 4 bytes. + assert(sizeof(table_entry_val_t) == 4); + + // Run through the map and insert values stored there + for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { + const auto &te = (*it).second; // get map entry for a given key + const auto len = te.bcast ? 
get_vec_length() : sizeof(table_entry_val_t); + for (size_t d = 0; d < len; d += sizeof(table_entry_val_t)) + h->dd(te.val); + } +} + +std::set> jit_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {}; +} + +size_t jit_emitter::get_aux_gprs_count() const { + return 0; +} + +size_t jit_emitter::get_max_vecs_count() const { + return 32; +} + +size_t jit_emitter::get_vec_length() const { + return 16; +} + +size_t jit_emitter::get_aux_vecs_count() const { + return 0; +} + +void jit_emitter::prepare_table() { + register_table_entries(); + + // Now that we registered the entries, we set the offsets. No + // entries should be registered after this point. This allows to + // expect the same order when injecting the table entries in + // prepare_table. + size_t off = 0; + for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { + auto &te = (*it).second; + te.off = off; + off += te.bcast ? get_vec_length() : sizeof(table_entry_val_t); + } +} + +void jit_emitter::emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_aux_vec_idxs, + const std::vector& pool_aux_gpr_idxs) const { + if (pool_aux_vec_idxs.size() < get_aux_vecs_count()) { + IE_THROW() << "Failed to allocate required number of vector registers"; + } + + if (pool_aux_gpr_idxs.size() < get_aux_gprs_count()) { + IE_THROW() << "Failed to allocate required number of gpr registers"; + } + + for (auto idx : pool_aux_vec_idxs) { + aux_vec_idxs.push_back(static_cast(idx)); + } + + for (auto idx : pool_aux_gpr_idxs) { + aux_gpr_idxs.push_back(static_cast(idx)); + } + + if (!entry_map_.empty()) { + // last aux_gpr_idx is for p_table, we can use aux_gpr_idxs from idx 0 for other purpose + //p_table = Xbyak_aarch64::XReg(aux_gpr_idxs[aux_gpr_idxs.size() - 1]); + // TODO: debug: hardcode + p_table = Xbyak_aarch64::XReg(26); + aux_gpr_idxs.erase(aux_gpr_idxs.end() - 1); + } + + if (!entry_map_.empty()) { + load_table_addr(); + } +} + +void 
jit_emitter::emitter_postamble() const { +} + +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/aarch64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/aarch64/jit_emitter.hpp new file mode 100644 index 00000000000000..c227196c2052e9 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/aarch64/jit_emitter.hpp @@ -0,0 +1,175 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include +#include + +#include "snippets/snippets_isa.hpp" +#include "snippets/generator.hpp" +#include "node.h" + + +namespace ov { +namespace intel_cpu { +namespace aarch64 { + +enum emitter_in_out_map { + vec_to_vec, + vec_to_gpr, + gpr_to_vec, + gpr_to_gpr, +}; + +// structure for storage of emitter parameters to hash in map +struct emitter_params { + virtual size_t hash() const = 0; +}; + +class jit_emitter : public ov::snippets::Emitter { +public: + jit_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, + const float alpha = 0.f, + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) : + Emitter(), h(host), host_isa_(host_isa), exec_prc_(exec_prc), + alpha(alpha), in_out_type_(in_out_type), p_table(0), l_table (new Xbyak_aarch64::Label()) { + } + + jit_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, + const float alpha = 0.f, + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) : + Emitter(), h(host), host_isa_(host_isa), exec_prc_(exec_prc), + alpha(alpha), in_out_type_(in_out_type), p_table(0), l_table (new Xbyak_aarch64::Label()) { + } + + void emit_code( + const std::vector &in_idxs, + const std::vector &out_idxs, + const std::vector 
&pool_vec_idxs = {}, + const std::vector &pool_gpr_idxs = {}) const override; + + void emit_data() const override; + + virtual size_t get_inputs_count() const = 0; + virtual size_t get_aux_vecs_count() const; + virtual size_t get_aux_gprs_count() const; + + /** + * @brief Returns supported precisions. + * Precisions are ordered, the first bigger bitness precision with the same type will be selected. + * Empty collection means the emitter supports any input precisions. + */ + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + +protected: + size_t get_max_vecs_count() const; + size_t get_vec_length() const; + + mutable std::vector aux_vec_idxs; + mutable std::vector aux_gpr_idxs; + + dnnl::impl::cpu::aarch64::jit_generator* h; + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa_; + InferenceEngine::Precision exec_prc_; + const float alpha; + + emitter_in_out_map in_out_type_; + + virtual void prepare_table(); + virtual void register_table_entries() {} + + void load_table_addr() const { h->adr(p_table, *l_table.get()); } + + // we accept only 32bit hexadecimal table values to avoid any rounding + using table_entry_val_t = uint32_t; + using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table + using table_entry_bcast_t = bool; // true => bcast value + + struct table_entry_t { + table_entry_val_t val; + table_entry_bcast_t bcast; + }; + struct mapped_table_entry_t { + table_entry_offset_t off; + table_entry_val_t val; + table_entry_bcast_t bcast; + }; + + mutable Xbyak_aarch64::XReg p_table; + mutable std::shared_ptr l_table; + + virtual void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const = 0; + + virtual void emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_aux_vec_idxs, + const std::vector& pool_aux_gpr_idxs) const; + + virtual void emitter_postamble() const; + + // XReg table_val(std::string key, size_t key_off_val_shift = 0) const { + // auto off 
= table_off(key, key_off_val_shift); + // return h->ptr[p_table + off]; + // } + + using table_t = std::multimap; + using mapped_table_t = std::multimap; + + mapped_table_t entry_map_; + + Xbyak_aarch64::AdrImm table_val(std::string key, size_t key_off_val_shift = 0) const { + //auto off = table_off(key, key_off_val_shift); + int32_t off = table_off(key, key_off_val_shift); + return Xbyak_aarch64::ptr(p_table, off); + } + + Xbyak_aarch64::AdrNoOfs table_val2(std::string key, size_t key_off_val_shift = 0) const { + //auto off = table_off(key, key_off_val_shift); + int32_t off = table_off(key, key_off_val_shift); + + h->add_imm(h->X_DEFAULT_ADDR, p_table, off, h->X_TMP_0); + return Xbyak_aarch64::ptr(h->X_DEFAULT_ADDR); + } + + void push_arg_entry_of(const std::string key, const table_entry_val_t val, const bool broadcast) { + mapped_table_entry_t te {0, val, broadcast}; + entry_map_.insert(std::make_pair(key, te)); + } + + void push_entries_of(const table_t &t) { + for (auto it = t.begin(); it != t.end(); it++) { + auto key = (*it).first; + auto te = (*it).second; // copy values from table + push_arg_entry_of(key, te.val, te.bcast); + } + } + +private: + mutable std::vector preserved_vec_idxs; + mutable std::vector preserved_gpr_idxs; + + size_t table_off(std::string& key, size_t key_off_val_shift = 0) const { + // assumption: all table entries sharing the same key also + // share their broadcast property + // TODO: enforce through data structure + const auto it = entry_map_.find(key); // search an entry for a key + assert(it != entry_map_.end()); + const auto &te = (*it).second; + const auto scale = te.bcast ? 
get_vec_length() : sizeof(table_entry_val_t); + return te.off + key_off_val_shift * scale; + } +}; + +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index e8fe6b89a00afc..5de11e2f0692d3 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -1028,13 +1028,18 @@ const std::vector& Node::getDefaultImplPriority() { impl_desc_type::jit_sse42_dw, impl_desc_type::jit_sse42_1x1, impl_desc_type::jit_sse42, +#if defined(OPENVINO_ARCH_ARM64) + impl_desc_type::jit_asimd, +#endif impl_desc_type::gemm_any, impl_desc_type::gemm_blas, impl_desc_type::gemm_avx512, impl_desc_type::gemm_avx2, impl_desc_type::gemm_avx, impl_desc_type::gemm_sse42, +#if defined(OV_CPU_WITH_ACL) impl_desc_type::acl, +#endif impl_desc_type::jit_gemm, impl_desc_type::ref_any, impl_desc_type::ref, diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index 852acc7487d318..3721501f5f7776 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -15,7 +16,6 @@ #include "cpu_types.h" #include "utils/bfloat16.hpp" #include "ie_ngraph_utils.hpp" -#include #include #include @@ -25,10 +25,14 @@ #include "input.h" #include "common/cpu_convert.h" +#if defined(OPENVINO_ARCH_X86_64) +#include #include "emitters/x64/jit_emitter.hpp" #include "emitters/x64/jit_eltwise_emitters.hpp" #include "emitters/x64/jit_dnnl_emitters.hpp" #include "emitters/x64/jit_bf16_emitters.hpp" +#endif + #include #include "utils/general_utils.h" #include "utils/cpu_utils.hpp" @@ -50,11 +54,25 @@ #include "memory_desc/dnnl_blocked_memory_desc.h" #include "shape_inference/custom/eltwise.hpp" +#if defined(OPENVINO_ARCH_ARM64) +#include "cpu/aarch64/cpu_isa_traits.hpp" +#include "kernels/aarch64/jit_uni_eltwise_generic.hpp" +#include 
"executors/aarch64/jit_eltwise.hpp" +#endif + using namespace InferenceEngine; using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu; + +#if defined(OPENVINO_ARCH_X86_64) using namespace dnnl::impl::cpu::x64; using namespace Xbyak; +#endif + +#if defined(OPENVINO_ARCH_ARM64) +using namespace ov::intel_cpu::aarch64; +using namespace dnnl::impl::cpu::aarch64; +#endif #define GET_OFF(field) offsetof(jit_eltwise_call_args_ptrs, field) @@ -75,7 +93,7 @@ struct EltwiseEmitterContext { std::shared_ptr emitter; jit_generator *host; cpu_isa_t host_isa; - const Eltwise::EltwiseData& opData; + const EltwiseData& opData; InferenceEngine::Precision exec_prc; }; @@ -133,7 +151,7 @@ static void set_intersection(const std::set>& precisi InferenceEngine::Precision eltwise_precision_helper::get_precision(const size_t inputs_number, const InferenceEngine::Precision(&src_prc)[MAX_ELTWISE_INPUTS], - const std::vector& eltwise_data) { + const std::vector& eltwise_data) { Precision exec_prc = Precision::UNSPECIFIED; std::set> supported_precision_intersection = get_supported_precisions(eltwise_data.front().algo); @@ -257,7 +275,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_generic) explicit jit_uni_eltwise_generic(const jit_eltwise_params& jep, - const std::vector& eltwise_data, + const std::vector& eltwise_data, const std::vector& ops_list, const dnnl::post_ops& post_ops) : jit_uni_eltwise_kernel(jep), jit_generator(jit_name()), eltwise_data_(eltwise_data), ops_list_(ops_list), post_ops_(post_ops) {} @@ -559,11 +577,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener std::vector>> quantization_injectors = {}; - const std::vector& eltwise_data_; + const std::vector& eltwise_data_; const std::vector& ops_list_; const dnnl::post_ops& post_ops_; - std::shared_ptr create_eltwise_emitter(const Eltwise::EltwiseData& data, Precision exec_prec) { + std::shared_ptr 
create_eltwise_emitter(const EltwiseData& data, Precision exec_prec) { EltwiseEmitterContext ctx = { nullptr, this, @@ -1166,8 +1184,9 @@ const std::map& Eltwise::g namespace { + struct EltwiseKey { - std::vector eltwise_data; + std::vector eltwise_data; std::vector ops_list; VectorDims outBlkDims; VectorDims outOrder; @@ -1181,7 +1200,7 @@ struct EltwiseKey { using namespace dnnl::impl; using namespace dnnl::impl::primitive_hashing; size_t seed = 0; - auto hash_combine_eltwiseData = [](size_t seed, const Eltwise::EltwiseData& eltwiseData) { + auto hash_combine_eltwiseData = [](size_t seed, const EltwiseData& eltwiseData) { seed = hash_combine(seed, eltwiseData.algo); seed = hash_combine(seed, eltwiseData.onednnAlgorithm); seed = hash_combine(seed, eltwiseData.alpha); @@ -1189,7 +1208,7 @@ struct EltwiseKey { seed = hash_combine(seed, eltwiseData.gamma); return seed; }; - std::for_each(eltwise_data.begin(), eltwise_data.end(), [&](const Eltwise::EltwiseData& item) { + std::for_each(eltwise_data.begin(), eltwise_data.end(), [&](const EltwiseData& item) { seed = hash_combine_eltwiseData(seed, item); }); seed = get_vector_hash(seed, ops_list); @@ -1266,7 +1285,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { } } - EltwiseJitExecutor(const std::vector& eltwise_data, + EltwiseJitExecutor(const std::vector& eltwise_data, const std::vector& ops_list, const VectorDims& outBlkDims, const VectorDims& outOrder, @@ -1473,6 +1492,15 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { IE_THROW() << "Can't create jit eltwise kernel"; } #endif // OPENVINO_ARCH_X86_64 + +#if defined(OPENVINO_ARCH_ARM64) + if (mayiuse(aarch64::asimd)) { + _pKernel.reset(new jit_uni_eltwise_generic(jep, eltwise_data, ops_list, post_ops)); + } else { + IE_THROW() << "Can't create jit eltwise kernel"; + } +#endif // OPENVINO_ARCH_ARM64 + if (_pKernel) _pKernel->create_ker(); } @@ -1551,7 +1579,7 @@ template class EltwiseRefExecutor : public Eltwise::IEltwiseExecutor { 
public: - EltwiseRefExecutor(Eltwise::EltwiseData opData, + EltwiseRefExecutor(EltwiseData opData, const VectorDims& outBlkDims, std::vector inpDims) : _opData(std::move(opData)), _inpDims(inpDims) { @@ -1750,7 +1778,7 @@ class EltwiseRefExecutor : public Eltwise::IEltwiseExecutor { } private: - const Eltwise::EltwiseData _opData; + const EltwiseData _opData; VectorDims _dims; VectorDims _src_offsets[MAX_ELTWISE_INPUTS]; VectorDims _dst_offsets; @@ -1762,14 +1790,6 @@ class EltwiseRefExecutor : public Eltwise::IEltwiseExecutor { } // namespace -bool Eltwise::EltwiseData::operator==(const EltwiseData &rhs) const noexcept { - return algo == rhs.algo && - onednnAlgorithm == rhs.onednnAlgorithm && - alpha == rhs.alpha && - beta == rhs.beta && - gamma == rhs.gamma; -} - static Eltwise::executorPtr buildRefExecutor(const EltwiseKey& key) { if (key.outPrc == Precision::FP16) { return std::make_shared>(key.eltwise_data.front(), @@ -1924,10 +1944,19 @@ void Eltwise::initSupportedPrimitiveDescriptors() { return; // if dim rank is greater than the maximum possible, we should use the reference execution +#ifdef OPENVINO_ARCH_X86_64 bool canUseOptimizedImpl = mayiuse(x64::sse41) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK; +#endif + +#ifdef OPENVINO_ARCH_ARM64 + bool canUseOptimizedImpl = mayiuse(dnnl::impl::cpu::aarch64::asimd) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK; +#endif + // TODO: Add EltwiseLog algorithm support for JIT implementation canUseOptimizedImpl &= !one_of(getAlgorithm(), Algorithm::EltwiseLog); +#ifdef OPENVINO_ARCH_X86_64 bool canUseOptimizedShapeAgnosticImpl = isDynamicNode() && canUseOptimizedImpl; +#endif if (!canUseOptimizedImpl && !fusedWith.empty()) { IE_THROW(Unexpected) << "Eltwise node with name '" << getName() << "' uses reference impl, but unexpectedly fused with other ops"; @@ -1960,12 +1989,12 @@ void Eltwise::initSupportedPrimitiveDescriptors() { 
inputPrecisions.push_back(fusedNode->getOriginalInputPrecisionAtPort(i)); } } +#ifdef OPENVINO_ARCH_X86_64 if (fusedNode->getType() == Type::FakeQuantize) { canUseOptimizedShapeAgnosticImpl = false; } +#endif } - implType = canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic : - canUseOptimizedImpl ? EltwiseImplType::optimized : EltwiseImplType::reference; if (inputPrecisions.size() != getParentEdges().size()) IE_THROW() << "Eltwise node with name `" << getName() << "` has invalid input precisions configuration."; @@ -1975,6 +2004,10 @@ void Eltwise::initSupportedPrimitiveDescriptors() { outputPrecision = fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0); } +#if defined(OPENVINO_ARCH_X86_64) + implType = canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic : + canUseOptimizedImpl ? EltwiseImplType::optimized : EltwiseImplType::reference; + if (!mayiuse(avx512_core)) { bool hasBF16 = false; for (auto &inPrc : inputPrecisions) @@ -1984,8 +2017,23 @@ void Eltwise::initSupportedPrimitiveDescriptors() { if (outputPrecision == Precision::BF16 || hasBF16) IE_THROW() << "Eltwise node with name `" << getName() << "` doesn't support BF16 precision on this target."; } +#elif defined(OPENVINO_ARCH_ARM64) + const bool useJit = canUseOptimizedImpl && + executors::aarch64::JitEltwiseExecutor::isSupported(this, getAlpha(), getBeta(), getGamma()); + if (useJit) { + //outputPrecision = Precision::FP32; + } else { + canUseOptimizedImpl = false; + } + + implType = (useJit && canUseOptimizedImpl) ? EltwiseImplType::optimized : EltwiseImplType::reference; +#else + IE_THROW() << "Unknow CPU architecture"; +#endif #if defined(OV_CPU_WITH_ACL) + const bool useAcl = !useJit; + if (useAcl) { // Use original output precision as a reference point since some eltwise algorithms have non-float inputs (i.e. EltwiseSelect) Precision forcedPrec = getOriginalOutputPrecisionAtPort(0) == Precision::FP16 ? 
Precision::FP16 : Precision::FP32; // ACL implementation supports only identical precisions on inputs/outputs so they are aligned it to highest one @@ -2006,7 +2054,8 @@ void Eltwise::initSupportedPrimitiveDescriptors() { inputPrecisions[i] = forcedPrec; } outputPrecision = forcedPrec; -#else + } else { +#endif auto filterPrecision = [&](Precision& prc) { if (implType == EltwiseImplType::reference) { return Precision(Precision::FP32); @@ -2025,6 +2074,8 @@ void Eltwise::initSupportedPrimitiveDescriptors() { inputPrecisions[i] = filterPrecision(inputPrecisions[i]); } outputPrecision = filterPrecision(outputPrecision); +#if defined(OV_CPU_WITH_ACL) + } #endif // TODO: delete after new LPT (ngraph based) is merged @@ -2065,7 +2116,13 @@ void Eltwise::initSupportedPrimitiveDescriptors() { // bad accuracy for shape {1, 1, 4, 11}, {2, 5, 1, 1} // same for disabled collapse dims } else if (lt == Blocked && shape.getRank() != 1 && (shape.getMinDims()[1] != Shape::UNDEFINED_DIM && shape.getMinDims()[1] > 1)) { + #ifdef OPENVINO_ARCH_X86_64 size_t blockSize = mayiuse(x64::avx512_core) ? 
16 : 8; + #endif + + #ifdef OPENVINO_ARCH_ARM64 + size_t blockSize = cpu_isa_traits::vlen / 4; + #endif VectorDims blocks = dims; VectorDims order(blocks.size()); @@ -2138,6 +2195,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { } else { impl_desc_type impl_type = impl_desc_type::ref; if (canUseOptimizedImpl) { + #ifdef OPENVINO_ARCH_X86_64 if (mayiuse(x64::avx512_core)) { impl_type = impl_desc_type::jit_avx512; } else if (mayiuse(x64::avx2)) { @@ -2145,6 +2203,15 @@ void Eltwise::initSupportedPrimitiveDescriptors() { } else if (mayiuse(x64::sse41)) { impl_type = impl_desc_type::jit_sse42; } + #endif + + #ifdef OPENVINO_ARCH_ARM64 + if (mayiuse(dnnl::impl::cpu::aarch64::asimd)) { + impl_type = impl_desc_type::jit_asimd; + } else { + IE_THROW() << "not supported architecture"; + } + #endif } return {config, impl_type}; @@ -2174,6 +2241,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { currentInBlkDims.resize(inputNum); #if defined (OV_CPU_WITH_ACL) + if (useAcl) { eltwiseAttrs = {algorithm, alpha, beta, gamma}; auto addDesc = [&initDesc](std::vector& supportedPrimitiveDescriptors, const LayoutType layoutType) { @@ -2196,6 +2264,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { canUseAclExecutor = !supportedPrimitiveDescriptors.empty(); if (canUseAclExecutor) return; + } #endif if (isChannelsFirstApplicable) @@ -2402,7 +2471,6 @@ void Eltwise::execute(dnnl::stream strm) { } args_ptrs.dst_offsets = execParams.outOffsets.data(); } - execPtr->exec(args_ptrs, dims_out); } else if (aclExecPtr) { std::vector srcMemory; @@ -2675,8 +2743,26 @@ bool Eltwise::canFuse(const NodePtr& node) const { return true; }; +#ifdef OPENVINO_ARCH_X86_64 if (!mayiuse(x64::sse41) || getInputShapeAtPort(0).getRank() > MAX_ELTWISE_DIM_RANK) return false; +#endif + +#ifdef OPENVINO_ARCH_ARM64 + if (!mayiuse(dnnl::impl::cpu::aarch64::asimd) || (getInputShapeAtPort(0).getRank() > MAX_ELTWISE_DIM_RANK)) + return false; + + if 
(!executors::aarch64::JitEltwiseExecutor::isSupported(this, getAlpha(), getBeta(), getGamma())) { + return false; + } + const auto eltwise = dynamic_cast(node.get()); + if ((eltwise == nullptr) || (!executors::aarch64::JitEltwiseExecutor::isSupported(eltwise, + eltwise->getAlpha(), + eltwise->getBeta(), + eltwise->getGamma()))) { + return false; + } +#endif // TODO: EltwiseLog is supported only via reference executor if (getAlgorithm() == Algorithm::EltwiseLog || node->getAlgorithm() == Algorithm::EltwiseLog) diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.h b/src/plugins/intel_cpu/src/nodes/eltwise.h index 7feb2ba88c9bdb..6b6e7927794b70 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.h +++ b/src/plugins/intel_cpu/src/nodes/eltwise.h @@ -11,13 +11,17 @@ #include #include #include "executors/eltwise_list.hpp" +#include "nodes/kernels/jit_eltwise_call_args_ptrs.hpp" + +#if defined(OPENVINO_ARCH_ARM64) +#include "kernels/aarch64/jit_uni_eltwise_generic.hpp" +#endif namespace ov { namespace intel_cpu { namespace node { -#define MAX_ELTWISE_INPUTS 7 -#define MAX_ELTWISE_DIM_RANK 12 +#if defined(OPENVINO_ARCH_X86_64) struct jit_eltwise_params { size_t inputs_number; @@ -39,18 +43,6 @@ struct jit_eltwise_params { bool use_runtime_ptrs; }; -struct jit_eltwise_call_args_ptrs { - const void *src_ptr[MAX_ELTWISE_INPUTS]; - void *dst_ptr; - //ptr to array of post op inputs pointers (flat list) - const void** post_op_data; - - // shape agnostic kernel - size_t work_amount; - const void *src_offsets[MAX_ELTWISE_INPUTS]; - const void *dst_offsets; -}; - struct jit_eltwise_call_args_indexes { size_t indexes[MAX_ELTWISE_DIM_RANK]; }; @@ -73,24 +65,34 @@ struct jit_uni_eltwise_kernel { jit_eltwise_params jep_; }; +#endif + enum class EltwiseImplType { reference = 0, optimized = 1, optimizedShapeAgnostic = 2 }; +#if defined (OPENVINO_ARCH_X86_64) +struct EltwiseData { + Algorithm algo; + dnnl::algorithm onednnAlgorithm; + float alpha; + float beta; + float gamma; + + bool 
operator==(const EltwiseData& rhs) const noexcept { + return algo == rhs.algo && + onednnAlgorithm == rhs.onednnAlgorithm && + alpha == rhs.alpha && + beta == rhs.beta && + gamma == rhs.gamma; + } +}; +#endif + class Eltwise : public Node { public: - struct EltwiseData { - Algorithm algo; - dnnl::algorithm onednnAlgorithm; - float alpha; - float beta; - float gamma; - - bool operator==(const EltwiseData& rhs) const noexcept; - }; - class IEltwiseExecutor { public: IEltwiseExecutor() = default; @@ -207,7 +209,12 @@ class eltwise_precision_helper { public: static InferenceEngine::Precision get_precision(const size_t inputs_number, const InferenceEngine::Precision (&src_prc)[MAX_ELTWISE_INPUTS], - const std::vector& eltwise_data); + #if defined(OPENVINO_ARCH_X86_64) + const std::vector& eltwise_data); + #endif + #if defined(OPENVINO_ARCH_ARM64) + const std::vector& eltwise_data); + #endif private: static std::set> get_supported_precisions(const Algorithm& algo); diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp new file mode 100644 index 00000000000000..f2a2c658a8fb8d --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -0,0 +1,90 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_eltwise.hpp" +#include + +namespace ov { +namespace intel_cpu { +namespace executors { +namespace aarch64 { + +bool JitEltwiseExecutor::isSupported( + const Node* node, + const float alpha, + const float beta, + const float gamma) { + const Algorithm& algorithm = node->getAlgorithm(); + const auto is_supported = one_of(algorithm, + Algorithm::EltwiseAdd, + Algorithm::EltwiseMultiply, + Algorithm::EltwiseMulAdd, + Algorithm::EltwisePowerStatic, + Algorithm::EltwiseRelu); + if (!is_supported) { + return false; + } + + const auto check_precisions = [&node](const std::set& precisions, const bool& 
all_precisions_equal) { + const auto& input_precisions = node->getOriginalInputPrecisions(); + if (std::any_of(input_precisions.begin(), + input_precisions.end(), + [&precisions, &all_precisions_equal, &input_precisions](const InferenceEngine::Precision& precision) { + return (all_precisions_equal && (input_precisions[0] != precision)) || + (precisions.find(precision) == precisions.end()); + })) { + return false; + } + + const auto& output_precisions = node->getOriginalOutputPrecisions(); + if (std::any_of(output_precisions.begin(), + output_precisions.end(), + [&precisions, &all_precisions_equal, &input_precisions](const InferenceEngine::Precision& precision) { + return (all_precisions_equal && (input_precisions[0] != precision)) || + (precisions.find(precision) == precisions.end()); + })) { + return false; + } + + return true; + }; + + // TODO: remove + const bool all_precisions_equal = false; //algorithm != Algorithm::EltwisePowerStatic; + const std::set supported_precisions = + std::set{InferenceEngine::Precision::FP16, InferenceEngine::Precision::FP32}; + if (!check_precisions(supported_precisions, all_precisions_equal)) { + return false; + } + + if ((algorithm == Algorithm::EltwiseRelu) && ((alpha != 0.f) || (beta != 0.f) || (gamma != 0.f))) { + return false; + } + + // if ((algorithm == Algorithm::EltwisePowerStatic) && (beta != 0.f)) { + // return false; + // } + + return true; +} + +JitEltwiseExecutor::JitEltwiseExecutor(const ExecutorContext::CPtr context) : EltwiseExecutor(context) {} + +bool JitEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, + const std::vector &srcDescs, + const std::vector &dstDescs, + const std::vector &postOps) { + return true; +} + +void JitEltwiseExecutor::exec(const std::vector &src, + const std::vector &dst, + const void *post_ops_data_) { + exec_func(); +} + +} // namespace aarch64 +} // namespace executors +} // namespace intel_cpu +} // namespace ov diff --git 
a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp new file mode 100644 index 00000000000000..673e96459c99cb --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "cpu_types.h" +#include "../executor.hpp" +#include "../eltwise.hpp" +#include + +namespace ov { +namespace intel_cpu { +namespace executors { +namespace aarch64 { + +using namespace InferenceEngine; + +class JitEltwiseExecutor : public EltwiseExecutor { +public: + explicit JitEltwiseExecutor(const ExecutorContext::CPtr context); + + static bool isSupported( + const Node* node, + const float alpha, + const float beta, + const float gamma); + + bool init(const EltwiseAttrs& eltwiseAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const std::vector& postOps) override; + + void exec(const std::vector& src, + const std::vector& dst, + const void *post_ops_data_) override; + + impl_desc_type getImplType() const override { + return impl_desc_type::asimd; + } +private: + std::function exec_func; +}; + +} // namespace aarch64 +} // namespace executors +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp new file mode 100644 index 00000000000000..57dff4f05b8302 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -0,0 +1,659 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_uni_eltwise_generic.hpp" +#include "ie_ngraph_utils.hpp" + +namespace ov { +namespace intel_cpu { +namespace aarch64 { + +using namespace Xbyak_aarch64; +using namespace dnnl::impl::cpu; +using namespace 
dnnl::impl::cpu::aarch64; +using namespace InferenceEngine; + +void jit_uni_eltwise_kernel::operator()( + const node::jit_eltwise_call_args_ptrs* const_args, + const jit_eltwise_call_args_indexes* indexes) { + assert(ker_); + ker_(const_args, indexes); +} + +template +jit_uni_eltwise_generic::jit_uni_eltwise_generic(const jit_eltwise_params& jep, + const std::vector& eltwise_data, + const std::vector& ops_list, + const dnnl::post_ops& post_ops) : + jit_uni_eltwise_kernel(jep), + jit_generator(), + eltwise_data_(eltwise_data), + ops_list_(ops_list), + post_ops_(post_ops) {} + +template +void jit_uni_eltwise_generic::generate() { + preamble(); + + auto const exec_prc = eltwise_precision_helper::get_precision(jep_.inputs_number, jep_.src_prc, eltwise_data_); + + eltwise_emitter = create_eltwise_emitter(eltwise_data_.front(), exec_prc); + for (size_t i = 1; i < eltwise_data_.size(); ++i) { + post_op_emitters.push_back(create_eltwise_emitter(eltwise_data_[i], exec_prc)); + } + + const auto &jep = jep_; + + XReg param2 = abi_param2; + const int offset_count = jep.input_size - 1; + + // ptrs initializing + if (jep.use_runtime_ptrs) { + IE_THROW(NotImplemented) << "jit_uni_eltwise_generic::generate: jep.use_runtime_ptrs is not implemented"; + } else { + auto init_ptrs_with_offsets = [this, offset_count, param2](XReg pointer, const std::vector& offsets) { + for (int j = 0; j < offset_count; j++) { + if (jep_.dims[j] != 1 && offsets[j] != 0) { + XReg offset_reg(get_aux_gpr(0)); + mov(offset_reg, offsets[j]); + + XReg index_reg(get_aux_gpr(1)); + ldr(index_reg, ptr(param2, static_cast(j * sizeof(size_t)))); + madd(pointer, offset_reg, index_reg, pointer); + } + } + }; + + for (size_t i = 0; i < jep.inputs_number; i++) { + ldr(get_src_reg(i), ptr(param1, static_cast(offsetof(node::jit_eltwise_call_args_ptrs, src_ptr) + i * sizeof(size_t)))); + init_ptrs_with_offsets(get_src_reg(i), jep.src_offsets[i]); + } + + ldr(reg_dst, ptr(param1, 
static_cast(offsetof(node::jit_eltwise_call_args_ptrs, dst_ptr)))); + init_ptrs_with_offsets(reg_dst, jep.dst_offsets); + + mov(reg_work_amount, jep.work_amount); + } + + Label unroll_loop_label; + Label unroll_loop_end_label; + Label main_loop_label; + Label main_loop_end_label; + Label tail_loop_label; + Label tail_loop_end_label; + + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] == 1) { + uni_ldr(get_vmm_reg(i), get_src_reg(i), jep.src_prc[i], exec_prc, true); + } + } + + size_t min_src_size = jep.dst_size; + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) + min_src_size = std::min(min_src_size, jep.src_size[i]); + } + if (jep_.oc_size > 1) + min_src_size = std::min(min_src_size, jep_.oc_size); + + if (min_src_size != jep.dst_size) { + bool is_valid_configuration = true; + if (jep.dst_size % min_src_size != 0) + is_valid_configuration = false; + + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1 && jep.src_size[i] != min_src_size && jep.src_size[i] != jep.dst_size) + is_valid_configuration = false; + } + + if (jep.oc_size > 1 && jep.oc_size != min_src_size && jep.oc_size != jep.dst_size) + is_valid_configuration = false; + + if (!is_valid_configuration) + IE_THROW() << "Eltwise jitter has invalid configuration for Eltwise node"; + + L(unroll_loop_label); + { + const size_t loop_step = min_src_size; + const size_t vec_step = cpu_isa_traits::vlen / exec_prc.size(); + + cmp(reg_work_amount, loop_step); + b(LO, unroll_loop_end_label); + + for (size_t j = 0; j < min_src_size / vec_step; j++) { + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) { + uni_ldr(get_vmm_reg(i), get_src_reg(i), jep.src_prc[i], exec_prc, false, j * vec_step * jep.src_prc[i].size()); + } + } + + compute_eltwise_op(); + + apply_post_ops(); + + uni_str(reg_dst, vmm_dst, exec_prc, jep.dst_prc, j * vec_step * jep.dst_prc.size()); + } + + size_t tail_start = min_src_size - min_src_size % 
vec_step; + for (size_t j = tail_start; j < min_src_size; j++) { + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) { + uni_ldr(get_scl_reg(i), get_src_reg(i), jep.src_prc[i], exec_prc, j * jep.src_prc[i].size()); + } + } + + compute_eltwise_op(); + + apply_post_ops(); + + // TODO: TRegS + SReg sc_dst_reg{vmm_dst.getIdx()}; + uni_str(reg_dst, sc_dst_reg, exec_prc, jep.dst_prc, j * jep.dst_prc.size()); + } + + for (size_t i = 0; i < jep.inputs_number; i++) + if (jep.src_size[i] == jep.dst_size) + add(get_src_reg(i), get_src_reg(i), jep.src_prc[i].size() * loop_step); + + add(reg_dst, reg_dst, jep.dst_prc.size() * loop_step); + sub(reg_work_amount, reg_work_amount, loop_step); + if (jep_.oc_size > 1 && jep_.oc_size != min_src_size) + IE_THROW(NotImplemented) << "jit_uni_eltwise_generic::generate: reg_oc_off"; + + b(AL, unroll_loop_label); + } + + L(unroll_loop_end_label); + } + + if (min_src_size == jep.dst_size) { + L(main_loop_label); + { + const size_t vlen = cpu_isa_traits::vlen; + const size_t exec_prc_size = exec_prc.size(); + const size_t loop_step = vlen / exec_prc_size; + + cmp(reg_work_amount, loop_step); + b(LO, main_loop_end_label); + + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) { + uni_ldr(get_vmm_reg(i), get_src_reg(i), jep.src_prc[i], exec_prc, false); + } + } + + compute_eltwise_op(); + + apply_post_ops(); + + uni_str(reg_dst, vmm_dst, exec_prc, jep.dst_prc); + + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) { + add(get_src_reg(i), get_src_reg(i), jep.src_prc[i].size() * loop_step); + } + } + + add(reg_dst, reg_dst, jep.dst_prc.size() * loop_step); + sub(reg_work_amount, reg_work_amount, loop_step); + if (jep_.oc_size > 1) + IE_THROW(NotImplemented) << "jit_uni_eltwise_generic::generate: reg_oc_off"; + + b(AL, main_loop_label); + } + L(main_loop_end_label); + } + + L(tail_loop_label); + { + const size_t loop_step = 1; + + cmp(reg_work_amount, 0x0); + b(EQ, 
tail_loop_end_label); + + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) { + uni_ldr(get_scl_reg(i), get_src_reg(i), jep.src_prc[i], exec_prc); + } + } + + compute_eltwise_op(); + + apply_post_ops(); + + SReg sc_dst_reg{vmm_dst.getIdx()}; + uni_str(reg_dst, sc_dst_reg, exec_prc, jep.dst_prc); + + for (size_t i = 0; i < jep.inputs_number; i++) { + if (jep.src_size[i] != 1) { + add(get_src_reg(i), get_src_reg(i), jep.src_prc[i].size() * loop_step); + } + } + + add(reg_dst, reg_dst, jep.dst_prc.size() * loop_step); + sub(reg_work_amount, reg_work_amount, loop_step); + if (jep_.oc_size > 1) + IE_THROW(NotImplemented) << "jit_uni_eltwise_generic::generate: reg_oc_off"; + + b(AL, tail_loop_label); + } + L(tail_loop_end_label); + + postamble(); + + eltwise_emitter->emit_data(); + for (size_t i = 0; i < post_op_emitters.size(); i++) { + post_op_emitters[i]->emit_data(); + } +} + +template +void jit_uni_eltwise_generic::uni_ldr(const TReg& data, + const XReg& ptr_reg, + const Precision& src_prc, + const Precision& dst_prc, + const bool broadcast, + const int32_t offset) { + switch (src_prc) { + case Precision::FP16: { + if (broadcast) { + if (offset == 0) { + ld1r(data.h, ptr(ptr_reg)); + } else { + add_imm(X_DEFAULT_ADDR, ptr_reg, offset, X_TMP_0); + ld1r(data.h, ptr(X_DEFAULT_ADDR)); + } + } else { + ldr(Xbyak_aarch64::DReg(data.getIdx()), Xbyak_aarch64::ptr(ptr_reg, offset)); + } + break; + } + case Precision::FP32: { + if (broadcast) { + jit_generator::uni_ld1rw(data.s, ptr_reg, offset); + } else { + jit_generator::uni_ldr(data, ptr_reg, offset); + } + break; + } + default: { + IE_THROW(Unexpected) << "src_prc " << src_prc << " is not supported";; + } + } + + if (dst_prc != src_prc) { + switch (dst_prc) { + case Precision::FP32: + switch (src_prc) { + case Precision::FP16: { + // TODO: remove temporary register if posible + auto tmp = get_aux_vmm(0); + fcvtl(tmp.s4, data.h4); + uni_orr(data, tmp, tmp); + break; + } + default: + 
IE_THROW(Unexpected) << "src_prc " << src_prc << " is not supported";; + } + break; + default: + IE_THROW(Unexpected) << "dst_prc " << dst_prc << " is not supported";; + } + } +} + +template +void jit_uni_eltwise_generic::uni_ldr(const SReg& data, + const XReg& ptr, + const Precision& src_prc, + const Precision& dst_prc, + const int32_t offset) { + switch (src_prc) { + case Precision::FP16: { + ldr(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, offset)); + break; + } + case Precision::FP32: { + ldr(data, Xbyak_aarch64::ptr(ptr, offset)); + break; + } + default: { // this switch is over src_prc, so report the source precision + IE_THROW(Unexpected) << "src_prc " << src_prc << " is not supported";; + } + } + + if (dst_prc != src_prc) { + switch (dst_prc) { + case Precision::FP32: + switch (src_prc) { + case Precision::FP16: { + // TODO: remove temporary register if possible + auto tmp = Xbyak_aarch64::SReg(get_aux_vmm(0).getIdx()); + fcvt(tmp, Xbyak_aarch64::HReg(data.getIdx())); + fmov(data, tmp); + break; + } + default: + IE_THROW(Unexpected) << "src_prc " << src_prc << " is not supported";; + } + break; + default: + IE_THROW(Unexpected) << "dst_prc " << dst_prc << " is not supported";; + } + } +} + +template +void jit_uni_eltwise_generic::uni_str(const XReg& ptr, + const TReg& data, + const Precision& src_prc, + const Precision& dst_prc, + const int32_t offset) { + if (src_prc != dst_prc) { + switch (src_prc) { + case Precision::FP32: { + switch (dst_prc) { + case Precision::FP16: { + // TODO: remove temporary register if possible + auto tmp = get_aux_vmm(0); + fcvtn(tmp.h4, data.s4); + uni_orr(data, tmp, tmp); + break; + } + default: { // this inner switch is over dst_prc: FP32 data is only narrowed to FP16 + IE_THROW(Unexpected) << "dst_prc " << dst_prc << " is not supported";; + } + } + break; + } + default: { + IE_THROW(Unexpected) << "src_prc " << src_prc << " is not supported";; + } + } + } + + switch (dst_prc) { + case Precision::FP16: { + str(Xbyak_aarch64::DReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, offset)); + break; + } + case Precision::FP32: { +
str(Xbyak_aarch64::QReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, offset)); + break; + } + default: { + IE_THROW(Unexpected) << "dst_prc " << dst_prc << " is not supported";; + } + } +} + +template +void jit_uni_eltwise_generic::uni_str(const XReg& ptr, + const SReg& data, + const Precision& src_prc, + const Precision& dst_prc, + const int32_t offset) { + if (src_prc != dst_prc) { + switch (src_prc) { + case Precision::FP32: { + switch (dst_prc) { + case Precision::FP16: { + // TODO: remove temporary register if possible + auto tmp = Xbyak_aarch64::SReg(get_aux_vmm(0).getIdx()); + fcvt(Xbyak_aarch64::HReg(tmp.getIdx()), data); + fmov(data, tmp); + break; + } + default: { // this inner switch is over dst_prc: FP32 data is only narrowed to FP16 + IE_THROW(Unexpected) << "dst_prc " << dst_prc << " is not supported";; + } + } + break; + } + default: { + IE_THROW(Unexpected) << "src_prc " << src_prc << " is not supported";; + } + } + } + + switch (dst_prc) { + case Precision::FP16: { + str(Xbyak_aarch64::HReg(data.getIdx()), Xbyak_aarch64::ptr(ptr, offset)); + break; + } + case Precision::FP32: { + str(data, Xbyak_aarch64::ptr(ptr, offset)); + break; + } + default: { // report the value this switch actually rejected + IE_THROW(Unexpected) << "dst_prc " << dst_prc << " is not supported";; + } + } +} + +struct EltwiseEmitterContext { + std::shared_ptr emitter; + dnnl::impl::cpu::aarch64::jit_generator *host; + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa; + const EltwiseData& opData; + InferenceEngine::Precision exec_prc; +}; + +template +struct EltwiseEmitter { + void operator()(EltwiseEmitterContext& ctx) { + ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.exec_prc, ctx.opData.alpha); + } +}; + +template<> +struct EltwiseEmitter { + void operator()(EltwiseEmitterContext& ctx) { + ctx.emitter = std::make_shared(ctx.host, + ctx.host_isa, + ctx.opData.alpha, + ctx.opData.beta, + ctx.opData.gamma, + ctx.exec_prc); + } +}; + +template +std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitter(const EltwiseData& data, const Precision& exec_prec) { + EltwiseEmitterContext ctx = { +
nullptr, + this, + isa, + data, + exec_prec + }; + + OV_SWITCH(intel_cpu, EltwiseEmitter, ctx, data.algo, + OV_CASE(Algorithm::EltwiseAdd, ov::intel_cpu::aarch64::jit_add_emitter), + OV_CASE(Algorithm::EltwiseMulAdd, ov::intel_cpu::aarch64::jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseMultiply, ov::intel_cpu::aarch64::jit_multiply_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, ov::intel_cpu::aarch64::jit_power_emitter), + OV_CASE(Algorithm::EltwiseRelu, ov::intel_cpu::aarch64::jit_relu_emitter)); + + if (!ctx.emitter) + IE_THROW() << "Unsupported operation type '" << algToString(data.algo) << "' for Eltwise emitter"; + + return ctx.emitter; +} + +template +void jit_uni_eltwise_generic::compute_eltwise_op() { // emits the main eltwise op: inputs from get_vmm_reg(i), result in vmm_dst + std::vector in_idxs; + for (size_t i = 0; i < eltwise_emitter->get_inputs_count(); i++) { + in_idxs.push_back(get_vmm_reg(i).getIdx()); + } + + std::vector aux_idxs; + for (size_t i = 0; i < eltwise_emitter->get_aux_vecs_count(); i++) { + aux_idxs.push_back(get_aux_vmm(i).getIdx()); + } + + std::vector out_idxs; + out_idxs.push_back(vmm_dst.getIdx()); + + std::vector gpr_idxs; // scratch GPRs: sized by the emitter's GPR demand, not its vector demand + for (size_t i = 0; i < eltwise_emitter->get_aux_gprs_count(); i++) { + gpr_idxs.push_back(get_aux_gpr(i).getIdx()); + } + + eltwise_emitter->emit_code(in_idxs, out_idxs, aux_idxs, gpr_idxs); +} + +template +void jit_uni_eltwise_generic::apply_post_ops() { + int input_idx = eltwise_emitter->get_inputs_count(); + int eltwise_post_op_idx = 0; + for (size_t i = 1; i < ops_list_.size(); i++) { + if (ops_list_[i] == ov::intel_cpu::Type::Eltwise) { + std::vector in_idxs; + in_idxs.push_back(vmm_dst.getIdx()); + for (size_t j = 1; j < post_op_emitters[eltwise_post_op_idx]->get_inputs_count(); j++) + in_idxs.push_back(get_vmm_reg(input_idx++).getIdx()); + + std::vector out_idxs; + out_idxs.push_back(vmm_dst.getIdx()); + + std::vector aux_vmm_idxs; + for (size_t j = 0; j < post_op_emitters[eltwise_post_op_idx]->get_aux_vecs_count(); j++) +
aux_vmm_idxs.push_back(get_aux_vmm(j).getIdx()); + + std::vector aux_gpr_idxs; + for (size_t j = 0; j < post_op_emitters[eltwise_post_op_idx]->get_aux_gprs_count(); j++) + aux_gpr_idxs.push_back(get_aux_gpr(j).getIdx()); + + post_op_emitters[eltwise_post_op_idx]->emit_code(in_idxs, out_idxs, aux_vmm_idxs, aux_gpr_idxs); + + eltwise_post_op_idx++; + } else if (ops_list_[i] == ov::intel_cpu::Type::FakeQuantize) { + IE_THROW(Unexpected) << "Eltwise jit kernel: FakeQuantize is not supported"; + } else { + IE_THROW(Unexpected) << "Eltwise jit kernel: unexpected operation type"; + } + } +} + +namespace { + +// TODO: copy/paste: refactor +template +struct SupportedPrecisions { + void operator()(std::set> &precisions) { + precisions = T::get_supported_precisions(); + } +}; + +// TODO: copy/paste: refactor +static void set_intersection(const std::set>& precisions1, + const std::set>& precisions2, + std::set>& intersection) { + std::map intersection_types; + + for (auto it1 = precisions1.begin(); it1 != precisions1.end(); ++it1) { + for (auto it2 = precisions2.begin(); it2 != precisions2.end(); ++it2) { + const auto& it1_precisions = *it1; + // all element types are equal + if (it1_precisions[0] == (*it2)[0]) { + // first precisions size is used + intersection_types.emplace(it1_precisions[0], it1_precisions.size()); + } + } + } + + for (auto it = intersection_types.begin(); it != intersection_types.end(); ++it) { + intersection.insert(std::vector(it->second, it->first)); + } +} +} // namespace + +InferenceEngine::Precision eltwise_precision_helper::get_precision( + const size_t inputs_number, + const InferenceEngine::Precision (&src_prc)[MAX_ELTWISE_INPUTS], + const std::vector& eltwise_data) { + Precision exec_prc = Precision::UNSPECIFIED; + + const auto algorithm = eltwise_data.front().algo; + std::set> supported_precision_intersection = get_supported_precisions(algorithm); + + // for element-wise operations all inputs must to have the same precisions + auto 
has_same_precision = [](const std::vector& precisions) { /* true when every element equals the first one; also true for an empty vector */ + return std::all_of(precisions.begin(), precisions.end(), [&precisions](const element::Type precision) { + return precision == precisions[0]; + }); + }; + + // TODO: should we convert all inputs to fp16 for PowerStatic + assert((algorithm == Algorithm::EltwisePowerStatic) || + std::all_of(supported_precision_intersection.begin(), + supported_precision_intersection.end(), + has_same_precision)); + + + for (size_t i = 1; i < eltwise_data.size(); ++i) { /* narrow the candidate set by every operation after the first */ + std::set> prcs = get_supported_precisions(eltwise_data[i].algo); + std::set> prcs_intersect = {}; + + OPENVINO_ASSERT((algorithm == Algorithm::EltwisePowerStatic) || /* NOTE(review): this tests the first operation's algorithm, not eltwise_data[i].algo - confirm that is intended */ + std::all_of(prcs.begin(), prcs.end(), has_same_precision), + "for element-wise nodes all precisions have to be equal"); + + set_intersection(supported_precision_intersection, prcs, prcs_intersect); + + supported_precision_intersection = prcs_intersect; + } + + static const element::Type exec_precisions_priority[] = { /* tried in this order; the first type found in any remaining vector is selected */ + element::f16, + element::f32 + }; + + for (const auto prc : exec_precisions_priority) { + if (std::any_of( + supported_precision_intersection.begin(), + supported_precision_intersection.end(), + [&prc](const std::vector& precisions) { return std::find(precisions.begin(), precisions.end(), prc) != precisions.end(); })) { + exec_prc = InferenceEngine::details::convertPrecision(prc); + break; + } + } + + for (size_t i = 0; i < inputs_number; i++) { /* any input precision that differs from the selected one forces FP32 */ + if (src_prc[i] != exec_prc) { + exec_prc = Precision::FP32; + break; + } + } + + if (exec_prc == Precision::UNSPECIFIED) { /* NOTE(review): the loop above replaces UNSPECIFIED with FP32 as soon as one input differs from it, so this fires only when no checked input differs - confirm the intended order */ + IE_THROW() << "Eltwise jitter failed to specify execution precision for Eltwise node"; + } + + return exec_prc; +} + +std::set> eltwise_precision_helper::get_supported_precisions(const Algorithm& algo) { /* returns the precision sets filled in by SupportedPrecisions; presumably OV_SWITCH runs it for the emitter type whose case matches algo - confirm against the macro */ + std::set> precisions; + + OV_SWITCH(intel_cpu, SupportedPrecisions, precisions, algo, + OV_CASE(Algorithm::EltwiseRelu, jit_relu_emitter), + OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), +
OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, jit_power_emitter)); + + if (precisions.empty()) /* nothing was reported for algo: treated as an unsupported operation */ + IE_THROW() << "Unsupported operation type for Eltwise emitter"; + + return precisions; +} + +template struct jit_uni_eltwise_generic; + +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp new file mode 100644 index 00000000000000..0a468c247f37fe --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.hpp @@ -0,0 +1,249 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include +#include +#include + +#include "utils/general_utils.h" +#include "utils/cpu_utils.hpp" + +#include +#include +#include "nodes/kernels/jit_eltwise_call_args_ptrs.hpp" + +namespace ov { +namespace intel_cpu { +namespace aarch64 { + +using namespace Xbyak_aarch64; +using namespace dnnl::impl::cpu; +using namespace dnnl::impl::cpu::aarch64; +using namespace InferenceEngine; + +struct jit_eltwise_params { /* parameter bundle that jit_uni_eltwise_kernel copies into jep_; per-input arrays have MAX_ELTWISE_INPUTS entries */ + size_t inputs_number; + size_t input_size; + + InferenceEngine::Precision src_prc[MAX_ELTWISE_INPUTS]; + InferenceEngine::Precision dst_prc; + + VectorDims dims; + VectorDims src_offsets[MAX_ELTWISE_INPUTS]; + VectorDims dst_offsets; + VectorDims oc_offsets; + + size_t src_size[MAX_ELTWISE_INPUTS]; + size_t dst_size; + size_t oc_size; + + size_t work_amount; + bool use_runtime_ptrs; +}; + +struct jit_eltwise_call_args_indexes { /* second argument of the kernel entry point ker_ */ + size_t indexes[MAX_ELTWISE_DIM_RANK]; +}; + +struct jit_uni_eltwise_kernel { /* abstract base: holds the entry point ker_ and the parameters jep_; create_ker() is pure virtual */ + void (*ker_)(const node::jit_eltwise_call_args_ptrs*, const jit_eltwise_call_args_indexes*); /* assigned by create_ker() in the derived generator */ + + void operator()(const node::jit_eltwise_call_args_ptrs* const_args, const jit_eltwise_call_args_indexes*
indexes); + + jit_uni_eltwise_kernel() {} + jit_uni_eltwise_kernel(const jit_eltwise_params& jep) : ker_(nullptr), jep_(jep) {} + virtual ~jit_uni_eltwise_kernel() {} + + virtual void create_ker() = 0; + + jit_eltwise_params jep_; +}; + +struct EltwiseData { + Algorithm algo; + dnnl::algorithm onednnAlgorithm; + float alpha; + float beta; + float gamma; + + bool operator==(const EltwiseData& rhs) const noexcept { + return algo == rhs.algo && + onednnAlgorithm == rhs.onednnAlgorithm && + alpha == rhs.alpha && + beta == rhs.beta && + gamma == rhs.gamma; + } +}; + +template +struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, jit_generator { +public: + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_generic) + + jit_uni_eltwise_generic(const jit_eltwise_params& jep, + const std::vector& eltwise_data, + const std::vector& ops_list, + const dnnl::post_ops& post_ops); + + jit_uni_eltwise_generic() {} + + void create_ker() override { + jit_generator::create_kernel(); + ker_ = (decltype(ker_))jit_ker(); + } + + void generate() override; + +private: + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + using TRegS = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TRegS; + + // TODO: update + // Scalar architecture specific registers mapping + // + // aarch64| function | x64 | function + // =========================================== + // X0 | | RAX | post_op_ptrs + // X1 | | RBX | dst ptr + // X2 | | RCX | [not used] + // X3 | | RDX | work amount + // X4 | | RDI | [not used] + // X5 | | RSI | d_bias + // X6 | | RBP | d_weights + // X7 | | RSP | + // X8 | [not used] | R8 | src ptr + // X9 | work amount | R9 | src ptr + // X10 | dst ptr | R10 | src ptr + // X11 | src ptr | R11 | src ptr + // X12 | src ptr | R12 | src ptr + // X13 | src ptr | R13 | src ptr + // X14 | src ptr | R14 | src ptr + // X15 | src ptr | R15 | temporary + // X16 | src ptr + // X17 | temporary + // X18 | temporary + // X19-30 | [not used] + + const XReg reg_work_amount = 
x9; + const XReg reg_dst = x10; + + inline XReg get_src_reg(uint32_t idx) { /* general-purpose register holding the pointer of source slot idx: register number 11 + idx */ + if (idx >= MAX_ELTWISE_INPUTS) { /* valid slots are 0 .. MAX_ELTWISE_INPUTS - 1 */ + IE_THROW(Unexpected) << "source vector ptr register " << idx << " is not supported"; + } + return XReg(11 + idx); /* NOTE(review): slot 6 yields x17, which get_aux_gpr(0) also returns - confirm the two are never live together */ + } + + // TODO: update + // Vector registers mapping + // + // A64/X64 | function + // ======================= + // 0 | [not used] + // 01 | srs + // 02 | srs + // 03 | srs + // 04 | srs + // 05 | srs + // 06 | srs + // 07 | srs + // 08 | srs + // 09 | dst + // 10 | aux + // 11 | aux + // 12 | d_weights + // 13 | d_bias + // 14 | [not used] + // 15 | zero + // 16 - 30 | [not used] + + TReg vmm_dst {9}; + + inline TReg get_vmm_reg(const uint32_t idx) { /* vector register of source slot idx: register number 16 + idx */ + if (idx >= MAX_ELTWISE_INPUTS) { /* valid slots are 0 .. MAX_ELTWISE_INPUTS - 1 */ + IE_THROW(Unexpected) << "source vector register " << idx << " is not supported"; + } + return TReg(16 + idx); + } + + inline SReg get_scl_reg(const uint32_t idx) { /* scalar register of source slot idx, same numbering as get_vmm_reg */ + if (idx >= MAX_ELTWISE_INPUTS) { /* valid slots are 0 .. MAX_ELTWISE_INPUTS - 1 */ + IE_THROW(Unexpected) << "source scalar register " << idx << " is not supported"; + } + return SReg(16 + idx); + } + + inline TReg get_aux_vmm(const uint32_t idx) { /* auxiliary vector register: register number 10 + idx */ + if (idx > 2) { /* NOTE(review): accepts idx 0..2, i.e. three registers - confirm that idx 2 is meant to be valid */ + IE_THROW(Unexpected) << "aux vector register " << idx << " is not supported"; + } + return TReg(10 + idx); + } + + inline XReg get_aux_gpr(const uint32_t idx) { /* auxiliary general-purpose register: register number 17 + idx */ + if (idx > 2) { /* NOTE(review): accepts idx 0..2, so x19 can be handed out - confirm that idx 2 is meant to be valid */ + IE_THROW(Unexpected) << "aux gpr register " << idx << " is not supported"; + } + return XReg(17 + idx); + } + + // TODO: rename to load_vector + void uni_ldr(const TReg& data, + const XReg& ptr, + const Precision& src_prc, + const Precision& dst_prc, + const bool broadcast, + const int32_t offset = 0); + + // TODO: rename to load_scalar + void uni_ldr(const SReg& data, + const XReg& ptr, + const Precision& src_prc, + const Precision& dst_prc, + const int32_t offset = 0); + + // TODO: rename to store_vector + void uni_str(const XReg& ptr, + const TReg& data, + const Precision& src_prc, + const Precision& dst_prc, + const int32_t offset = 0); + + // TODO: rename to store_scalar + void uni_str(const XReg& ptr,
+ const SReg& data, + const Precision& src_prc, + const Precision& dst_prc, + const int32_t offset = 0); + + std::shared_ptr create_eltwise_emitter(const EltwiseData& data, const Precision& exec_prec); /* returns the emitter for data.algo; throws for an unsupported algorithm */ + + void compute_eltwise_op(); /* emits the main operation through eltwise_emitter */ + void apply_post_ops(); /* emits the operations listed after the first entry of ops_list_ through post_op_emitters */ + + const std::vector eltwise_data_; + const std::vector ops_list_; + const dnnl::post_ops post_ops_; + + std::shared_ptr eltwise_emitter = nullptr; + std::vector> post_op_emitters; +}; + +class eltwise_precision_helper { /* static helpers that select the execution precision for a chain of eltwise operations */ +public: + static InferenceEngine::Precision get_precision(const size_t inputs_number, + const InferenceEngine::Precision (&src_prc)[MAX_ELTWISE_INPUTS], + const std::vector& eltwise_data); + +private: + static std::set> get_supported_precisions(const Algorithm& algo); /* throws when nothing is reported for algo */ +}; + +} // namespace aarch64 +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp b/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp new file mode 100644 index 00000000000000..7370bb824d8c62 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include + +namespace ov { +namespace intel_cpu { +namespace node { + +#define MAX_ELTWISE_INPUTS 7 /* preprocessor macro: not scoped by the enclosing namespaces */ +#define MAX_ELTWISE_DIM_RANK 12 /* preprocessor macro: not scoped by the enclosing namespaces */ + +struct jit_eltwise_call_args_ptrs { /* first argument of the generated eltwise kernel entry point */ + const void *src_ptr[MAX_ELTWISE_INPUTS]; + void *dst_ptr; + //ptr to array of post op inputs pointers (flat list) + const void** post_op_data; + + // shape agnostic kernel + size_t work_amount; + const void *src_offsets[MAX_ELTWISE_INPUTS]; + const void *dst_offsets; +}; + +} // namespace node +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp index fa247408b32e06..2e793bbe416d4b 100644 ---
a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp +++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp @@ -42,6 +42,11 @@ impl_desc_type parse_impl_name(std::string impl_desc_name) { SEARCH_WORD(reorder); SEARCH_WORD(sparse); SEARCH_WORD(acl); + SEARCH_WORD(asimd); + SEARCH_WORD(sve128); + SEARCH_WORD(sve256); + SEARCH_WORD(sve384); + SEARCH_WORD(sve512); if ((res & impl_desc_type::avx2) != impl_desc_type::avx2 && (res & impl_desc_type::avx512) != impl_desc_type::avx512) SEARCH_WORD(avx); @@ -120,6 +125,11 @@ const char* impl_type_to_string(impl_desc_type type) { CASE(gemm_acl); CASE(winograd_acl); CASE(gemm_mlas); + CASE(jit_asimd); + CASE(jit_sve128); + CASE(jit_sve256); + CASE(jit_sve384); + CASE(jit_sve512); #undef CASE return "unknown"; diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h index e30511de60cea7..b1e32e6733b42b 100644 --- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h +++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h @@ -42,6 +42,12 @@ enum impl_desc_type { //mlas backend mlas = 1<<26, + asimd = 1<<27, + sve128 = 1<<28, + sve256 = 1<<29, + sve384 = 1<<30, + sve512 = 1<<31, + // real types ref_any = ref | any, @@ -100,7 +106,13 @@ enum impl_desc_type { dw_acl = _dw | acl, gemm_acl = gemm | acl, winograd_acl = winograd | acl, - gemm_mlas = gemm | mlas + gemm_mlas = gemm | mlas, + + jit_asimd = jit | asimd, + jit_sve128 = jit | sve128, + jit_sve256 = jit | sve256, + jit_sve384 = jit | sve384, + jit_sve512 = jit | sve512 }; const char * impl_type_to_string(impl_desc_type type); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index 3e2158e7a383f7..b0b3fc274f5ea2 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ 
b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -38,7 +38,7 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr &nGraphF CPU_REGISTER_PASS_X64(manager, ov::pass::Validate); CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks); CPU_REGISTER_PASS_COMMON(manager, ConvertTileToSeqTiles); - CPU_REGISTER_PASS_X64(manager, ConvertToPowerStatic); + CPU_REGISTER_PASS_COMMON(manager, ConvertToPowerStatic); CPU_REGISTER_PASS_COMMON(manager, ConvertToLeakyRelu); CPU_REGISTER_PASS_COMMON(manager, ConvertToSwishCPU); CPU_REGISTER_PASS_COMMON(manager, OptimizeSequenceTransposes); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp index f8a80efe1f6223..6b833639b6ab26 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/eltwise.cpp @@ -87,6 +87,7 @@ std::vector eltwise_op_types_dynamic = { EltwiseTypes::ADD, EltwiseTypes::MULTIPLY, EltwiseTypes::SUBTRACT, + EltwiseTypes::POWER, }; ov::test::Config additional_config = {}; @@ -145,6 +146,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_dynamic_large_upper_bound, std::vector> inShapesSingleThread = { + {{1, 1, 1, 2}}, + {{1, 1, 1, 4}}, {{1, 2, 3, 4}}, {{2, 2, 2, 2}}, {{2, 1, 2, 1, 2, 2}}, @@ -152,6 +155,7 @@ std::vector> inShapesSingleThread = { std::vector eltwise_op_typesSingleThread = { EltwiseTypes::ADD, + EltwiseTypes::MULTIPLY, EltwiseTypes::POWER, }; diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.cpp index f061a6482ded74..5196521aaae5b2 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.cpp +++ 
b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/activation.cpp @@ -102,7 +102,8 @@ void ActivationLayerCPUTest::SetUp() { inType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(inPrecision); outType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(outPrecision); - selectedType = getPrimitiveType() + "_" + netPrecision.name(); + const auto primitiveType = getPrimitiveType(activationType, inType, inputShapes); + selectedType = primitiveType.empty() ? "" : primitiveType + "_" + netPrecision.name(); /* reuses the value computed above; an empty primitive type leaves selectedType empty */ #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) # if defined(OPENVINO_ARCH_ARM) diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/eltwise.cpp index 3595284e84784c..8dbd842c30fcd1 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/classes/eltwise.cpp @@ -122,7 +122,10 @@ void EltwiseLayerCPUTest::SetUp() { init_input_shapes(shapes); configuration.insert(additionalConfig.begin(), additionalConfig.end()); - updateSelectedType(getPrimitiveType(), netType, configuration); + updateSelectedType( + getPrimitiveType(eltwiseType, netType, shapes), + netType, + configuration); // selectedType = makeSelectedTypeStr(getPrimitiveType(), netType); #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) if (eltwiseType == POWER) { diff --git a/src/plugins/intel_cpu/tests/functional/test_utils/cpu_test_utils.cpp b/src/plugins/intel_cpu/tests/functional/test_utils/cpu_test_utils.cpp index e355ee79b75bcc..3e136da286a6b8 100644 --- a/src/plugins/intel_cpu/tests/functional/test_utils/cpu_test_utils.cpp +++ b/src/plugins/intel_cpu/tests/functional/test_utils/cpu_test_utils.cpp @@ -125,7 +125,8 @@ void CPUTestsBase::CheckPluginRelatedResults(InferenceEngine::ExecutableNetwork void
CPUTestsBase::CheckPluginRelatedResults(const ov::CompiledModel &execNet, const std::set& nodeType) const { if (!execNet || nodeType.empty()) return; - ASSERT_TRUE(!selectedType.empty()) << "Node type is not defined."; + // selectedType can be empty if node is decomposed and executed by different kernels + // ASSERT_TRUE(!selectedType.empty()) << "Node type is not defined."; auto function = execNet.get_runtime_model(); CheckPluginRelatedResultsImpl(function, nodeType); } @@ -140,6 +141,7 @@ void CPUTestsBase::CheckPluginRelatedResults(const ov::CompiledModel &execNet, c void CPUTestsBase::CheckPluginRelatedResultsImpl(const std::shared_ptr& function, const std::set& nodeType) const { ASSERT_NE(nullptr, function); + for (const auto &node : function->get_ops()) { const auto & rtInfo = node->get_rt_info(); auto getExecValue = [&rtInfo](const std::string & paramName) -> std::string { @@ -209,7 +211,6 @@ void CPUTestsBase::CheckPluginRelatedResultsImpl(const std::shared_ptrget_output_partial_shape(i); @@ -219,9 +220,11 @@ void CPUTestsBase::CheckPluginRelatedResultsImpl(const std::shared_ptr>>& input_shapes) const { + if ((eltwise_type == ngraph::helpers::EltwiseTypes::ADD) || + (eltwise_type == ngraph::helpers::EltwiseTypes::MULTIPLY) || + (eltwise_type == ngraph::helpers::EltwiseTypes::SUBTRACT) || + (eltwise_type == ngraph::helpers::EltwiseTypes::DIVIDE)) { + return "jit"; + } + return "acl"; +} + +std::string CPUTestsBase::getPrimitiveType(const ngraph::helpers::ActivationTypes& activation_type, + const ov::element::Type_t& element_type, + const std::vector>>& input_shapes) const { + if ((element_type == ov::element::f32) && (activation_type == ngraph::helpers::ActivationTypes::Relu)) { + return "jit"; + } + + if (activation_type == ngraph::helpers::ActivationTypes::Mish) { + // operation is decomposed and executed by different kernels + return ""; + } + + return "acl"; +} + std::string CPUTestsBase::getPrimitiveType() const { +#if defined(OV_CPU_WITH_ACL) return 
"acl"; +#else + return "ref"; +#endif } #else +std::string CPUTestsBase::getPrimitiveType(const ngraph::helpers::EltwiseTypes& eltwise_type, + const ov::element::Type_t& element_type, + const std::vector>>& input_shapes) const { + return getPrimitiveType(); +} + +std::string CPUTestsBase::getPrimitiveType(const ngraph::helpers::ActivationTypes& activation_type, + const ov::element::Type_t& element_type, + const std::vector>>& input_shapes) const { + return getPrimitiveType(); +} + std::string CPUTestsBase::getPrimitiveType() const { std::string isaType; if (InferenceEngine::with_cpu_x86_avx512f()) { diff --git a/src/plugins/intel_cpu/tests/functional/test_utils/cpu_test_utils.hpp b/src/plugins/intel_cpu/tests/functional/test_utils/cpu_test_utils.hpp index d8deddfebe5d69..b333b26cdb2095 100644 --- a/src/plugins/intel_cpu/tests/functional/test_utils/cpu_test_utils.hpp +++ b/src/plugins/intel_cpu/tests/functional/test_utils/cpu_test_utils.hpp @@ -160,6 +160,14 @@ class CPUTestsBase { virtual bool primTypeCheck(std::string primType) const; protected: + std::string getPrimitiveType(const ngraph::helpers::EltwiseTypes& eltwise_type, + const ov::element::Type_t& element_type, + const std::vector>>& input_shapes) const; + + std::string getPrimitiveType(const ngraph::helpers::ActivationTypes& activation_type, + const ov::element::Type_t& element_type, + const std::vector>>& input_shapes) const; + std::string getPrimitiveType() const; std::string getISA(bool skip_amx) const; std::vector inFmts, outFmts; diff --git a/src/plugins/intel_cpu/tests/unit/nodes/eltwise_node_test.cpp b/src/plugins/intel_cpu/tests/unit/nodes/eltwise_node_test.cpp index 4b405b9d7b6ff5..5feed937bff86b 100644 --- a/src/plugins/intel_cpu/tests/unit/nodes/eltwise_node_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/nodes/eltwise_node_test.cpp @@ -19,7 +19,7 @@ TEST(EltwisePrecisionHelperTest, get_precision_mixed) { src_prc[i] = InferenceEngine::Precision::I32; } - std::vector eltwise_data = { + 
std::vector eltwise_data = { {Algorithm::EltwiseMultiply}, {Algorithm::EltwiseMulAdd} }; @@ -35,7 +35,7 @@ TEST(EltwisePrecisionHelperTest, get_precision_single) { src_prc[i] = InferenceEngine::Precision::I32; } - std::vector eltwise_data = { + std::vector eltwise_data = { {Algorithm::EltwiseMultiply}, {Algorithm::EltwiseMod} }; diff --git a/src/tests/functional/shared_test_classes/src/single_layer/eltwise.cpp b/src/tests/functional/shared_test_classes/src/single_layer/eltwise.cpp index 685aa4517c473e..fa11c95ddeca8e 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/eltwise.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/eltwise.cpp @@ -104,7 +104,7 @@ void EltwiseLayerTest::SetUp() { secondaryInput = param; parameters.push_back(param); } else { - ov::Shape shape = inputDynamicShapes.back().get_max_shape(); + ov::Shape shape = shape_input_secondary.get_max_shape(); switch (eltwiseType) { case ngraph::helpers::EltwiseTypes::DIVIDE: case ngraph::helpers::EltwiseTypes::MOD: @@ -114,7 +114,7 @@ void EltwiseLayerTest::SetUp() { break; } case ngraph::helpers::EltwiseTypes::POWER: - secondaryInput = ngraph::builder::makeConstant(netType, shape, {}, true, 3); + secondaryInput = ngraph::builder::makeConstant(netType, shape, {}, true, 3.5); break; default: secondaryInput = ngraph::builder::makeConstant(netType, shape, {}, true);