From 434e66d7fbbe1c763968a6e0aded8fc4f713f7e3 Mon Sep 17 00:00:00 2001 From: Marina Kolpakova Date: Sun, 14 Feb 2021 15:40:34 +0300 Subject: [PATCH] [CPU] Refactors jitters for nGraph interop (#4255) --- .../emitters/jit_bf16_emitters.hpp | 73 +++++ .../jit_eltwise_emitters.cpp | 274 +++++++++++++----- .../jit_eltwise_emitters.hpp | 179 +++++++++--- .../emitter.cpp => emitters/jit_emitter.cpp} | 23 +- .../emitter.h => emitters/jit_emitter.hpp} | 43 +-- .../jit_load_store_emitters.cpp | 80 ++--- .../jit_load_store_emitters.hpp} | 14 +- .../jit_mkldnn_emitters.cpp | 40 ++- .../jit_mkldnn_emitters.hpp | 43 ++- .../mkldnn_plugin/nodes/common/softmax.cpp | 5 +- .../nodes/mkldnn_eltwise_node.cpp | 101 +++---- .../nodes/mkldnn_interpolate_node.cpp | 5 +- .../mkldnn_plugin/nodes/mkldnn_mvn_node.cpp | 19 +- .../nodes/mkldnn_normalize_node.cpp | 5 +- .../nodes/mkldnn_reduce_node.cpp | 9 +- .../src/mkldnn_plugin/nodes/region_yolo.cpp | 5 +- .../src/mkldnn_plugin/utils/bfloat16.hpp | 65 +---- 17 files changed, 630 insertions(+), 353 deletions(-) create mode 100644 inference-engine/src/mkldnn_plugin/emitters/jit_bf16_emitters.hpp rename inference-engine/src/mkldnn_plugin/{nodes => emitters}/jit_eltwise_emitters.cpp (84%) rename inference-engine/src/mkldnn_plugin/{nodes => emitters}/jit_eltwise_emitters.hpp (62%) rename inference-engine/src/mkldnn_plugin/{nodes/common/emitter.cpp => emitters/jit_emitter.cpp} (91%) rename inference-engine/src/mkldnn_plugin/{nodes/common/emitter.h => emitters/jit_emitter.hpp} (73%) rename inference-engine/src/mkldnn_plugin/{nodes/common => emitters}/jit_load_store_emitters.cpp (88%) rename inference-engine/src/mkldnn_plugin/{nodes/common/jit_load_store_emitters.h => emitters/jit_load_store_emitters.hpp} (94%) rename inference-engine/src/mkldnn_plugin/{nodes => emitters}/jit_mkldnn_emitters.cpp (61%) rename inference-engine/src/mkldnn_plugin/{nodes => emitters}/jit_mkldnn_emitters.hpp (52%) diff --git 
a/inference-engine/src/mkldnn_plugin/emitters/jit_bf16_emitters.hpp b/inference-engine/src/mkldnn_plugin/emitters/jit_bf16_emitters.hpp new file mode 100644 index 00000000000000..1c6aa581278e2c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_bf16_emitters.hpp @@ -0,0 +1,73 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "jit_emitter.hpp" + +namespace MKLDNNPlugin { + +class jit_emu_vcvtneps2bf16 : public jit_emitter { +public: + jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::x64::jit_generator* host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::BF16) : jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); + }; + + size_t get_inputs_num() const override { return 1; }; + +private: + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs, + const std::vector& pool_vec_idxs, const std::vector& pool_gpr_idxs, + const emitter_context *emit_context) const override { + if (host_isa_ == mkldnn::impl::cpu::x64::cpu_isa_t::avx512_common) { + Xbyak::Zmm in = Xbyak::Zmm(in_vec_idxs[0]); + Xbyak::Ymm out = Xbyak::Ymm(out_vec_idxs[0]); + Xbyak::Zmm aux = Xbyak::Zmm(aux_vec_idxs[0]); + Xbyak::Zmm aux1 = Xbyak::Zmm(aux_vec_idxs[1]); + + h->uni_vpsrld(aux, in, 16); + h->vpandd(aux, aux, table_val("one")); + h->uni_vmovups(aux1, table_val("even")); + h->uni_vpaddd(aux, aux1, aux); + h->uni_vpaddd(aux, in, aux); + h->vfixupimmps(aux, in, table_val("selector"), 0); + h->vpsrad(aux, aux, 16); + h->vpmovdw(out, aux); + } else { + assert(!"unsupported isa"); + } + }; + + + inline int encode_fixup_selector(int input, int output) { + return ((output) << (4 * (input))); + } + + void register_table_entries() override { + enum { + fixup_input_code_qnan_ = 0, + fixup_input_code_snan_ = 1, + fixup_input_code_ninf_ = 4, + fixup_input_code_pinf_ = 5, + fixup_output_code_copy_input_ = 1, + 
fixup_output_code_qnan_input_ = 2, + }; + const int selector_int32 = + /* snan input to qnan output (preserving input bits 0..21) */ + encode_fixup_selector(fixup_input_code_snan_, fixup_output_code_qnan_input_) | + /* qnan input to qnan output (preserving input bits 0..21) */ + encode_fixup_selector(fixup_input_code_qnan_, fixup_output_code_qnan_input_) | + /* neg inf input copied to output */ + encode_fixup_selector(fixup_input_code_ninf_, fixup_output_code_copy_input_) | + /* pos inf input copied to output */ + encode_fixup_selector(fixup_input_code_pinf_, fixup_output_code_copy_input_); + push_arg_entry_of("one", 0x00000001, true); + push_arg_entry_of("even", 0x00007fff, true); + push_arg_entry_of("selector", selector_int32, true); + } + + size_t aux_vecs_count() const override { return 2; } +}; + +} // namespace MKLDNNPlugin \ No newline at end of file diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp similarity index 84% rename from inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp rename to inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp index 447c812f5ac191..aefca48ca3927b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.cpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.cpp @@ -2,13 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "common/emitter.h" #include "jit_eltwise_emitters.hpp" -#include "mkldnn_eltwise_node.h" #include #include "legacy/ie_layers.h" - +#include using namespace InferenceEngine; using namespace mkldnn::impl::utils; @@ -19,14 +17,16 @@ using namespace Xbyak; namespace MKLDNNPlugin { /// ADD /// +jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t
host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_add_emitter::get_inputs_num() { return 2; } +size_t jit_add_emitter::get_inputs_num() const { return 2; } void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -54,14 +54,16 @@ void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std } /// MUL_ADD /// +jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_mul_add_emitter::get_inputs_num() { return 3; } +size_t jit_mul_add_emitter::get_inputs_num() const { return 3; } void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -114,14 +116,16 @@ size_t jit_mul_add_emitter::aux_vecs_count() const { } /// SUB /// +jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, 
exec_prc) {} -size_t jit_subtract_emitter::get_inputs_num() { return 2; } +size_t jit_subtract_emitter::get_inputs_num() const { return 2; } void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -150,14 +154,16 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons /// MULTIPLY /// +jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_multiply_emitter::get_inputs_num() { return 2; } +size_t jit_multiply_emitter::get_inputs_num() const { return 2; } void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -186,14 +192,16 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons /// DIVIDE /// +jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_divide_emitter::get_inputs_num() { 
return 2; } +size_t jit_divide_emitter::get_inputs_num() const { return 2; } void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -251,14 +259,16 @@ size_t jit_divide_emitter::aux_vecs_count() const { } /// FLOOR_MOD /// +jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_floor_mod_emitter::get_inputs_num() { return 2; } +size_t jit_floor_mod_emitter::get_inputs_num() const { return 2; } void jit_floor_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -301,14 +311,16 @@ size_t jit_floor_mod_emitter::aux_vecs_count() const { } /// MOD /// +jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_mod_emitter::get_inputs_num() { return 2; } +size_t jit_mod_emitter::get_inputs_num() const { return 2; } void jit_mod_emitter::emit_impl(const 
std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -351,14 +363,16 @@ size_t jit_mod_emitter::aux_vecs_count() const { } /// MAXIMUM /// +jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_maximum_emitter::get_inputs_num() { return 2; } +size_t jit_maximum_emitter::get_inputs_num() const { return 2; } void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -399,14 +413,16 @@ std::set jit_maximum_emitter::get_supported_precisio } /// MINIMUM /// +jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_minimum_emitter::get_inputs_num() { return 2; } +size_t jit_minimum_emitter::get_inputs_num() const { return 2; } void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const 
std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -447,14 +463,17 @@ std::set jit_minimum_emitter::get_supported_precisio } /// SQUARED_DIFFERENCE /// +jit_squared_difference_emitter::jit_squared_difference_emitter( + jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_squared_difference_emitter::get_inputs_num() { return 2; } +size_t jit_squared_difference_emitter::get_inputs_num() const { return 2; } void jit_squared_difference_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -486,14 +505,16 @@ void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_ /// POWER_DYNAMIC /// +jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -size_t jit_power_dynamic_emitter::get_inputs_num() { return 2; } +size_t jit_power_dynamic_emitter::get_inputs_num() const { return 2; } void jit_power_dynamic_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector 
&out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -593,16 +614,20 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, /// EQUAL /// +jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_equal_emitter::get_inputs_num() { return 2; } +size_t jit_equal_emitter::get_inputs_num() const { return 2; } void jit_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -650,16 +675,20 @@ size_t jit_equal_emitter::aux_vecs_count() const { } /// NOT_EQUAL /// +jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_not_equal_emitter::get_inputs_num() { return 2; } +size_t jit_not_equal_emitter::get_inputs_num() const { return 2; } void jit_not_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector 
&out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -707,16 +736,20 @@ size_t jit_not_equal_emitter::aux_vecs_count() const { } /// GREATER /// +jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_greater_emitter::get_inputs_num() { return 2; } +size_t jit_greater_emitter::get_inputs_num() const { return 2; } void jit_greater_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -764,16 +797,20 @@ size_t jit_greater_emitter::aux_vecs_count() const { } /// GREATER_EQUAL /// +jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_greater_equal_emitter::get_inputs_num() { return 2; } +size_t jit_greater_equal_emitter::get_inputs_num() const { return 2; } void jit_greater_equal_emitter::emit_impl(const std::vector 
&in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -821,16 +858,20 @@ size_t jit_greater_equal_emitter::aux_vecs_count() const { } /// LESS /// +jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_less_emitter::get_inputs_num() { return 2; } +size_t jit_less_emitter::get_inputs_num() const { return 2; } void jit_less_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -878,16 +919,20 @@ size_t jit_less_emitter::aux_vecs_count() const { } /// LESS_EQUAL /// +jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_less_equal_emitter::get_inputs_num() { return 2; } +size_t jit_less_equal_emitter::get_inputs_num() const { return 2; } void jit_less_equal_emitter::emit_impl(const std::vector &in_vec_idxs, const 
std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -936,16 +981,20 @@ size_t jit_less_equal_emitter::aux_vecs_count() const { } /// LOGICAL_AND /// +jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_logical_and_emitter::get_inputs_num() { return 2; } +size_t jit_logical_and_emitter::get_inputs_num() const { return 2; } void jit_logical_and_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -1014,16 +1063,20 @@ size_t jit_logical_and_emitter::aux_vecs_count() const { /// LOGICAL_OR /// +jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_logical_or_emitter::get_inputs_num() { return 2; } +size_t jit_logical_or_emitter::get_inputs_num() const { return 2; } void 
jit_logical_or_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -1091,16 +1144,20 @@ size_t jit_logical_or_emitter::aux_vecs_count() const { } /// LOGICAL_XOR /// +jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_logical_xor_emitter::get_inputs_num() { return 2; } +size_t jit_logical_xor_emitter::get_inputs_num() const { return 2; } void jit_logical_xor_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -1168,16 +1225,20 @@ size_t jit_logical_xor_emitter::aux_vecs_count() const { } /// LOGICAL_NOT /// +jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -size_t jit_logical_not_emitter::get_inputs_num() { return 1; } +size_t 
jit_logical_not_emitter::get_inputs_num() const { return 1; } void jit_logical_not_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -1224,16 +1285,44 @@ size_t jit_logical_not_emitter::aux_vecs_count() const { } /// POWER_STATIC /// +jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + auto parent = node->input(1).get_source_output().get_node_shared_ptr(); + if (!std::dynamic_pointer_cast(parent)) { + throw ngraph::ngraph_error("unsupported non constant power"); + } + + if (!(node->input(1).get_shape() == ngraph::Shape() || ngraph::shape_size(node->input(1).get_shape()) == 1)) { + throw ngraph::ngraph_error("unsupported non scalar power"); + } + power = ngraph::as_type_ptr(parent)->get_data_ptr()[0]; + scale = 1.f; + shift = 0.f; + push_arg_entry_of("power", float2int(power), true); + push_arg_entry_of("scale", 0x3f800000, true); + push_arg_entry_of("shift", 0x00000000, true); + push_arg_entry_of("one", 0x3f800000, true); + + prepare_table(); +} jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { + auto *powerLayer = dynamic_cast(node->getCnnLayer().get()); + if (powerLayer == nullptr) + THROW_IE_EXCEPTION << "Cannot convert power layer."; + + power = powerLayer->power; + scale = powerLayer->scale; + shift = powerLayer->offset; + prepare_table(); } -size_t jit_power_static_emitter::get_inputs_num() { return 1; } +size_t jit_power_static_emitter::get_inputs_num() const { return 1; } void 
jit_power_static_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -1252,14 +1341,6 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, Vmm vmm_dst = Vmm(out_vec_idxs[0]); Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - auto *powerLayer = dynamic_cast(n->getCnnLayer().get()); - if (powerLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot convert power layer."; - - float power = powerLayer->power; - float scale = powerLayer->scale; - float shift = powerLayer->offset; - Xmm xmm0 = Xmm(0), xmm1 = Xmm(1); if (scale != 1.f || shift != 0.f) { @@ -1394,17 +1475,9 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, } void jit_power_static_emitter::register_table_entries() { - auto *powerLayer = dynamic_cast(n->getCnnLayer().get()); - if (powerLayer == nullptr) - THROW_IE_EXCEPTION << "Cannot convert power layer."; - - float power_ = powerLayer->power; - float scale_ = powerLayer->scale; - float shift_ = powerLayer->offset; - - push_arg_entry_of("power", float2int(power_), true); - push_arg_entry_of("scale", float2int(scale_), true); - push_arg_entry_of("shift", float2int(shift_), true); + push_arg_entry_of("power", float2int(power), true); + push_arg_entry_of("scale", float2int(scale), true); + push_arg_entry_of("shift", float2int(shift), true); push_arg_entry_of("one", float2int(1.f), true); } @@ -1413,16 +1486,19 @@ size_t jit_power_static_emitter::aux_vecs_count() const { } /// PRELU /// +jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) { + prepare_table(); +} jit_prelu_emitter::jit_prelu_emitter(jit_generator 
*host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } - -size_t jit_prelu_emitter::get_inputs_num() { return 2; } +size_t jit_prelu_emitter::get_inputs_num() const { return 2; } void jit_prelu_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == cpu::x64::avx2) { @@ -1469,4 +1545,64 @@ size_t jit_prelu_emitter::aux_vecs_count() const { return 2; } +/// SQRT /// +jit_sqrt_emitter::jit_sqrt_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} +jit_sqrt_emitter::jit_sqrt_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_sqrt_emitter::get_inputs_num() const { return 1; } + +void jit_sqrt_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const { + if (host_isa_ == cpu::x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::x64::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_sqrt_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src0 = Vmm(in_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + + h->uni_vsqrtps(vmm_dst, vmm_src0); +} + +/// Negate /// 
+jit_negative_emitter::jit_negative_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) +: jit_emitter(host, host_isa, node, exec_prc) {} + +size_t jit_negative_emitter::get_inputs_num() const { return 1; } + +void jit_negative_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const { + if (host_isa_ == cpu::x64::sse41) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::x64::avx2) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else if (host_isa_ == cpu::x64::avx512_common) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + assert(!"unsupported isa"); + } +} + +template +void jit_negative_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + using Vmm = typename conditional3::type; + Vmm vmm_src = Vmm(in_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); + h->uni_vpxor(vmm_dst, vmm_dst, vmm_dst); + h->uni_vsubps(vmm_dst, vmm_dst, vmm_src); +} + } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp b/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.hpp similarity index 62% rename from inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp rename to inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.hpp index fb8d2e16fb1480..0fe92fa50180ad 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/jit_eltwise_emitters.hpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_eltwise_emitters.hpp @@ -4,8 +4,8 @@ #pragma once -#include "common/emitter.h" #include +#include "jit_emitter.hpp" #include "mkldnn_node.h" namespace MKLDNNPlugin { @@ -14,13 +14,15 @@ class jit_add_emitter : public jit_emitter { public: jit_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* 
node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -30,13 +32,15 @@ class jit_mul_add_emitter : public jit_emitter { public: jit_mul_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_mul_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -49,13 +53,15 @@ class jit_subtract_emitter : public jit_emitter { public: jit_subtract_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_subtract_emitter(mkldnn::impl::cpu::x64::jit_generator 
*host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -66,13 +72,15 @@ class jit_multiply_emitter : public jit_emitter { public: jit_multiply_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_multiply_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -83,14 +91,16 @@ class jit_divide_emitter : public jit_emitter { public: jit_divide_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_divide_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = 
InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; static std::set get_supported_precisions(); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -102,13 +112,15 @@ class jit_floor_mod_emitter : public jit_emitter { public: jit_floor_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_floor_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -120,13 +132,15 @@ class jit_mod_emitter : public jit_emitter { public: jit_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() 
const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -138,14 +152,16 @@ class jit_maximum_emitter : public jit_emitter { public: jit_maximum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_maximum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; static std::set get_supported_precisions(); private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -156,14 +172,16 @@ class jit_minimum_emitter : public jit_emitter { public: jit_minimum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_minimum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; static std::set get_supported_precisions(); private: void emit_impl(const 
std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -174,13 +192,15 @@ class jit_squared_difference_emitter : public jit_emitter { public: jit_squared_difference_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_squared_difference_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -191,13 +211,15 @@ class jit_power_dynamic_emitter : public jit_emitter { public: jit_power_dynamic_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_power_dynamic_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector 
&pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -208,13 +230,15 @@ class jit_equal_emitter : public jit_emitter { public: jit_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -228,13 +252,15 @@ class jit_not_equal_emitter : public jit_emitter { public: jit_not_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_not_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) 
const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -248,13 +274,15 @@ class jit_greater_emitter : public jit_emitter { public: jit_greater_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_greater_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -268,13 +296,15 @@ class jit_greater_equal_emitter : public jit_emitter { public: jit_greater_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_greater_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ 
-288,13 +318,15 @@ class jit_less_emitter : public jit_emitter { public: jit_less_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_less_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -309,12 +341,15 @@ class jit_less_equal_emitter : public jit_emitter { jit_less_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + jit_less_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -328,13 +363,15 @@ class jit_logical_and_emitter : public jit_emitter { public: jit_logical_and_emitter(mkldnn::impl::cpu::x64::jit_generator 
*host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_logical_and_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -348,13 +385,15 @@ class jit_logical_or_emitter : public jit_emitter { public: jit_logical_or_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_logical_or_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -368,13 +407,15 @@ class jit_logical_xor_emitter : public jit_emitter { public: jit_logical_xor_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = 
InferenceEngine::Precision::FP32); + jit_logical_xor_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -387,13 +428,15 @@ class jit_logical_not_emitter : public jit_emitter { public: jit_logical_not_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_logical_not_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -405,33 +448,41 @@ class jit_logical_not_emitter : public jit_emitter { class jit_power_static_emitter : public jit_emitter { public: jit_power_static_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + 
jit_power_static_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; + + float power; + float scale; + float shift; }; class jit_prelu_emitter : public jit_emitter { public: jit_prelu_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_prelu_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; @@ -439,4 +490,38 @@ class jit_prelu_emitter : public jit_emitter { size_t aux_vecs_count() const override; }; +class jit_sqrt_emitter : public jit_emitter { +public: + jit_sqrt_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + 
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_sqrt_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + + size_t get_inputs_num() const override; + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, + const emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + +class jit_negative_emitter : public jit_emitter { +public: + jit_negative_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + + size_t get_inputs_num() const override; + +private: + void emit_impl(const std::vector& in, const std::vector& out, + const std::vector& pool, const std::vector& gpr, + const MKLDNNPlugin::emitter_context *emit_context) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_emitter.cpp similarity index 91% rename from inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp rename to inference-engine/src/mkldnn_plugin/emitters/jit_emitter.cpp index 1aa4744b249bdf..2b3bedb1fe16a9 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/emitter.cpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_emitter.cpp @@ -2,10 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "emitter.h" - +#include "jit_emitter.hpp" #include "utils/general_utils.h" - #include using namespace mkldnn::impl::cpu; @@ -57,7 +55,7 @@ std::set 
jit_emitter::get_supported_precisions() { } void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { using namespace Xbyak::util; bool is_vec_input = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::vec_to_gpr); bool is_vec_output = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::gpr_to_vec); @@ -148,7 +146,8 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std load_table_addr(); } -void jit_emitter::emitter_postamble() { + +void jit_emitter::emitter_postamble() const { using namespace Xbyak::util; for (size_t i = 0; i < preserved_vec_idxs.size(); ++i) @@ -167,9 +166,9 @@ void jit_emitter::emitter_postamble() { aux_gpr_idxs.clear(); } -void jit_emitter::emit_table() { +void jit_emitter::emit_data() const { h->align(64); - h->L(l_table); + h->L(*l_table.get()); // Assumption: entries can be inserted with dd, so they should be 4 bytes. 
assert(sizeof(table_entry_val_t) == 4); @@ -198,8 +197,8 @@ void jit_emitter::prepare_table() { } } -void jit_emitter::emit(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { +void jit_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); emit_impl(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs, nullptr); @@ -207,9 +206,9 @@ void jit_emitter::emit(const std::vector &in_idxs, const std::vector &in_idxs, const std::vector &out_idxs, - const std::shared_ptr &emit_context, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { +void jit_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, + const std::shared_ptr &emit_context, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); emit_impl(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs, emit_context.get()); diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/emitter.h b/inference-engine/src/mkldnn_plugin/emitters/jit_emitter.hpp similarity index 73% rename from inference-engine/src/mkldnn_plugin/nodes/common/emitter.h rename to inference-engine/src/mkldnn_plugin/emitters/jit_emitter.hpp index 5f6428c2fa50a6..dd7fce5a570c0b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/emitter.h +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_emitter.hpp @@ -6,6 +6,7 @@ #include #include + #include "mkldnn_node.h" #include @@ -25,20 +26,26 @@ struct emitter_context { class jit_emitter { public: - jit_emitter(mkldnn::impl::cpu::x64::jit_generator* host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, 
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) - : h(host), host_isa_(host_isa), n(node), exec_prc_(exec_prc), in_out_type_(in_out_type) { + : h(host), host_isa_(host_isa), exec_prc_(exec_prc), in_out_type_(in_out_type), l_table (new Xbyak::Label()) { k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well } - virtual void emit(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}); + jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) + : h(host), host_isa_(host_isa), exec_prc_(exec_prc), in_out_type_(in_out_type), l_table (new Xbyak::Label()) { + k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well + } + + virtual void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, + const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const; + virtual void emit_data() const; - virtual void emit(const std::vector &in_idxs, const std::vector &out_idxs, + virtual void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, const std::shared_ptr &emit_context, const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}); - virtual void emit_table(); - virtual size_t get_inputs_num() = 0; + virtual size_t get_inputs_num() const = 0; virtual size_t aux_vecs_count() const; static std::set get_supported_precisions(); @@ -48,17 +55,15 @@ class jit_emitter { size_t get_max_vecs_count() const; size_t get_vec_length() const; - const MKLDNNNode* n; mkldnn::impl::cpu::x64::jit_generator* h; mkldnn::impl::cpu::x64::cpu_isa_t host_isa_; InferenceEngine::Precision exec_prc_; - 
Xbyak::Opmask k_mask; virtual void prepare_table(); virtual void register_table_entries() {} - void load_table_addr() { h->mov(p_table, l_table); } + void load_table_addr() const { h->mov(p_table, *l_table.get()); } // we accept only 32bit hexadecimal table values to avoid any rounding using table_entry_val_t = uint32_t; @@ -75,8 +80,8 @@ class jit_emitter { table_entry_bcast_t bcast; }; - Xbyak::Reg64 p_table; - Xbyak::Label l_table; + mutable Xbyak::Reg64 p_table; + mutable std::shared_ptr l_table; enum { _cmp_eq_oq = mkldnn::impl::cpu::x64::jit_generator::_cmp_eq_oq, @@ -89,16 +94,16 @@ class jit_emitter { virtual void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) = 0; + const emitter_context *emit_context) const = 0; virtual void emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs); - virtual void emitter_postamble(); + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const; + virtual void emitter_postamble() const; emitter_in_out_map in_out_type_; - std::vector aux_vec_idxs; - std::vector aux_gpr_idxs; + mutable std::vector aux_vec_idxs; + mutable std::vector aux_gpr_idxs; static constexpr int k_mask_size = 8; @@ -126,8 +131,8 @@ class jit_emitter { } private: - std::vector preserved_vec_idxs; - std::vector preserved_gpr_idxs; + mutable std::vector preserved_vec_idxs; + mutable std::vector preserved_gpr_idxs; void push_vec(const Xbyak::Address &addr, size_t vec_idx) const; void pop_vec(size_t vec_idx, const Xbyak::Address &addr) const; diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/jit_load_store_emitters.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp similarity index 88% rename from inference-engine/src/mkldnn_plugin/nodes/common/jit_load_store_emitters.cpp rename to 
inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp index 75850b92b79eea..dc267ac9866b07 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/jit_load_store_emitters.cpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "emitter.h" -#include "jit_load_store_emitters.h" +#include "jit_emitter.hpp" +#include "jit_load_store_emitters.hpp" #include "legacy/ie_layers.h" #include #include "utils/bfloat16.hpp" @@ -21,12 +21,12 @@ namespace MKLDNNPlugin { /// LOAD /// jit_load_emitter::jit_load_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc, emitter_in_out_map in_out_type) -: jit_emitter(host, host_isa, node, exec_prc, in_out_type) { +: jit_emitter(host, host_isa, node, exec_prc, in_out_type), name(node ? node->getName() : "unknown") { prepare_table(); v_len_elt = get_vec_length() / exec_prc.size(); } -size_t jit_load_emitter::get_inputs_num() { return 1; } +size_t jit_load_emitter::get_inputs_num() const { return 1; } // 0 for temp reg for mask load, 1 for table address size_t jit_load_emitter::aux_gprs_count() const { @@ -35,10 +35,10 @@ size_t jit_load_emitter::aux_gprs_count() const { void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { const auto* load_emitter_context = dynamic_cast(emit_context); if (load_emitter_context == nullptr) { - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " does not get load emmiter context."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " does not get load emmiter context."; } if (host_isa_ == cpu::x64::sse41) { @@ -51,7 +51,7 @@ void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std:: emit_isa(Reg64(in_idxs[0]), 
load_emitter_context->offset_byte_, load_emitter_context->src_prc_, static_cast(out_idxs[0]), load_emitter_context->dst_prc_, load_emitter_context->load_num_, load_emitter_context->is_fill_, load_emitter_context->fill_value_); } else { - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " is performed on unsupported isa(at least x64::sse41)."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " is performed on unsupported isa(at least x64::sse41)."; } } @@ -60,10 +60,10 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, int offset_byte, In const int out_vec_idx, InferenceEngine::Precision dst_prc, int load_num, bool is_fill, std::string fill_value) const { bool matched_prc = (dst_prc == src_prc) || (dst_prc == Precision::FP32) || (dst_prc == Precision::I32); if (!matched_prc) { - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " only support output precision of FP32 or I32 or the same precision as input."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " only support output precision of FP32 or I32 or the same precision as input."; } if (load_num > (get_vec_length() / dst_prc.size())) { - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " have unexpected number of elements to load."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " have unexpected number of elements to load."; } using Vmm = typename conditional3::type; @@ -94,7 +94,7 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, int offset_byte, In load_words_to_dword_extension(Vmm(out_vec_idx), reg_src, offset_byte, true, false, load_num * src_prc.size(), is_fill, fill_value); break; default: - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unsupported src precision to load."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unsupported src precision to load."; } } @@ -140,12 +140,12 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int o // Ensure data fits completely inside the 
Xmm/Ymm/Zmm register if (load_size < 0 || load_size > 64) - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unexpected number of values to load in load_byte."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unexpected number of values to load in load_byte."; // check if proper number bytes fit inside the Xmm/Ymm register if (is_ymm && load_size > 32) - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unexpected number of values to load to ymm in load_byte."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unexpected number of values to load to ymm in load_byte."; if (is_xmm && load_size > 16) - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unexpected number of values to load to xmm in load_byte."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unexpected number of values to load to xmm in load_byte."; auto xmm = Xbyak::Xmm(vmm.getIdx()); auto ymm = Xbyak::Ymm(vmm.getIdx()); @@ -239,7 +239,7 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int o break; case 16: break; default: - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unexpected number of values to load in load_byte."; + THROW_IE_EXCEPTION << "Load emitter in " << name<< " has unexpected number of values to load in load_byte."; } if (has_xmm_block) { @@ -295,11 +295,11 @@ void jit_load_emitter::load_bytes_to_dword_extension(const Vmm &vmm, const Xbyak // For Ymm register, load capacity is halved (32 * load_size <= 256) // For Xmm register, load capacity is halved further (32 * load_size <= 128) if (load_size < 0 || load_size > 16) - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unexpected number of values to load in load_bytes_to_dword_extension."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unexpected number of values to load in load_bytes_to_dword_extension."; if (is_ymm && load_size > 8) - THROW_IE_EXCEPTION << "Load emitter in " << 
n->getName() << " has unexpected number of values to load to ymm in load_bytes_to_dword_extension."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unexpected number of values to load to ymm in load_bytes_to_dword_extension."; if (is_xmm && load_size > 4) - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unexpected number of values to load to xmm in load_bytes_to_dword_extension."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unexpected number of values to load to xmm in load_bytes_to_dword_extension."; // For load_size == 4/8/16, do load/extension in one go if (load_size == 16) { @@ -380,11 +380,11 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak // For Ymm register, load capacity is halved (16/2(num) * 32 <= 128) // For Xmm register, load capacity is halved again (8/2(num) * 32 <= 128) if (load_size < 0 || load_size > 32) - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unexpected number of values to load in load_words_to_dword_extension."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unexpected number of values to load in load_words_to_dword_extension."; if (is_ymm && load_size > 16) - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unexpected number of values to load to ymm in load_words_to_dword_extension."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unexpected number of values to load to ymm in load_words_to_dword_extension."; if (is_xmm && load_size > 8) - THROW_IE_EXCEPTION << "Load emitter in " << n->getName() << " has unexpected number of values to load to xmm in load_words_to_dword_extension."; + THROW_IE_EXCEPTION << "Load emitter in " << name << " has unexpected number of values to load to xmm in load_words_to_dword_extension."; auto xmm = Xbyak::Xmm(vmm.getIdx()); auto ymm = Xbyak::Ymm(vmm.getIdx()); @@ -491,7 +491,7 @@ void jit_load_emitter::register_table_entries() { /// STORE /// 
jit_store_emitter::jit_store_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc, emitter_in_out_map in_out_type) -: jit_emitter(host, host_isa, node, exec_prc, in_out_type) { +: jit_emitter(host, host_isa, node, exec_prc, in_out_type), name(node ? node->getName() : "unknown") { v_len_elt = get_vec_length() / exec_prc.size(); if (!mayiuse(cpu::x64::avx512_core_bf16) && mayiuse(cpu::x64::avx512_core)) { emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(host, host_isa, nullptr)); @@ -508,14 +508,14 @@ size_t jit_store_emitter::aux_vecs_count() const { return 1; } -size_t jit_store_emitter::get_inputs_num() { return 1; } +size_t jit_store_emitter::get_inputs_num() const { return 1; } void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) { + const emitter_context *emit_context) const { const auto* store_emitter_context = dynamic_cast(emit_context); if (store_emitter_context == nullptr) { - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " does not get store emmiter context."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " does not get store emmiter context."; } if (host_isa_ == cpu::x64::sse41) { emit_isa(static_cast(in_idxs[0]), store_emitter_context->src_prc_, Reg64(out_idxs[0]), @@ -527,7 +527,7 @@ void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std: emit_isa(static_cast(in_idxs[0]), store_emitter_context->src_prc_, Reg64(out_idxs[0]), store_emitter_context->offset_byte_, store_emitter_context->dst_prc_, store_emitter_context->store_num_); } else { - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " is performed on unsupported isa(at least x64::sse41)."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " is performed on unsupported isa(at least x64::sse41)."; } } @@ -536,12 +536,12 @@ template const Xbyak::Reg64 ®_dst, int 
offset_byte, InferenceEngine::Precision dst_prc, int store_num) const { bool matched_prc = (src_prc == dst_prc) || (src_prc == Precision::FP32) || (src_prc == Precision::I32); if (!matched_prc) { - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " only support input precision of FP32 or I32 or the same precision as output."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " only support input precision of FP32 or I32 or the same precision as output."; } if ((src_prc == Precision::FP32) || (src_prc == Precision::I32)) { if ((isa == cpu::x64::sse41 && store_num > 4) || (isa == cpu::x64::avx2 && store_num > 8) || (isa == cpu::x64::avx512_common && store_num > 16) || store_num < 0) { - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store."; } } @@ -586,7 +586,7 @@ template store_dword_to_word_extension(Vmm(in_vec_idx), reg_dst, offset_byte, true, false, store_num); break; default: - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unsupported dst precision to store."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unsupported dst precision to store."; } } } @@ -618,11 +618,11 @@ template // Ensure data fits completely inside the Xmm/Ymm/Zmm register if (store_size < 0 || store_size > 64) - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store in store_bytes."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store in store_bytes."; if (is_ymm && store_size > 32) - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store to ymm in store_bytes."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store to ymm in store_bytes."; if (is_xmm && store_size > 16) - THROW_IE_EXCEPTION << "Store 
emitter in " << n->getName() << " has unexpected number of values to store to xmm in store_bytes."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store to xmm in store_bytes."; auto xmm = Xbyak::Xmm(vmm.getIdx()); auto ymm = Xbyak::Ymm(vmm.getIdx()); @@ -718,14 +718,14 @@ template break; case 16: break; default: - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store in store_bytes."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store in store_bytes."; } } } } /** -* store_dword_to_byte_extension is the utility function to +* store_dword_to_byte_extension is the utility function to * 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes with singed or unsinged saturation. * 2. store the packed byte into the memory referenced by ptr[reg + offset] address. */ @@ -743,11 +743,11 @@ template // At most 8 dwords can fit inside the Ymm register // At most 4 dwords can fit inside the Xmm register if (store_num < 0 || store_num > 16) - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store in store_dword_to_byte_extension."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store in store_dword_to_byte_extension."; if (is_ymm && store_num > 8) - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store to ymm in store_dword_to_byte_extension."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store to ymm in store_dword_to_byte_extension."; if (is_xmm && store_num > 4) - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store to xmm in store_dword_to_byte_extension."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store to xmm in 
store_dword_to_byte_extension."; auto ymm = Xbyak::Ymm(vmm.getIdx()); @@ -816,11 +816,11 @@ template // At most 4 dwords can fit inside the Xmm register // At most 8 dwords can fit inside the Ymm register if (store_num < 0 || store_num > 16) - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store in store_dword_to_word_extension."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store in store_dword_to_word_extension."; if (is_ymm && store_num > 8) - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store to ymm in store_dword_to_word_extension."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store to ymm in store_dword_to_word_extension."; if (is_xmm && store_num > 4) - THROW_IE_EXCEPTION << "Store emitter in " << n->getName() << " has unexpected number of values to store to xmm in store_dword_to_word_extension."; + THROW_IE_EXCEPTION << "Store emitter in " << name << " has unexpected number of values to store to xmm in store_dword_to_word_extension."; auto ymm = Xbyak::Ymm(vmm.getIdx()); auto zmm = Xbyak::Zmm(vmm.getIdx()); @@ -829,7 +829,7 @@ template if (mayiuse(cpu::x64::avx512_core_bf16)) { h->vcvtneps2bf16(ymm, zmm); } else { - emu_vcvtneps2bf16->emit({static_cast(vmm.getIdx())}, {static_cast(ymm.getIdx())}); + emu_vcvtneps2bf16->emit_code({static_cast(vmm.getIdx())}, {static_cast(ymm.getIdx())}); } if (store_num == 16) { h->vmovdqu16(ptr[reg + offset], ymm); diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/jit_load_store_emitters.h b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.hpp similarity index 94% rename from inference-engine/src/mkldnn_plugin/nodes/common/jit_load_store_emitters.h rename to inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.hpp index 332d54903e5b99..0f434c2406e3ad 100644 --- 
a/inference-engine/src/mkldnn_plugin/nodes/common/jit_load_store_emitters.h +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.hpp @@ -4,10 +4,10 @@ #pragma once -#include "emitter.h" +#include "jit_emitter.hpp" #include #include "mkldnn_node.h" -#include "utils/bfloat16.hpp" +#include "jit_bf16_emitters.hpp" using namespace mkldnn::impl; using namespace mkldnn::impl::cpu::x64; @@ -66,9 +66,9 @@ class jit_load_emitter : public jit_emitter { */ void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; - size_t get_inputs_num() override; + size_t get_inputs_num() const override; private: template @@ -94,6 +94,7 @@ class jit_load_emitter : public jit_emitter { size_t aux_gprs_count() const override; + std::string name; int v_len_elt; // 4/8/16 }; @@ -119,9 +120,9 @@ class jit_store_emitter : public jit_emitter { */ void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context) override; + const emitter_context *emit_context) const override; - size_t get_inputs_num() override; + size_t get_inputs_num() const override; std::shared_ptr get_emu_vcvtneps2bf16() const { return emu_vcvtneps2bf16; @@ -144,6 +145,7 @@ class jit_store_emitter : public jit_emitter { size_t aux_gprs_count() const override; size_t aux_vecs_count() const override; + std::string name; int v_len_elt; // 4/8/16 std::shared_ptr emu_vcvtneps2bf16; }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_mkldnn_emitters.cpp similarity index 61% rename from inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp rename to inference-engine/src/mkldnn_plugin/emitters/jit_mkldnn_emitters.cpp index 
84132993e026e6..6264be3bf42872 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.cpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_mkldnn_emitters.cpp @@ -2,10 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "common/emitter.h" #include "jit_mkldnn_emitters.hpp" -#include "mkldnn_eltwise_node.h" -#include "legacy/ie_layers.h" +#include "nodes/mkldnn_eltwise_node.h" using namespace mkldnn::impl::utils; using namespace mkldnn::impl; @@ -14,30 +12,45 @@ using namespace Xbyak; namespace MKLDNNPlugin { +jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, InferenceEngine::Precision exec_prc) + : jit_emitter(host, host_isa, node, exec_prc) { + + kind = mkldnn_eltwise_tanh; + alpha = 0.f; + beta = 0.f; + + set_injector(); +} + jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) { - auto& eltwiseNode = dynamic_cast(*n); + auto eltwiseNode = dynamic_cast(node); + kind = static_cast(eltwiseNode->getAlgorithm()); + alpha = eltwiseNode->getAlpha(); + beta = eltwiseNode->getBeta(); - auto alg = static_cast(eltwiseNode.getAlgorithm()); + set_injector(); +} +void jit_mkldnn_emitter::set_injector() { if (host_isa_ == cpu::x64::sse41) { eltwise_injector_sse42 = std::make_shared>( - host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta(), 1); + h, kind, alpha, beta, 1); } else if (host_isa_ == cpu::x64::avx2) { eltwise_injector_avx2 = std::make_shared>( - host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta(), 1); + h, kind, alpha, beta, 1); } else if (host_isa_ == cpu::x64::avx512_common) { eltwise_injector_avx512_common = std::make_shared>( - host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta(), 1); + h, kind, alpha, beta, 1); } else { assert(!"unsupported isa"); } } -size_t jit_mkldnn_emitter::get_inputs_num() { return 1; } +size_t 
jit_mkldnn_emitter::get_inputs_num() const { return 1; } -void jit_mkldnn_emitter::emit(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { +void jit_mkldnn_emitter::emit_code(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { if (host_isa_ == cpu::x64::sse41) { if (out_vec_idxs[0] != in_vec_idxs[0]) h->uni_vmovups(Xmm(out_vec_idxs[0]), Xmm(in_vec_idxs[0])); @@ -55,7 +68,7 @@ void jit_mkldnn_emitter::emit(const std::vector &in_vec_idxs, const std: } } -void jit_mkldnn_emitter::emit_table() { +void jit_mkldnn_emitter::emit_data() const { if (host_isa_ == cpu::x64::sse41) { eltwise_injector_sse42->prepare_table(); } else if (host_isa_ == cpu::x64::avx2) { @@ -67,5 +80,8 @@ void jit_mkldnn_emitter::emit_table() { } } +jit_mkldnn_aux_emitter::jit_mkldnn_aux_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc) + : jit_mkldnn_emitter(host, host_isa, node, exec_prc) { +} } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp b/inference-engine/src/mkldnn_plugin/emitters/jit_mkldnn_emitters.hpp similarity index 52% rename from inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp rename to inference-engine/src/mkldnn_plugin/emitters/jit_mkldnn_emitters.hpp index dd3bd9daa3c148..1dfc9b5f4cf420 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/jit_mkldnn_emitters.hpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_mkldnn_emitters.hpp @@ -4,34 +4,51 @@ #pragma once -#include "common/emitter.h" #include -#include "mkldnn_node.h" #include +#include "jit_emitter.hpp" +#include "mkldnn_node.h" + namespace MKLDNNPlugin { class jit_mkldnn_emitter : public jit_emitter { public: - jit_mkldnn_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t 
host_isa, const MKLDNNNode* node, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + void emit_code(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, + const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const override; - size_t get_inputs_num() override; - - void emit(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) override; - - void emit_table() override; + void emit_data() const override; void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, - const emitter_context *emit_context = nullptr) override {}; + const emitter_context *emit_context = nullptr) const override {}; +protected: + jit_mkldnn_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_mkldnn_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + void set_injector(); -private: + mkldnn_alg_kind_t kind {mkldnn_alg_kind_undef}; + float alpha {0.f}; + float beta {0.f}; + +protected: std::shared_ptr> eltwise_injector_sse42; std::shared_ptr> eltwise_injector_avx2; std::shared_ptr> eltwise_injector_avx512_common; + +private: + size_t get_inputs_num() const override; +}; + +class jit_mkldnn_aux_emitter : public jit_mkldnn_emitter { +public: + jit_mkldnn_aux_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + +private: }; -} // namespace MKLDNNPlugin +} // namespace MKLDNNPlugin \ No newline at end of file diff --git 
a/inference-engine/src/mkldnn_plugin/nodes/common/softmax.cpp b/inference-engine/src/mkldnn_plugin/nodes/common/softmax.cpp index 3358215c5d262a..3bb1e83e4b23b7 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/softmax.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/common/softmax.cpp @@ -9,6 +9,7 @@ #include #include // TODO: just to replace mkldnn->dnnl via macros #include "utils/bfloat16.hpp" +#include "emitters/jit_bf16_emitters.hpp" #include #include @@ -162,7 +163,7 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge this->postamble(); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16->emit_table(); + emu_vcvtneps2bf16->emit_data(); exp_injector->prepare_table(); } @@ -218,7 +219,7 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge if (mayiuse(avx512_core_bf16)) vcvtneps2bf16(ymm_dst, vmm_dst); else - emu_vcvtneps2bf16->emit({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); + emu_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); vmovdqu16(op, ymm_dst); break; default: diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index 438a0161dc3440..320309738b7ae9 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -15,9 +15,11 @@ #include "mkldnn_extension_utils.h" #include "mkldnn_quantize_node.h" #include "mkldnn_pooling_node.h" -#include "common/emitter.h" -#include "jit_eltwise_emitters.hpp" -#include "jit_mkldnn_emitters.hpp" + +#include "emitters/jit_emitter.hpp" +#include "emitters/jit_eltwise_emitters.hpp" +#include "emitters/jit_mkldnn_emitters.hpp" +#include "emitters/jit_bf16_emitters.hpp" #include #include @@ -293,11 +295,11 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, 
pu this->postamble(); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16->emit_table(); + emu_vcvtneps2bf16->emit_data(); - eltwise_emitter->emit_table(); + eltwise_emitter->emit_data(); for (int i = 0; i < post_op_emitters.size(); i++) { - post_op_emitters[i]->emit_table(); + post_op_emitters[i]->emit_data(); } } @@ -363,25 +365,25 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, pu std::set precisions; OV_SWITCH(MKLDNNPlugin, SupportedPrecisions, precisions, eltwiseNode.getOpType(), - OV_CASE(Relu, jit_mkldnn_emitter), - OV_CASE(Gelu, jit_mkldnn_emitter), - OV_CASE(Elu, jit_mkldnn_emitter), - OV_CASE(Tanh, jit_mkldnn_emitter), - OV_CASE(Logistic, jit_mkldnn_emitter), - OV_CASE(Square, jit_mkldnn_emitter), - OV_CASE(Abs, jit_mkldnn_emitter), - OV_CASE(Sqrt, jit_mkldnn_emitter), - OV_CASE(Linear, jit_mkldnn_emitter), - OV_CASE(BoundedRelu, jit_mkldnn_emitter), - OV_CASE(SoftRelu, jit_mkldnn_emitter), - OV_CASE(Relu6, jit_mkldnn_emitter), - OV_CASE(Exp, jit_mkldnn_emitter), - OV_CASE(Clamp, jit_mkldnn_emitter), - OV_CASE(Swish, jit_mkldnn_emitter), - OV_CASE(Hswish, jit_mkldnn_emitter), - OV_CASE(Mish, jit_mkldnn_emitter), - OV_CASE(Hsigmoid, jit_mkldnn_emitter), - OV_CASE(Round, jit_mkldnn_emitter), + OV_CASE(Relu, jit_mkldnn_aux_emitter), + OV_CASE(Gelu, jit_mkldnn_aux_emitter), + OV_CASE(Elu, jit_mkldnn_aux_emitter), + OV_CASE(Tanh, jit_mkldnn_aux_emitter), + OV_CASE(Logistic, jit_mkldnn_aux_emitter), + OV_CASE(Square, jit_mkldnn_aux_emitter), + OV_CASE(Abs, jit_mkldnn_aux_emitter), + OV_CASE(Sqrt, jit_mkldnn_aux_emitter), + OV_CASE(Linear, jit_mkldnn_aux_emitter), + OV_CASE(BoundedRelu, jit_mkldnn_aux_emitter), + OV_CASE(SoftRelu, jit_mkldnn_aux_emitter), + OV_CASE(Relu6, jit_mkldnn_aux_emitter), + OV_CASE(Exp, jit_mkldnn_aux_emitter), + OV_CASE(Clamp, jit_mkldnn_aux_emitter), + OV_CASE(Swish, jit_mkldnn_aux_emitter), + OV_CASE(Hswish, jit_mkldnn_aux_emitter), + OV_CASE(Mish, jit_mkldnn_aux_emitter), + 
OV_CASE(Hsigmoid, jit_mkldnn_aux_emitter), + OV_CASE(Round, jit_mkldnn_aux_emitter), OV_CASE(Add, jit_add_emitter), OV_CASE(MulAdd, jit_mul_add_emitter), OV_CASE(Subtract, jit_subtract_emitter), @@ -413,37 +415,36 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, pu } std::shared_ptr create_eltwise_emitter(MKLDNNNode& node, Precision exec_prec) { - auto& eltwiseNode = dynamic_cast(node); - const MKLDNNNode * eltwiseNodePtr = dynamic_cast(&node); + const auto& eltwiseNode = dynamic_cast(node); EltwiseEmitterContext ctx = { nullptr, this, isa, - eltwiseNodePtr, + &node, exec_prec }; OV_SWITCH(MKLDNNPlugin, EltwiseEmitter, ctx, eltwiseNode.getOpType(), - OV_CASE(Relu, jit_mkldnn_emitter), - OV_CASE(Gelu, jit_mkldnn_emitter), - OV_CASE(Elu, jit_mkldnn_emitter), - OV_CASE(Tanh, jit_mkldnn_emitter), - OV_CASE(Logistic, jit_mkldnn_emitter), - OV_CASE(Square, jit_mkldnn_emitter), - OV_CASE(Abs, jit_mkldnn_emitter), - OV_CASE(Sqrt, jit_mkldnn_emitter), - OV_CASE(Linear, jit_mkldnn_emitter), - OV_CASE(BoundedRelu, jit_mkldnn_emitter), - OV_CASE(SoftRelu, jit_mkldnn_emitter), - OV_CASE(Relu6, jit_mkldnn_emitter), - OV_CASE(Exp, jit_mkldnn_emitter), - OV_CASE(Clamp, jit_mkldnn_emitter), - OV_CASE(Swish, jit_mkldnn_emitter), - OV_CASE(Hswish, jit_mkldnn_emitter), - OV_CASE(Mish, jit_mkldnn_emitter), - OV_CASE(Hsigmoid, jit_mkldnn_emitter), - OV_CASE(Round, jit_mkldnn_emitter), + OV_CASE(Relu, jit_mkldnn_aux_emitter), + OV_CASE(Gelu, jit_mkldnn_aux_emitter), + OV_CASE(Elu, jit_mkldnn_aux_emitter), + OV_CASE(Tanh, jit_mkldnn_aux_emitter), + OV_CASE(Logistic, jit_mkldnn_aux_emitter), + OV_CASE(Square, jit_mkldnn_aux_emitter), + OV_CASE(Abs, jit_mkldnn_aux_emitter), + OV_CASE(Sqrt, jit_mkldnn_aux_emitter), + OV_CASE(Linear, jit_mkldnn_aux_emitter), + OV_CASE(BoundedRelu, jit_mkldnn_aux_emitter), + OV_CASE(SoftRelu, jit_mkldnn_aux_emitter), + OV_CASE(Relu6, jit_mkldnn_aux_emitter), + OV_CASE(Exp, jit_mkldnn_aux_emitter), + OV_CASE(Clamp, 
jit_mkldnn_aux_emitter), + OV_CASE(Swish, jit_mkldnn_aux_emitter), + OV_CASE(Hswish, jit_mkldnn_aux_emitter), + OV_CASE(Mish, jit_mkldnn_aux_emitter), + OV_CASE(Hsigmoid, jit_mkldnn_aux_emitter), + OV_CASE(Round, jit_mkldnn_aux_emitter), OV_CASE(Add, jit_add_emitter), OV_CASE(MulAdd, jit_mul_add_emitter), OV_CASE(Subtract, jit_subtract_emitter), @@ -485,7 +486,7 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, pu std::vector out_idxs; out_idxs.push_back(vmm_dst.getIdx()); - eltwise_emitter->emit(in_idxs, out_idxs, aux_idxs); + eltwise_emitter->emit_code(in_idxs, out_idxs, aux_idxs); } inline void apply_post_ops(bool is_scalar, int offset = 0) { @@ -505,7 +506,7 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, pu std::vector out_idxs; out_idxs.push_back(vmm_dst.getIdx()); - post_op_emitters[eltwise_post_op_idx]->emit(in_idxs, out_idxs, aux_idxs); + post_op_emitters[eltwise_post_op_idx]->emit_code(in_idxs, out_idxs, aux_idxs); eltwise_post_op_idx++; } else { @@ -647,7 +648,7 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, pu if (mayiuse(avx512_core_bf16)) vcvtneps2bf16(ymm_dst, vmm_dst); else - emu_vcvtneps2bf16->emit({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); + emu_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); vmovdqu16(op, ymm_dst); break; case Precision::I16: diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp index b9dec20c50eb2c..a211c288d1f699 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_interpolate_node.cpp @@ -23,6 +23,7 @@ #include #include "common/cpu_memcpy.h" #include "utils/bfloat16.hpp" +#include "emitters/jit_bf16_emitters.hpp" using namespace mkldnn; using namespace MKLDNNPlugin; @@ -148,7 +149,7 @@ 
struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi this->postamble(); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16->emit_table(); + emu_vcvtneps2bf16->emit_data(); for (auto& inj : eltwise_injectors) inj->prepare_table(); @@ -1483,7 +1484,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi if (mayiuse(avx512_core_bf16)) vcvtneps2bf16(ymm_dst, vmm_dst); else - emu_vcvtneps2bf16->emit({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); + emu_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); vmovdqu16(op, ymm_dst); } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp index dd4179b2d22c09..5200b1386e5839 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp @@ -16,7 +16,8 @@ #include #include "ie_parallel.hpp" #include -#include "common/jit_load_store_emitters.h" +#include "emitters/jit_load_store_emitters.hpp" +#include "emitters/jit_bf16_emitters.hpp" #include #include @@ -200,7 +201,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k this->postamble(); - load_emitter->emit_table(); + load_emitter->emit_data(); } private: @@ -241,7 +242,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k inline void worker_full_size() { Precision dst_prc = isFloatCompatible(jcp_.src_prc) ? 
Precision::FP32 : Precision::I32; - load_emitter->emit({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + load_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, std::make_shared(jcp_.src_prc, dst_prc, step), {}, {load_pool_gpr_idxs}); @@ -263,7 +264,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k inline void worker_tail_blk() { Precision dst_prc = isFloatCompatible(jcp_.src_prc) ? Precision::FP32 : Precision::I32; - load_emitter->emit({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + load_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, std::make_shared(jcp_.src_prc, dst_prc, tail_num), {}, {load_pool_gpr_idxs}); @@ -307,7 +308,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k inline void worker_tail_planar() { Precision dst_prc = isFloatCompatible(jcp_.src_prc) ? Precision::FP32 : Precision::I32; - load_emitter->emit({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + load_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, std::make_shared(jcp_.src_prc, dst_prc, tail_num, true, "zero"), {}, {load_pool_gpr_idxs}); @@ -478,9 +479,9 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator this->postamble(); - load_emitter->emit_table(); + load_emitter->emit_data(); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core) && store_emitter != nullptr && store_emitter->get_emu_vcvtneps2bf16() != nullptr) - store_emitter->get_emu_vcvtneps2bf16()->emit_table(); + store_emitter->get_emu_vcvtneps2bf16()->emit_data(); for (auto& inj : eltwise_injectors) inj->prepare_table(); @@ -531,7 +532,7 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator inline void worker_mvn(bool is_tail) { int elt_num = is_tail ? 
tail_num : step; - load_emitter->emit({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + load_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, std::make_shared(jcp_.src_prc, Precision::FP32, elt_num), {}, {load_pool_gpr_idxs}); @@ -541,7 +542,7 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator apply_post_ops(jcp_.dst_prc, jcp_.planar_layout); - store_emitter->emit({static_cast(vmm_val.getIdx())}, {static_cast(reg_dst.getIdx())}, + store_emitter->emit_code({static_cast(vmm_val.getIdx())}, {static_cast(reg_dst.getIdx())}, std::make_shared(Precision::FP32, jcp_.dst_prc, elt_num), {store_pool_vec_idxs}, {store_pool_gpr_idxs}); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp index 66132ae81bc794..b193716bf4d10b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp @@ -10,6 +10,7 @@ #include "mkldnn_quantize_node.h" #include "mkldnn_eltwise_node.h" #include "utils/bfloat16.hpp" +#include "emitters/jit_bf16_emitters.hpp" #include "mkldnn_extension_utils.h" #include #include @@ -207,7 +208,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji this->postamble(); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16->emit_table(); + emu_vcvtneps2bf16->emit_data(); for (auto& inj : eltwise_injectors) inj->prepare_table(); } @@ -603,7 +604,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji if (mayiuse(avx512_core_bf16)) vcvtneps2bf16(ymm_dst, vmm_dst); else - emu_vcvtneps2bf16->emit({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); + emu_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); vmovdqu16(op, ymm_dst); } else if (dst_dt == 
memory::data_type::u8) { uni_vcvtps2dq(vmm_dst, vmm_dst); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp index 19740905c3e7dc..9b5b73d62792f0 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp @@ -13,6 +13,7 @@ #include #include #include "utils/bfloat16.hpp" +#include "emitters/jit_bf16_emitters.hpp" #include "ie_parallel.hpp" #include @@ -115,7 +116,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene this->postamble(); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16->emit_table(); + emu_vcvtneps2bf16->emit_data(); if (jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::L1 || jcp_.reduce_mode == Reduce::Max || jcp_.reduce_mode == Reduce::Min || jcp_.reduce_mode == Reduce::Prod || jcp_.reduce_mode == Reduce::Or) { @@ -622,7 +623,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene if (mayiuse(avx512_core_bf16)) vcvtneps2bf16(ymm_dst, vmm_dst); else - emu_vcvtneps2bf16->emit({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); + emu_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); vmovdqu16(op, ymm_dst); break; case memory::data_type::s8: @@ -851,7 +852,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi this->postamble(); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16->emit_table(); + emu_vcvtneps2bf16->emit_data(); if (jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp) { log_injector->prepare_table(); @@ -1096,7 +1097,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi if (mayiuse(avx512_core_bf16)) vcvtneps2bf16(ymm_dst, vmm_dst); else - emu_vcvtneps2bf16->emit({static_cast(vmm_dst.getIdx())}, 
{static_cast(ymm_dst.getIdx())}); + emu_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); vmovdqu16(op, ymm_dst); break; case memory::data_type::s8: diff --git a/inference-engine/src/mkldnn_plugin/nodes/region_yolo.cpp b/inference-engine/src/mkldnn_plugin/nodes/region_yolo.cpp index 60b3fc27917156..03abe5a0bcd3df 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/region_yolo.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/region_yolo.cpp @@ -13,6 +13,7 @@ #include #include #include "utils/bfloat16.hpp" +#include "emitters/jit_bf16_emitters.hpp" #include "common/cpu_memcpy.h" #include "mkldnn.hpp" #include @@ -120,7 +121,7 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_ this->postamble(); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16->emit_table(); + emu_vcvtneps2bf16->emit_data(); exp_injector->prepare_table(); @@ -223,7 +224,7 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_ if (mayiuse(avx512_core_bf16)) vcvtneps2bf16(ymm_dst, vmm_dst); else - emu_vcvtneps2bf16->emit({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); + emu_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(ymm_dst.getIdx())}); vmovdqu16(op, ymm_dst); break; default: diff --git a/inference-engine/src/mkldnn_plugin/utils/bfloat16.hpp b/inference-engine/src/mkldnn_plugin/utils/bfloat16.hpp index dabff00462e53a..dd98a63c1b3c30 100644 --- a/inference-engine/src/mkldnn_plugin/utils/bfloat16.hpp +++ b/inference-engine/src/mkldnn_plugin/utils/bfloat16.hpp @@ -6,7 +6,7 @@ #include #include -#include "nodes/common/emitter.h" +#include /** * The bfloat16_t class can be used as an arithmetic type. All arithmetic operations goes through conversion to the float data type. 
@@ -73,69 +73,6 @@ class bfloat16_t { uint16_t m_value; }; - -class jit_emu_vcvtneps2bf16 : public jit_emitter { -public: - jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::x64::jit_generator* host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::BF16) : jit_emitter(host, host_isa, node, exec_prc) { - prepare_table(); - }; - - size_t get_inputs_num() override { return 1; }; - -private: - void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs, - const std::vector& pool_vec_idxs, const std::vector& pool_gpr_idxs, - const emitter_context *emit_context) override { - if (host_isa_ == mkldnn::impl::cpu::x64::cpu_isa_t::avx512_common) { - Xbyak::Zmm in = Xbyak::Zmm(in_vec_idxs[0]); - Xbyak::Ymm out = Xbyak::Ymm(out_vec_idxs[0]); - Xbyak::Zmm aux = Xbyak::Zmm(aux_vec_idxs[0]); - Xbyak::Zmm aux1 = Xbyak::Zmm(aux_vec_idxs[1]); - - h->uni_vpsrld(aux, in, 16); - h->vpandd(aux, aux, table_val("one")); - h->uni_vmovups(aux1, table_val("even")); - h->uni_vpaddd(aux, aux1, aux); - h->uni_vpaddd(aux, in, aux); - h->vfixupimmps(aux, in, table_val("selector"), 0); - h->vpsrad(aux, aux, 16); - h->vpmovdw(out, aux); - } else { - assert(!"unsupported isa"); - } - }; - - - inline int encode_fixup_selector(int input, int output) { - return ((output) << (4 * (input))); - } - - void register_table_entries() override { - enum { - fixup_input_code_qnan_ = 0, - fixup_input_code_snan_ = 1, - fixup_input_code_ninf_ = 4, - fixup_input_code_pinf_ = 5, - fixup_output_code_copy_input_ = 1, - fixup_output_code_qnan_input_ = 2, - }; - const int selector_int32 = - /* qnan input to qnan output (presenrving input bits 0..21) */ - encode_fixup_selector(fixup_input_code_snan_, fixup_output_code_qnan_input_) | - /* snan input to qnan output (presenrving input bits 0..21) */ - encode_fixup_selector(fixup_input_code_qnan_, fixup_output_code_qnan_input_) | - /* neg inf input copied to output */ - 
encode_fixup_selector(fixup_input_code_ninf_, fixup_output_code_copy_input_) | - /* pos inf input copied to output */ - encode_fixup_selector(fixup_input_code_pinf_, fixup_output_code_copy_input_); - push_arg_entry_of("one", 0x00000001, true); - push_arg_entry_of("even", 0x00007fff, true); - push_arg_entry_of("selector", selector_int32, true); - } - - size_t aux_vecs_count() const override { return 2; } -}; } // namespace MKLDNNPlugin /**