diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index bc42ffca8a3cf6..999db4931236d8 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -49,6 +49,9 @@ target_include_directories(openvino_core_dev INTERFACE $ $) +target_include_directories(openvino_core_dev INTERFACE + $:$>>) + target_link_libraries(openvino_core_dev INTERFACE openvino::itt openvino::util) set_target_properties(openvino_core_dev PROPERTIES EXPORT_NAME core::dev) @@ -104,6 +107,8 @@ ov_ncc_naming_style(FOR_TARGET openvino_core_obj ov_add_clang_format_target(openvino_core_clang FOR_SOURCES ${LIBRARY_SRC} ${PUBLIC_HEADERS} ${DEV_HEADERS}) +target_compile_definitions(openvino_core_obj PRIVATE XBYAK_NO_OP_NAMES XBYAK64) + if(NOT BUILD_SHARED_LIBS) target_compile_definitions(openvino_core_obj PUBLIC OPENVINO_STATIC_LIBRARY) endif() diff --git a/src/core/dev_api/openvino/runtime/compute_hash.hpp b/src/core/dev_api/openvino/runtime/compute_hash.hpp new file mode 100644 index 00000000000000..55e1cc8d3dc9cf --- /dev/null +++ b/src/core/dev_api/openvino/runtime/compute_hash.hpp @@ -0,0 +1,20 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace runtime { + +/** + * @brief Computes the hash value for the input data + * @param src A pointer to the input data + * @param size The length of the input data in bytes + */ +size_t compute_hash(const void* src, size_t size); + +} // namespace runtime +} // namespace ov diff --git a/src/core/reference/CMakeLists.txt b/src/core/reference/CMakeLists.txt index f7874964233cf5..24c74ede3b5fba 100644 --- a/src/core/reference/CMakeLists.txt +++ b/src/core/reference/CMakeLists.txt @@ -39,8 +39,6 @@ ov_build_target_faster(${TARGET_NAME} ov_set_threading_interface_for(${TARGET_NAME}) -target_compile_definitions(${TARGET_NAME} PRIVATE XBYAK_NO_OP_NAMES XBYAK64) - if(NOT BUILD_SHARED_LIBS) target_compile_definitions(${TARGET_NAME} PUBLIC 
OPENVINO_STATIC_LIBRARY) endif() @@ -50,9 +48,6 @@ target_include_directories(${TARGET_NAME} PUBLIC $ $) -target_include_directories(${TARGET_NAME} SYSTEM PRIVATE - $:$>>) - find_package(Threads REQUIRED) target_link_libraries(${TARGET_NAME} PRIVATE Threads::Threads openvino::core::dev) diff --git a/src/core/reference/include/openvino/reference/utils/combine_hash.hpp b/src/core/reference/include/openvino/reference/utils/combine_hash.hpp deleted file mode 100644 index 9f1cfdea812494..00000000000000 --- a/src/core/reference/include/openvino/reference/utils/combine_hash.hpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -namespace ov { -namespace runtime { - -size_t combine_hash(const void* src, size_t size); - -} // namespace runtime -} // namespace ov diff --git a/src/core/reference/include/openvino/reference/utils/jit_generator.hpp b/src/core/reference/include/openvino/reference/utils/jit_generator.hpp index 49c5cb6e0e959e..6350cad3cd27b8 100644 --- a/src/core/reference/include/openvino/reference/utils/jit_generator.hpp +++ b/src/core/reference/include/openvino/reference/utils/jit_generator.hpp @@ -67,6 +67,7 @@ namespace jit { const Xbyak::Reg64 reg_EVEX_max_8b_offt; static constexpr int EVEX_max_8b_offt = 0x200; + size_t m_vlen = ymm_len; public: static constexpr size_t xmm_len = 16; @@ -78,7 +79,7 @@ namespace jit { static bool mayiuse(const cpu_isa_t cpu_isa); static bool is_x64(); - Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024); + Generator(cpu_isa_t isa = avx2, void* code_ptr = nullptr, size_t code_size = 16 * 1024); void preamble(); void postamble(); @@ -91,8 +92,12 @@ namespace jit { void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); + + size_t get_vlen() { + return m_vlen; + } }; } // namespace jit -} // namespace reference +} // namespace reference } // namespace ov diff --git 
a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp index 59ddd11596b980..daf48e9e0b2e0f 100644 --- a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp +++ b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp @@ -12,7 +12,7 @@ #include namespace ov { -namespace runtime { +namespace reference { namespace jit { class RegistersPool { @@ -74,7 +74,7 @@ class RegistersPool { checkUniqueAndUpdate(false); } - template + template static Ptr create(std::initializer_list regsToExclude); static Ptr create(cpu_isa_t isa, std::initializer_list regsToExclude); @@ -303,10 +303,9 @@ RegistersPool::Ptr RegistersPool::create(cpu_isa_t isa, std::initializer_list - -#include - -namespace ov { -namespace reference { -namespace jit { -#ifdef XBYAK64 -static const Xbyak::Operand::Code abi_save_gpr_regs[] = { - Xbyak::Operand::RBX, - Xbyak::Operand::RBP, - Xbyak::Operand::R12, - Xbyak::Operand::R13, - Xbyak::Operand::R14, - Xbyak::Operand::R15, -# ifdef _WIN32 - Xbyak::Operand::RDI, - Xbyak::Operand::RSI, -# endif -}; - -# ifdef _WIN32 -# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX -# else -# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI -# endif -#endif // XBYAK64 - -class Generator : public Xbyak::CodeGenerator { - static constexpr size_t xmm_len = 16; - -#ifdef _WIN32 - static constexpr size_t xmm_to_preserve_start = 6; - static constexpr size_t xmm_to_preserve = 10; -#else - static constexpr size_t xmm_to_preserve_start = 0; - static constexpr size_t xmm_to_preserve = 0; -#endif - - static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); - const size_t size_of_abi_save_regs; - - const Xbyak::Reg64 reg_EVEX_max_8b_offt; - static constexpr int EVEX_max_8b_offt = 0x200; - -public: - const Xbyak::Reg64 param = abi_param1; - - typedef enum { - isa_any, - sse42, - avx, - avx2, - 
avx512_common, - avx512_core, - avx512_core_vnni, - avx512_mic, - avx512_mic_4ops, - avx512_core_bf16, - avx512_vpopcnt, - fp16 - } cpu_isa_t; - - static bool mayiuse(const cpu_isa_t cpu_isa); - static bool is_x64(); - - Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024); - void preamble(); - void postamble(); - - void foreach (const Xbyak::Reg64& idx, - size_t step, - const Xbyak::Reg64& end, - std::function && fn); - - template - void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); -}; -} // namespace jit -} // namespace reference -} // namespace ov diff --git a/src/core/reference/src/op/utils/combine_hash.cpp b/src/core/reference/src/op/utils/combine_hash.cpp deleted file mode 100644 index 1835155becf711..00000000000000 --- a/src/core/reference/src/op/utils/combine_hash.cpp +++ /dev/null @@ -1,666 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -// The CRC computation is used for x86. -// The calculations were taken from the article -// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel (December, 2009)". 
- -#include "openvino/core/visibility.hpp" -#include "openvino/core/parallel.hpp" -#include "openvino/reference/utils/combine_hash.hpp" - -#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) -# include "openvino/reference/utils/registers_pool.hpp" -#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 - -#include - -namespace ov { -namespace runtime { - -#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) -namespace jit { - -#define GET_OFF(field) offsetof(CombineHashCallArgs, field) -#define getReg64() RegistersPool::Reg(registersPool) -#define getVmm() RegistersPool::Reg(registersPool) -#define getXmm() RegistersPool::Reg(registersPool) - -struct CombineHashCompileParams { -}; - -struct CombineHashCallArgs { - const void* src_ptr; - void* dst_ptr; - uint64_t work_amount = 0lu; - uint64_t make_64_fold = 0lu; -}; - -typedef void (*fn_t)(const CombineHashCallArgs*); - -template -class CombineHash : public Generator { -public: - explicit CombineHash(const CombineHashCompileParams& jcp) : - m_jcp(jcp) { - if (isa == avx512_core) { - vlen = zmm_len; - } else if (isa == avx2) { - vlen = ymm_len; - } else { - OPENVINO_THROW("Unsupported isa: ", isa); - } - if (!mayiuse(cpu_isa_t::pclmulqdq)) { - OPENVINO_THROW("The current CPU does not support pclmulqdq instruction, which is required for the CRC algorithm."); - } - if (mayiuse(cpu_isa_t::vpclmulqdq)) { - is_vpclmulqdq = true; - } - - generate(); - } - - void generate() { - this->preamble(); - registersPool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0}); - - r64_src = getReg64(); - r64_dst = getReg64(); - r64_work_amount = getReg64(); - r64_make_64_fold = getReg64(); - - mov(r64_src, ptr[r64_params + GET_OFF(src_ptr)]); - mov(r64_dst, ptr[r64_params + GET_OFF(dst_ptr)]); - mov(r64_work_amount, ptr[r64_params + GET_OFF(work_amount)]); - mov(r64_make_64_fold, ptr[r64_params + GET_OFF(make_64_fold)]); - - initVectors(); - bulkFold(v_dst); - restFold(v_dst); - tailFold(v_dst); - - 
registersPool.reset(); - this->postamble(); - } - - static fn_t get() { - static const CombineHashCompileParams params; - static CombineHash kernel(params); - - return (fn_t)kernel.getCode(); - } - - void fillRestWorkMask(const Xbyak::Opmask& k_dst_mask, - const Xbyak::Reg64& r64_work_rest) { - Xbyak::Label l_mv_mask; - auto rOnes = getReg64(); - - mov(rOnes, 0xFFFFFFFFFFFFFFFF); - cmp(r64_work_rest, 0x3f); - jg(l_mv_mask); - - shlx(rOnes, rOnes, r64_work_rest); - not_(rOnes); - - L(l_mv_mask); - kmovq(k_dst_mask, rOnes); - } - - void partialLoad(const Xbyak::Xmm& xmm_dst, - const Xbyak::Address& src_addr, - const Xbyak::Reg64& r64_load_num) { - Xbyak::Label l_partial, l_end; - - cmp(r64_load_num, xmm_len); - jl(l_partial, T_NEAR); - vmovdqu(xmm_dst, ptr[src_addr.getRegExp()]); - jmp(l_end, T_NEAR); - - L(l_partial); { - size_t offset = xmm_len; - - for (size_t j = 0lu; j < xmm_len - 1; j++) { - pinsrb(xmm_dst, ptr[src_addr.getRegExp() + offset], j); - cmp(r64_load_num, ++offset); - jle(l_end, T_NEAR); - } - } - - L(l_end); - } - - void partialLoad(const Xbyak::Ymm& ymm_dst, - const Xbyak::Address& src_addr, - const Xbyak::Reg64& r64_load_num) { - Xbyak::Label l_xmm, l_partial, l_end; - auto xmm_dst = Xbyak::Xmm(ymm_dst.getIdx()); - - cmp(r64_load_num, ymm_len); - jl(l_xmm, T_NEAR); - vmovdqu(ymm_dst, ptr[src_addr.getRegExp()]); - jmp(l_end, T_NEAR); - - L(l_xmm); - vpxorq(ymm_dst, ymm_dst, ymm_dst); - cmp(r64_load_num, xmm_len); - jl(l_partial, T_NEAR); - vmovdqu(xmm_dst, ptr[src_addr.getRegExp()]); - je(l_end, T_NEAR); - - { - Xbyak::Label l_rest_loop, l_perm; - size_t offset = xmm_len; - - vperm2f128(ymm_dst, ymm_dst, ymm_dst, 0x1); - for (size_t j = 0lu; j < xmm_len - 1; j++) { - pinsrb(xmm_dst, ptr[src_addr.getRegExp() + offset], j); - cmp(r64_load_num, ++offset); - jle(l_perm, T_NEAR); - } - L(l_perm); - vperm2f128(ymm_dst, ymm_dst, ymm_dst, 0x1); - } - jmp(l_end, T_NEAR); - - L(l_partial); { - size_t offset = xmm_len; - - for (size_t j = 0lu; j < xmm_len - 
1; j++) { - pinsrb(xmm_dst, ptr[src_addr.getRegExp() + offset], j); - cmp(r64_load_num, ++offset); - jle(l_end, T_NEAR); - } - } - - L(l_end); - } - -private: - static constexpr uint64_t CHUNK_SIZE = 32; - static const uint64_t CRC_VAL; - static const uint64_t CONST_K[12]; - static const uint8_t SHUF_MASK[16]; - - using Vmm = typename std::conditional::type; - size_t vlen = xmm_len; - bool is_vpclmulqdq = false; - - CombineHashCompileParams m_jcp; - RegistersPool::Ptr registersPool; - - RegistersPool::Reg r64_src; - RegistersPool::Reg r64_dst; - RegistersPool::Reg r64_work_amount; - RegistersPool::Reg r64_make_64_fold; - - const Xbyak::Reg64 r64_params = abi_param1; - - // Vector registers - RegistersPool::Reg v_dst; - RegistersPool::Reg v_k_1_2; - RegistersPool::Reg v_k_4_5; - RegistersPool::Reg v_k_8_9; - RegistersPool::Reg v_k_16_17; - RegistersPool::Reg v_shuf_mask; - - size_t getVlen() { - return vlen; - } - - void initVectors(); - - void bulkFold(const Vmm& v_dst); - - void restFold(const Vmm& v_dst) { - Xbyak::Label l_fold_loop, l_end; - cmp(r64_work_amount, xmm_len); - jl(l_end, T_NEAR); - - auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); - auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx()); - auto xmm_src = getXmm(); - auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); - auto xmm_aux = getXmm(); - - L(l_fold_loop); { - vmovdqu64(xmm_src, ptr[r64_src]); - vpshufb(xmm_src, xmm_src, xmm_shuf_mask); - - vpclmulqdq(xmm_aux, xmm_dst, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst, xmm_dst, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - vpxorq(xmm_dst, xmm_dst, xmm_src); - - add(r64_src, xmm_len); - sub(r64_work_amount, xmm_len); - cmp(r64_work_amount, xmm_len); - jge(l_fold_loop, T_NEAR); - } - - L(l_end); - } - - void tailFold(const Vmm& v_dst); -}; - -template <> -void CombineHash::initVectors() { - auto r64_aux = getReg64(); - - v_k_1_2 = getVmm(); - mov(r64_aux, reinterpret_cast(CONST_K)); - vbroadcasti64x2(v_k_1_2, ptr[r64_aux]); - v_k_8_9 = 
getVmm(); - mov(r64_aux, reinterpret_cast(CONST_K + 6)); - vbroadcasti64x2(v_k_8_9, ptr[r64_aux]); - - v_shuf_mask = getVmm(); - mov(r64_aux, reinterpret_cast(SHUF_MASK)); - vbroadcasti64x2(v_shuf_mask, ptr[r64_aux]); - - v_dst = getVmm(); - auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); - auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); - auto xmm_aux = getXmm(); - auto k_rest_mask = RegistersPool::Reg(registersPool); - // Initial CRC - mov(r64_aux, CRC_VAL); - vpxorq(v_dst, v_dst, v_dst); - vpinsrq(xmm_dst, xmm_dst, r64_work_amount, 0x0); - vpinsrq(xmm_dst, xmm_dst, r64_aux, 0x1); - // First xor with source - fillRestWorkMask(k_rest_mask, r64_work_amount); - vmovdqu8(Xbyak::Xmm(xmm_aux.getIdx()) | k_rest_mask | T_z, ptr[r64_src]); - vpshufb(xmm_aux, xmm_aux, xmm_shuf_mask); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - sub(r64_work_amount, xmm_len); - add(r64_src, xmm_len); -} - -template -void CombineHash::initVectors() { - auto r64_aux = getReg64(); - - v_k_1_2 = getVmm(); - mov(r64_aux, reinterpret_cast(CONST_K)); - vbroadcasti128(v_k_1_2, ptr[r64_aux]); - v_k_8_9 = getVmm(); - mov(r64_aux, reinterpret_cast(CONST_K + 6)); - vbroadcasti128(v_k_8_9, ptr[r64_aux]); - - v_shuf_mask = getVmm(); - mov(r64_aux, reinterpret_cast(SHUF_MASK)); - vbroadcasti128(v_shuf_mask, ptr[r64_aux]); - - v_dst = getVmm(); - auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); - auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); - auto xmm_aux = getXmm(); - auto k_rest_mask = RegistersPool::Reg(registersPool); - // Initial CRC - mov(r64_aux, CRC_VAL); - vpxorq(v_dst, v_dst, v_dst); - vpinsrq(xmm_dst, xmm_dst, r64_aux, 0x1); - // First xor with source - partialLoad(xmm_aux, ptr[r64_src], r64_work_amount); - vpshufb(xmm_aux, xmm_aux, xmm_shuf_mask); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - sub(r64_work_amount, xmm_len); -} - -template <> -void CombineHash::bulkFold(const Vmm& v_dst) { - Xbyak::Label l_fold_loop, l_end; - cmp(r64_work_amount, zmm_len + 3 * xmm_len); - jl(l_end, T_NEAR); - - auto 
r64_aux = getReg64(); - - auto v_src_0 = getVmm(); - auto v_dst_0 = getVmm(); - auto v_dst_1 = getVmm(); - auto v_dst_2 = getVmm(); - auto& v_dst_3 = v_dst; - auto v_aux_0 = getVmm(); - - auto xmm_k_8_9 = Xbyak::Xmm(v_k_8_9.getIdx()); - auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx()); - auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); - auto xmm_src_1 = getXmm(); - auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); - auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); - auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx()); - auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx()); - auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); - - vmovdqu64(v_dst_0, v_dst_3); - - if (!is_vpclmulqdq) { - prefetchnta(ptr[r64_src + 3 * xmm_len]); - vmovdqu64(xmm_dst_1, ptr[r64_src + 0 * xmm_len]); - vmovdqu64(xmm_dst_2, ptr[r64_src + 1 * xmm_len]); - vmovdqu64(xmm_dst_3, ptr[r64_src + 2 * xmm_len]); - } - - add(r64_src, 3 * xmm_len); - sub(r64_work_amount, zmm_len + 3 * xmm_len); - - L(l_fold_loop); { - vmovdqu64(v_src_0, ptr[r64_src]); - vpshufb(v_src_0, v_src_0, v_shuf_mask); - - if (is_vpclmulqdq) { - vpclmulqdq(v_aux_0, v_dst_0, v_k_8_9, 0b00000000); - vpclmulqdq(v_dst_0, v_dst_0, v_k_8_9, 0b00010001); - vpxorq(v_aux_0, v_aux_0, v_src_0); - vpxorq(v_dst_0, v_dst_0, v_aux_0); - } else { - // 0 - vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_8_9, 0b00000000); - vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_8_9, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); - vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); - // 1 - vextracti64x2(xmm_src_1, v_src_0, 0x1); - vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_8_9, 0b00000000); - vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_8_9, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); - vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); - // 2 - vextracti64x2(xmm_src_1, v_src_0, 0x2); - vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_8_9, 0b00000000); - vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_8_9, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); - vpxorq(xmm_dst_2, xmm_dst_2, xmm_aux_0); - // 3 - 
vextracti64x2(xmm_src_1, v_src_0, 0x3); - vpclmulqdq(xmm_aux_0, xmm_dst_3, xmm_k_8_9, 0b00000000); - vpclmulqdq(xmm_dst_3, xmm_dst_3, xmm_k_8_9, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - } - - add(r64_src, zmm_len); - sub(r64_work_amount, zmm_len); - jge(l_fold_loop, T_NEAR); - } - add(r64_work_amount, zmm_len); - - if (is_vpclmulqdq) { - auto ymm_dst_0 = Xbyak::Ymm(v_dst_0.getIdx()); - auto ymm_dst_1 = Xbyak::Ymm(v_dst_1.getIdx()); - auto ymm_aux_0 = Xbyak::Ymm(v_aux_0.getIdx()); - - vextracti64x4(ymm_dst_1, v_dst_0, 0x1); - mov(r64_aux, reinterpret_cast(CONST_K + 2)); - vpclmulqdq(ymm_aux_0, ymm_dst_0, ptr[r64_aux], 0b00000000); - vpclmulqdq(ymm_dst_0, ymm_dst_0, ptr[r64_aux], 0b00010001); - vpxorq(ymm_dst_1, ymm_dst_1, ymm_aux_0); - vpxorq(ymm_dst_0, ymm_dst_0, ymm_dst_1); - - vextracti64x2(xmm_dst_3, ymm_dst_0, 0x1); - vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0); - } else { - mov(r64_aux, reinterpret_cast(CONST_K + 4)); - vpclmulqdq(xmm_aux_0, xmm_dst_0, ptr[r64_aux], 0b00000000); - vpclmulqdq(xmm_dst_0, xmm_dst_0, ptr[r64_aux], 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0); - - mov(r64_aux, reinterpret_cast(CONST_K + 2)); - vpclmulqdq(xmm_aux_0, xmm_dst_1, ptr[r64_aux], 0b00000000); - vpclmulqdq(xmm_dst_1, xmm_dst_1, ptr[r64_aux], 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_1); - - vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2); - } - - L(l_end); -} - -template <> -void CombineHash::bulkFold(const Vmm& v_dst) { - Xbyak::Label l_fold_loop, l_end; - cmp(r64_work_amount, 2 * vlen - xmm_len); - 
jl(l_end, T_NEAR); - - auto r64_aux = getReg64(); - - auto v_src_0 = getVmm(); - auto v_dst_0 = getVmm(); - auto v_dst_1 = getVmm(); - auto v_dst_2 = getVmm(); - auto& v_dst_3 = v_dst; - auto v_aux_0 = getVmm(); - - auto xmm_k_4_5 = Xbyak::Xmm(v_k_4_5.getIdx()); - auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx()); - auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); - auto xmm_src_1 = getXmm(); - auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); - auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); - auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx()); - auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx()); - auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); - - if (!is_vpclmulqdq) { - vmovdqu64(xmm_dst_1, ptr[r64_src + 0 * xmm_len]); - } - - add(r64_src, vlen - xmm_len); - sub(r64_work_amount, 2 * vlen - xmm_len); - - L(l_fold_loop); { - vmovdqu64(v_src_0, ptr[r64_src]); - vpshufb(v_src_0, v_src_0, v_shuf_mask); - - if (is_vpclmulqdq) { - vpclmulqdq(v_aux_0, v_dst_0, v_k_4_5, 0b00000000); - vpclmulqdq(v_dst_0, v_dst_0, v_k_4_5, 0b00010001); - vpxorq(v_aux_0, v_aux_0, v_src_0); - vpxorq(v_dst_0, v_dst_0, v_aux_0); - } else { - // 0 - vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_4_5, 0b00000000); - vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_4_5, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); - vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); - // 1 - vextracti128(xmm_src_1, v_src_0, 0x1); - vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_4_5, 0b00000000); - vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_4_5, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); - vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); - } - - add(r64_src, vlen); - sub(r64_work_amount, vlen); - jge(l_fold_loop, T_NEAR); - } - add(r64_work_amount, vlen); - - if (is_vpclmulqdq) { - auto ymm_dst_0 = Xbyak::Ymm(v_dst_0.getIdx()); - auto ymm_dst_1 = Xbyak::Ymm(v_dst_1.getIdx()); - auto ymm_aux_0 = Xbyak::Ymm(v_aux_0.getIdx()); - - vextracti128(xmm_dst_3, ymm_dst_0, 0x1); - vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst_0, 
xmm_dst_0, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0); - } else { - vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2); - } - - L(l_end); -} - - -template <> -void CombineHash::tailFold(const Vmm& v_dst) { - Xbyak::Label l_fold_to_64, l_save_128, l_end; - cmp(r64_work_amount, 0); - jle(l_fold_to_64, T_NEAR); - - auto r64_aux = getReg64(); - auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); - auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx()); - auto xmm_src = getXmm(); - auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); - auto xmm_aux = getXmm(); - auto xmm_aux_1 = getXmm(); - auto xmm_aux_2 = getXmm(); - auto k_rest_mask = RegistersPool::Reg(registersPool); - - fillRestWorkMask(k_rest_mask, r64_work_amount); - - vpxorq(xmm_src, xmm_src, xmm_src); - vmovdqu8(Xbyak::Xmm(xmm_src.getIdx()) | k_rest_mask | T_z, ptr[r64_src]); - vpshufb(xmm_src, xmm_src, xmm_shuf_mask); - - vpclmulqdq(xmm_aux, xmm_dst, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst, xmm_dst, xmm_k_1_2, 0b00010001); - vpxorq(xmm_aux, xmm_aux, xmm_src); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - - L(l_fold_to_64); - cmp(r64_make_64_fold, 0); - je(l_save_128, T_NEAR); - - mov(r64_aux, reinterpret_cast(CONST_K + 8)); - vpclmulqdq(xmm_aux, xmm_dst, ptr[r64_aux], 0b00000001); - vpslldq(xmm_dst, xmm_dst, 0x8); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - - mov(r64_aux, reinterpret_cast(CONST_K + 10)); - vmovdqu64(xmm_aux_2, ptr[r64_aux]); - vpclmulqdq(xmm_aux, xmm_dst, xmm_aux_2, 0b00000001); - mov(r64_aux, 0x0); - vpinsrq(xmm_aux_1, xmm_dst, r64_aux, 0x0); - vpxorq(xmm_aux, xmm_aux, xmm_aux_1); - vpinsrq(xmm_aux_1, xmm_aux, r64_aux, 0x0); - vpclmulqdq(xmm_aux, xmm_aux, xmm_aux_2, 0b00010001); - vpxorq(xmm_aux, xmm_aux, xmm_aux_1); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - - vpextrq(ptr[r64_dst], xmm_dst, 0x0); - 
jmp(l_end, T_NEAR); - - - L(l_save_128); - vmovdqu64(ptr[r64_dst], xmm_dst); - - L(l_end); -} - -template <> -void CombineHash::tailFold(const Vmm& v_dst) { -} - -template -const uint64_t CombineHash::CRC_VAL = 0xffffffffffffffff; - -// P(x) = 0x42F0E1EBA9EA3693 -template -const uint64_t CombineHash::CONST_K[12] = { 0x05f5c3c7eb52fab6, 0x4eb938a7d257740e, // x^(64*1), x^(64*2) - 0x571bee0a227ef92b, 0x44bef2a201b5200c, // x^(64*3), x^(64*4) - 0x54819d8713758b2c, 0x4a6b90073eb0af5a, // x^(64*5), x^(64*6) - 0x5f6843ca540df020, 0xddf4b6981205b83f, // x^(64*7), x^(64*8) - 0x05f5c3c7eb52fab6, 0x0000000000000000, // x^(64*1), x^(64*1) mod P(x) - 0x578d29d06cc4f872, 0x42f0e1eba9ea3693 // floor(x^128/P(x)) - x^64, P(x) - x^64 - }; - -template -const uint8_t CombineHash::SHUF_MASK[] = { 0b00001111, 0b00001110, 0b00001101, 0b00001100, 0b00001011, 0b00001010, 0b00001001, 0b00001000, - 0b00000111, 0b00000110, 0b00000101, 0b00000100, 0b00000011, 0b00000010, 0b00000001, 0b00000000 }; - -} // namespace jit -#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 - -size_t combine_hash(const void* src, size_t size) { -#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) - jit::fn_t kernel; - - if (jit::Generator::mayiuse(jit::avx512_core)) { - kernel = jit::CombineHash::get(); - } else if (jit::Generator::mayiuse(jit::avx2)) { - kernel = jit::CombineHash::get(); - } - - if (kernel) { - size_t res = 0lu; - - static const size_t block_size = 2lu * jit::Generator::zmm_len; - // There is no sense to perform parallel execution if there are less than 2 blocks. 
- if (size >= 2lu * block_size) { - const auto nthr = parallel_get_max_threads() / 2; // TODO: WA for Hyper Threading - std::vector intermediate(nthr * 2); // xmm_len * nthr - const uint64_t blocks = size / block_size; - const uint64_t el_per_thread = block_size * ((blocks + nthr - 1) / nthr); - - parallel_nt(nthr, [&](const int ithr, const int nthr) { - uint64_t start = ithr * el_per_thread; - if (start >= size) { - return; - } - uint64_t work_amount = (el_per_thread + start > size) ? size - start : el_per_thread; - - size_t res = 0lu; - jit::CombineHashCallArgs args; - - args.src_ptr = reinterpret_cast(src) + start; - args.dst_ptr = &intermediate[ithr * 2]; - args.work_amount = work_amount; - args.make_64_fold = 0lu; - kernel(&args); - }); - - - jit::CombineHashCallArgs args; - args.src_ptr = intermediate.data(); - args.dst_ptr = &res; - args.work_amount = ((size + el_per_thread - 1) / el_per_thread) * jit::Generator::xmm_len; - args.make_64_fold = 1lu; - kernel(&args); - } else { - jit::CombineHashCallArgs args; - args.src_ptr = src; - args.dst_ptr = &res; - args.work_amount = size; - args.make_64_fold = 1lu; - kernel(&args); - } - return res; - } -#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 - - constexpr auto cel_size = sizeof(size_t); - auto seed = static_cast(size); - const auto data = static_cast(src); - const auto d_end = std::next(data, size / cel_size); - // The constant value used as a magic number has been - // traditionally used e.g. in boost library's hash_combine. - // It happens to be derived from the golden ratio. 
- for (auto d = data; d != d_end; ++d) { - seed ^= *d + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - size_t last_bytes{0}; - std::memcpy(&last_bytes, d_end, size % cel_size); - seed ^= last_bytes + 0x9e3779b9 + (seed << 6) + (seed >> 2); - return seed; -} - -} // namespace runtime -} // namespace ov diff --git a/src/core/reference/src/op/utils/jit_generator.cpp b/src/core/reference/src/utils/jit_generator.cpp similarity index 94% rename from src/core/reference/src/op/utils/jit_generator.cpp rename to src/core/reference/src/utils/jit_generator.cpp index 174cbb9242acc4..f856ac64509898 100644 --- a/src/core/reference/src/op/utils/jit_generator.cpp +++ b/src/core/reference/src/utils/jit_generator.cpp @@ -11,9 +11,10 @@ # endif # include -# include "openvino/reference/utils/jit_generator.hpp" +# include "openvino/core/except.hpp" # include "openvino/core/type/bfloat16.hpp" # include "openvino/core/type/float16.hpp" +# include "openvino/reference/utils/jit_generator.hpp" namespace ov { namespace reference { @@ -64,10 +65,18 @@ bool Generator::mayiuse(const cpu_isa_t cpu_isa) { bool Generator::is_x64() { return sizeof(void*) == 8; } -Generator::Generator(void* code_ptr, size_t code_size) +Generator::Generator(cpu_isa_t isa, void* code_ptr, size_t code_size) : Xbyak::CodeGenerator(code_size, code_ptr), size_of_abi_save_regs(num_abi_save_gpr_regs * rax.getBit() / 8 + xmm_to_preserve * xmm_len), - reg_EVEX_max_8b_offt(rbp) {} + reg_EVEX_max_8b_offt(rbp) { + if (isa == avx512_core) { + m_vlen = zmm_len; + } else if (isa == avx2) { + m_vlen = ymm_len; + } else { + OPENVINO_THROW("Unsupported isa: ", isa); + } + } void Generator::preamble() { if (xmm_to_preserve) { diff --git a/src/core/src/pass/serialize.cpp b/src/core/src/pass/serialize.cpp index c36b681d9e034d..507f6d7b468330 100644 --- a/src/core/src/pass/serialize.cpp +++ b/src/core/src/pass/serialize.cpp @@ -22,8 +22,8 @@ #include "openvino/opsets/opset1.hpp" #include "openvino/pass/constant_folding.hpp" #include 
"openvino/reference/convert.hpp" -#include "openvino/reference/utils/combine_hash.hpp" #include "openvino/runtime/aligned_buffer.hpp" +#include "openvino/runtime/compute_hash.hpp" #include "openvino/runtime/string_aligned_buffer.hpp" #include "openvino/util/file_util.hpp" #include "pugixml.hpp" @@ -76,9 +76,10 @@ class ConstantWriter { using HashValue = size_t; using ConstWritePositions = std::multimap>; - ConstantWriter(std::ostream& bin_data, bool enable_compression = true) + ConstantWriter(std::ostream& bin_data, bool enable_compression = true, bool write_hash_value = false) : m_binary_output(bin_data), m_enable_compression(enable_compression), + m_write_hash_value(write_hash_value), m_blob_offset(bin_data.tellp()) {} FilePosition write(const char* ptr, @@ -116,18 +117,24 @@ class ConstantWriter { // the same hash for {2, 2} and {0, 128} arrays. // But even strong hashing algorithms sometimes give collisions. // Therefore we always have to compare values when finding a match in the hash multimap. - const HashValue hash = ov::runtime::combine_hash(ptr_to_write, *new_size); + const HashValue hash = ov::runtime::compute_hash(ptr_to_write, *new_size); + auto found = m_hash_to_file_positions.find(hash); // iterate over all matches of the key in the multimap while (found != m_hash_to_file_positions.end()) { - if (memcmp(ptr, found->second.second, size) == 0) + if (memcmp(ptr, found->second.second, size) == 0) { return found->second.first; + } found++; } // Since fp16_compressed data will be disposed at exit point and since we cannot reread it from the ostream, // we store pointer to the original uncompressed blob. 
m_hash_to_file_positions.insert({hash, {offset, static_cast(ptr)}}); - m_binary_output.write(ptr_to_write, *new_size); + if (m_write_hash_value) { + m_binary_output.write(reinterpret_cast(&hash), sizeof(uint64_t)); + } else { + m_binary_output.write(ptr_to_write, *new_size); + } } return offset; } @@ -172,6 +179,7 @@ class ConstantWriter { ConstWritePositions m_hash_to_file_positions; std::ostream& m_binary_output; bool m_enable_compression; + bool m_write_hash_value; FilePosition m_blob_offset; // blob offset inside output stream }; @@ -1205,7 +1213,7 @@ void serializeFunc(std::ostream& xml_file, std::string name = "net"; pugi::xml_document xml_doc; pugi::xml_node net_node = xml_doc.append_child(name.c_str()); - ConstantWriter constant_write_handler(bin_file); + ConstantWriter constant_write_handler(bin_file, true, true); XmlSerializer visitor(net_node, name, constant_write_handler, version, deterministic); visitor.on_attribute(name, model); @@ -1377,10 +1385,20 @@ bool pass::StreamSerialize::run_on_model(const std::shared_ptr& model /// -------- Hash calculation pass ------------- namespace { -template -static uint64_t hash_combine(uint64_t seed, const T& a) { - // Hash combine formula from boost - return seed ^ (std::hash()(a) + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +// Hash combine formula from boost for uint64_t. +inline uint64_t hash_combine(uint64_t h, uint64_t k) +{ + constexpr uint64_t m = 0xc6a4a7935bd1e995; + constexpr int r = 47; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + + return h + 0xe6546b64; } class OstreamHashWrapper final : public std::streambuf { @@ -1392,19 +1410,23 @@ class OstreamHashWrapper final : public std::streambuf { } std::streamsize xsputn(const char* s, std::streamsize n) override { - // Reinterpret data as uint32_t and accumulate in uint64_t to avoid overflow fluctuations in parallel_sum. 
- auto* int_sum = reinterpret_cast(s); - const uint64_t n32 = n / sizeof(uint32_t); + uint64_t h = ov::runtime::compute_hash(s, n); + m_res = hash_combine(m_res, h); + + return n; + } +}; - m_res += parallel_sum(n32, uint64_t(0lu), [&](size_t k) -> uint32_t { - return int_sum[k]; - }); +class OstreamHashWrapperBin final : public std::streambuf { + uint64_t m_res = 0lu; - const uint64_t rest = n % sizeof(uint32_t); - for (uint64_t i = 0lu; i < rest; i++) { - m_res += s[n - rest + i]; - } +public: + uint64_t getResult() const { + return m_res; + } + std::streamsize xsputn(const char* s, std::streamsize n) override { + m_res = hash_combine(m_res, *reinterpret_cast(s)); return n; } }; @@ -1413,7 +1435,7 @@ class OstreamHashWrapper final : public std::streambuf { bool pass::Hash::run_on_model(const std::shared_ptr& model) { RUN_ON_MODEL_SCOPE(Hash); OstreamHashWrapper xmlHash; - OstreamHashWrapper binHash; + OstreamHashWrapperBin binHash; std::ostream xml(&xmlHash); std::ostream bin(&binHash); diff --git a/src/core/src/runtime/compute_hash.cpp b/src/core/src/runtime/compute_hash.cpp new file mode 100644 index 00000000000000..a23b2db084ff85 --- /dev/null +++ b/src/core/src/runtime/compute_hash.cpp @@ -0,0 +1,884 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// The CRC computation is used for x86. +// The calculations were taken from the article +// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel (December, 2009)". 
+ +#include "openvino/core/visibility.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/runtime/compute_hash.hpp" + +#include +#include + +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) +# include "openvino/reference/utils/registers_pool.hpp" +#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 + +using namespace ov::reference::jit; + +namespace ov { +namespace runtime { + +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) +namespace jit { + +#define GET_OFF(field) offsetof(ComputeHashCallArgs, field) +#define getReg64() RegistersPool::Reg(m_registers_pool) +#define getVmm() RegistersPool::Reg(m_registers_pool) +#define getXmm() RegistersPool::Reg(m_registers_pool) + +enum KernelType { + SINGLE_THREAD = 0, + FIRST_THREAD, + N_THREAD, + FINAL_FOLD +}; + +struct ComputeHashCompileParams { + KernelType type; +}; + +struct ComputeHashCallArgs { + const void* src_ptr = nullptr; + void* dst_ptr = nullptr; + const void* k_ptr = nullptr; + void* intermediate_ptr = nullptr; + uint64_t work_amount = 0lu; + uint64_t size = 0lu; + uint64_t threads_num = 1lu; +}; + +typedef void (*hash_kernel)(const ComputeHashCallArgs*); + +static const uint8_t SHUF_MASK[16] = { 0b00001111, 0b00001110, 0b00001101, 0b00001100, 0b00001011, 0b00001010, 0b00001001, 0b00001000, + 0b00000111, 0b00000110, 0b00000101, 0b00000100, 0b00000011, 0b00000010, 0b00000001, 0b00000000 }; + +constexpr uint64_t CRC_VAL = 0xffffffffffffffff; + +// POLYNOM(x) = 0x42F0E1EBA9EA3693 +constexpr uint64_t K_2 = 0x05f5c3c7eb52fab6; +constexpr uint64_t P_1 = 0x578d29d06cc4f872; +constexpr uint64_t P_2 = 0x42f0e1eba9ea3693; +static const uint64_t K_PULL[] = { + K_2, 0x0000000000000000, // x^(64*2), x^(64*1) mod P(x) + P_1, P_2, // floor(x^128/P(x))-x^64, P(x)-x^64 + K_2, 0x4eb938a7d257740e, // x^(64*2), x^(64*3) + 0x571bee0a227ef92b, 0x44bef2a201b5200c, // x^(64*4), x^(64*5) + 0x54819d8713758b2c, 0x4a6b90073eb0af5a, // x^(64*6), x^(64*7) + 0x5f6843ca540df020, 0xddf4b6981205b83f, 
// x^(64*8), x^(64*9) + 0x097c516e98bd2e73, 0x0b76477b31e22e7b, // x^(64*10), x^(64*11) + 0x9af04e1eff82d0dd, 0x6e82e609297f8fe8, // x^(64*12), x^(64*13) + 0xe464f4df5fb60ac1, 0xb649c5b35a759cf2, // x^(64*14), x^(64*14) + 0x05cf79dea9ac37d6, 0x001067e571d7d5c2 // x^(64*16), x^(64*17) + }; + +constexpr uint64_t K_1_0_OFF = 0lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_P_P_OFF = 1lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_2_3_OFF = 2lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_4_5_OFF = 3lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_6_7_OFF = 4lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_8_9_OFF = 5lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_10_11_OFF = 6lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_12_13_OFF = 7lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_14_15_OFF = 8lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_16_17_OFF = 9lu * 2lu * sizeof(uint64_t); + +class HashBase : public Generator { +protected: + void (*ker_fn)(const ComputeHashCallArgs*); +public: + HashBase(cpu_isa_t isa) : Generator(isa) {} + + virtual void generate() = 0; + + void operator()(const ComputeHashCallArgs* args) { + ker_fn(args); + } + + virtual void create_kernel() { + generate(); + ker_fn = (decltype(ker_fn))getCode(); + OPENVINO_ASSERT(ker_fn, "[ CORE ] Could not generate kernel code."); + } +}; + +template +class ComputeHash : public HashBase { +public: + explicit ComputeHash(const ComputeHashCompileParams& jcp) : HashBase(isa), m_jcp(jcp) { + if (!mayiuse(cpu_isa_t::pclmulqdq)) { + OPENVINO_THROW("The current CPU does not support pclmulqdq instruction, which is required for the CRC algorithm."); + } + if (mayiuse(cpu_isa_t::vpclmulqdq)) { + is_vpclmulqdq = true; + } + } + + void generate() override { + m_registers_pool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0}); + + r64_src_ptr = getReg64(); + r64_dst_ptr = getReg64(); + r64_work_amount = getReg64(); + r64_k_ptr = getReg64(); + r64_aux = getReg64(); + v_k_2_3 = getVmm(); 
+ v_shuf_mask = getVmm(); + auto v_dst = getVmm(); + + this->preamble(); + + initialize(v_dst); + bulk_fold(v_dst); + join(v_dst); + fold_to_128(v_dst); + fold_to_64(v_dst); + + this->postamble(); + m_registers_pool.reset(); + } + + static std::shared_ptr create(const ComputeHashCompileParams& params) { + auto kernel = std::make_shared(params); + OPENVINO_ASSERT(kernel, "[ CORE ] Could not create ComputeHash kernel."); + kernel->create_kernel(); + + return kernel; + } + +private: + using Vmm = typename std::conditional::type; + bool is_vpclmulqdq = false; + + ComputeHashCompileParams m_jcp; + RegistersPool::Ptr m_registers_pool; + + const Xbyak::Reg64 r64_params = abi_param1; + + RegistersPool::Reg r64_src_ptr; + RegistersPool::Reg r64_dst_ptr; + RegistersPool::Reg r64_work_amount; + RegistersPool::Reg r64_k_ptr; + RegistersPool::Reg r64_aux; + + // Vector registers + RegistersPool::Reg v_k_2_3; + RegistersPool::Reg v_shuf_mask; + + void initialize(const Vmm& v_dst); + + void bulk_fold(const Vmm& v_dst); + + void join(const Vmm& v_dst); + + void fold_to_128(const Vmm& v_dst); + + void fold_to_64(const Vmm& v_dst); + + void uni_vpxorq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src_0, const Xbyak::Xmm& v_src_1); + + void uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0); + + void uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0); + + void uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0); + + void partial_load(const Xbyak::Xmm& xmm_dst, const Xbyak::Address& src_addr, const Xbyak::Reg64& r64_load_num); + + void partial_load(const Xbyak::Ymm& ymm_dst, const Xbyak::Address& src_addr, const Xbyak::Reg64& r64_load_num); +}; + +template <> +void ComputeHash::uni_vpxorq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src_0, const Xbyak::Xmm& v_src_1) { + vpxorq(v_dst, v_src_0, v_src_1); +} +template +void ComputeHash::uni_vpxorq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src_0, const Xbyak::Xmm& v_src_1) { 
+ vpxor(v_dst, v_src_0, v_src_1); +} +template <> +void ComputeHash::uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0) { + vmovdqu64(v_dst, v_src_0); +} +template +void ComputeHash::uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0) { + vmovdqu(v_dst, v_src_0); +} +template <> +void ComputeHash::uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0) { + vmovdqu64(v_dst, v_src_0); +} +template +void ComputeHash::uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0) { + vmovdqu(v_dst, v_src_0); +} +template <> +void ComputeHash::uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0) { + vbroadcasti64x2(v_dst, v_src_0); +} +template +void ComputeHash::uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0) { + vbroadcasti128(v_dst, v_src_0); +} +template <> +void ComputeHash::partial_load(const Xbyak::Xmm& xmm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + Xbyak::Label l_mv_mask; + auto rOnes = getReg64(); + auto k_load_mask = RegistersPool::Reg(m_registers_pool); + + mov(rOnes, 0xFFFFFFFFFFFFFFFF); + cmp(r64_load_num, 0x3f); + jg(l_mv_mask); + + shlx(rOnes, rOnes, r64_load_num); + not_(rOnes); + + L(l_mv_mask); + kmovq(k_load_mask, rOnes); + + vmovdqu8(Vmm(xmm_dst.getIdx()) | k_load_mask | T_z, ptr[r64_src_ptr]); +} +template +void ComputeHash::partial_load(const Xbyak::Xmm& xmm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + Xbyak::Label l_partial, l_end; + + cmp(r64_load_num, xmm_len); + jl(l_partial, T_NEAR); + uni_vmovdqu64(xmm_dst, ptr[src_addr.getRegExp()]); + jmp(l_end, T_NEAR); + + L(l_partial); { + uni_vpxorq(xmm_dst, xmm_dst, xmm_dst); + for (size_t j = 0lu; j < xmm_len - 1; j++) { + cmp(r64_load_num, j); + jle(l_end, T_NEAR); + pinsrb(xmm_dst, ptr[src_addr.getRegExp() + j], j); + } + } + + L(l_end); +} +template <> +void ComputeHash::partial_load(const Xbyak::Ymm& xmm_dst, + const 
Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + partial_load(Xbyak::Xmm(xmm_dst.getIdx()), src_addr, r64_load_num); +} +template +void ComputeHash::partial_load(const Xbyak::Ymm& ymm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + Xbyak::Label l_xmm, l_partial, l_end; + auto xmm_dst = Xbyak::Xmm(ymm_dst.getIdx()); + + cmp(r64_load_num, ymm_len); + jl(l_xmm, T_NEAR); + uni_vmovdqu64(ymm_dst, ptr[src_addr.getRegExp()]); + jmp(l_end, T_NEAR); + + L(l_xmm); + uni_vpxorq(ymm_dst, ymm_dst, ymm_dst); + cmp(r64_load_num, xmm_len); + jl(l_partial, T_NEAR); + uni_vmovdqu64(xmm_dst, ptr[src_addr.getRegExp()]); + je(l_end, T_NEAR); + + { + Xbyak::Label l_rest_loop, l_perm; + + vperm2i128(ymm_dst, ymm_dst, ymm_dst, 0x1); + for (size_t j = 0lu; j < xmm_len - 1; j++) { + cmp(r64_load_num, xmm_len + j); + jle(l_perm, T_NEAR); + pinsrb(xmm_dst, ptr[src_addr.getRegExp() + xmm_len + j], j); + } + L(l_perm); + vperm2i128(ymm_dst, ymm_dst, ymm_dst, 0x1); + } + jmp(l_end, T_NEAR); + + L(l_partial); { + for (size_t j = 0lu; j < xmm_len - 1; j++) { + cmp(r64_load_num, j); + jle(l_end, T_NEAR); + pinsrb(xmm_dst, ptr[src_addr.getRegExp() + j], j); + } + } + + L(l_end); +} + +template +void ComputeHash::initialize(const Vmm& v_dst) { + mov(r64_src_ptr, ptr[r64_params + GET_OFF(src_ptr)]); + mov(r64_dst_ptr, ptr[r64_params + GET_OFF(dst_ptr)]); + mov(r64_k_ptr, ptr[r64_params + GET_OFF(k_ptr)]); + mov(r64_work_amount, ptr[r64_params + GET_OFF(work_amount)]); + + uni_vbroadcasti64x2(v_k_2_3, ptr[r64_k_ptr + K_2_3_OFF]); + + mov(r64_aux, reinterpret_cast(SHUF_MASK)); + uni_vbroadcasti64x2(v_shuf_mask, ptr[r64_aux]); + + if (m_jcp.type == SINGLE_THREAD || m_jcp.type == FIRST_THREAD) { + auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_aux = getXmm(); + + // Initial CRC + mov(r64_aux, ptr[r64_params + GET_OFF(size)]); + vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x0); + mov(r64_aux, CRC_VAL); + vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x1); + + // First xor 
with source. + partial_load(v_dst, ptr[r64_src_ptr], r64_work_amount); + vpshufb(v_dst, v_dst, v_shuf_mask); + pxor(xmm_dst, xmm_aux); // The SSE version is used to avoid zeroing out the rest of the Vmm. + if (m_jcp.type == SINGLE_THREAD) { + add(r64_src_ptr, xmm_len); + } + } else if (m_jcp.type == N_THREAD) { + uni_vmovdqu64(v_dst, ptr[r64_src_ptr]); + vpshufb(v_dst, v_dst, v_shuf_mask); + } + if (m_jcp.type == SINGLE_THREAD || m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + sub(r64_work_amount, xmm_len); + } +} + +template <> +void ComputeHash::bulk_fold(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FIRST_THREAD && m_jcp.type != N_THREAD) { + return; + } + Xbyak::Label l_fold_loop, l_end; + cmp(r64_work_amount, 2 * get_vlen() - xmm_len); + jl(l_end, T_NEAR); + + auto v_src_0 = getVmm(); + auto v_dst_0 = getVmm(); + auto v_dst_1 = getVmm(); + auto v_dst_2 = getVmm(); + auto& v_dst_3 = v_dst; + auto v_k_loop = getVmm(); + auto v_aux_0 = getVmm(); + + auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); + auto xmm_src_1 = getXmm(); + auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); + auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); + auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx()); + auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx()); + auto xmm_k_loop = Xbyak::Xmm(v_k_loop.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); + auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); + + RegistersPool::Reg r64_bulk_step; + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + r64_bulk_step = getReg64(); + mov(r64_bulk_step, ptr[r64_params + GET_OFF(threads_num)]); + sal(r64_bulk_step, static_cast(std::log2(get_vlen()))); // * vlen + } + + if (m_jcp.type == SINGLE_THREAD) { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_8_9_OFF]); + } else { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_16_17_OFF]); + } + + uni_vmovdqu64(v_dst_0, v_dst); + + if (!is_vpclmulqdq) { + 
vextracti64x2(xmm_dst_1, v_dst_0, 0x1); + vextracti64x2(xmm_dst_2, v_dst_0, 0x2); + vextracti64x2(xmm_dst_3, v_dst_0, 0x3); + } + + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + add(r64_src_ptr, r64_bulk_step); + prefetcht2(ptr[r64_src_ptr + 16384]); + } else { + add(r64_src_ptr, get_vlen() - xmm_len); + prefetcht2(ptr[r64_src_ptr + 4096]); + } + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + sub(r64_work_amount, 2 * get_vlen() - xmm_len); + + L(l_fold_loop); { + uni_vmovdqu64(v_src_0, ptr[r64_src_ptr]); + vpshufb(v_src_0, v_src_0, v_shuf_mask); + + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + add(r64_src_ptr, r64_bulk_step); + prefetcht2(ptr[r64_src_ptr + 16384]); + } else { + add(r64_src_ptr, get_vlen()); + prefetcht2(ptr[r64_src_ptr + 4096]); + } + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + if (is_vpclmulqdq) { + vpclmulqdq(v_aux_0, v_dst_0, v_k_loop, 0b00000000); + vpclmulqdq(v_dst_0, v_dst_0, v_k_loop, 0b00010001); + uni_vpxorq(v_aux_0, v_aux_0, v_src_0); + uni_vpxorq(v_dst_0, v_dst_0, v_aux_0); + } else { + // 0 + vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); + uni_vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); + + // 1 + vextracti64x2(xmm_src_1, v_src_0, 0x1); + vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); + + // 2 + vextracti64x2(xmm_src_1, v_src_0, 0x2); + vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_2, xmm_dst_2, xmm_aux_0); + + // 3 + vextracti64x2(xmm_src_1, v_src_0, 0x3); + vpclmulqdq(xmm_aux_0, xmm_dst_3, xmm_k_loop, 0b00000000); + 
vpclmulqdq(xmm_dst_3, xmm_dst_3, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + } + + sub(r64_work_amount, get_vlen()); + jge(l_fold_loop, T_NEAR); + } + add(r64_work_amount, get_vlen()); + + if (m_jcp.type == SINGLE_THREAD) { + if (is_vpclmulqdq) { + vextracti64x2(xmm_dst_1, v_dst_0, 0x1); + vextracti64x2(xmm_dst_2, v_dst_0, 0x2); + vextracti64x2(xmm_dst_3, v_dst_0, 0x3); + } + + vpclmulqdq(xmm_aux_0, xmm_dst_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0); + + vpclmulqdq(xmm_aux_0, xmm_dst_1, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000); + vpclmulqdq(xmm_dst_1, xmm_dst_1, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_1); + + vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2); + } else { + if (is_vpclmulqdq) { + uni_vmovdqu64(ptr[r64_dst_ptr], v_dst_0); + } else { + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 0lu], xmm_dst_0); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 1lu], xmm_dst_1); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 2lu], xmm_dst_2); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 3lu], xmm_dst_3); + } + } + + L(l_end); +} + +template +void ComputeHash::bulk_fold(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FIRST_THREAD && m_jcp.type != N_THREAD) { + return; + } + Xbyak::Label l_fold_loop, l_end; + cmp(r64_work_amount, 2 * get_vlen() - xmm_len); + jl(l_end, T_NEAR); + + auto v_src_0 = getVmm(); + auto v_dst_0 = getVmm(); + auto& v_dst_1 = v_dst; + auto v_aux_0 = getVmm(); + auto v_k_loop = getVmm(); + + auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); + auto 
xmm_src_1 = getXmm(); + auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); + auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); + auto xmm_k_loop = Xbyak::Xmm(v_k_loop.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); + + RegistersPool::Reg r64_bulk_step; + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + r64_bulk_step = getReg64(); + mov(r64_bulk_step, ptr[r64_params + GET_OFF(threads_num)]); + sal(r64_bulk_step, static_cast(std::log2(get_vlen()))); // * vlen + } + + if (m_jcp.type == SINGLE_THREAD) { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_4_5_OFF]); + } else { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_8_9_OFF]); + } + + uni_vmovdqu64(v_dst_0, v_dst); + + if (!is_vpclmulqdq) { + vextracti128(xmm_dst_1, v_dst_0, 0x1); + } + + if (m_jcp.type == SINGLE_THREAD) { + add(r64_src_ptr, get_vlen() - xmm_len); + } else { + add(r64_src_ptr, r64_bulk_step); + } + prefetcht2(ptr[r64_src_ptr + 4096]); + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + sub(r64_work_amount, 2 * get_vlen() - xmm_len); + + L(l_fold_loop); { + uni_vmovdqu64(v_src_0, ptr[r64_src_ptr]); + vpshufb(v_src_0, v_src_0, v_shuf_mask); + + if (m_jcp.type == SINGLE_THREAD) { + add(r64_src_ptr, get_vlen()); + } else { + add(r64_src_ptr, r64_bulk_step); + } + prefetcht2(ptr[r64_src_ptr + 4096]); + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + if (is_vpclmulqdq) { + vpclmulqdq(v_aux_0, v_dst_0, v_k_loop, 0b00000000); + vpclmulqdq(v_dst_0, v_dst_0, v_k_loop, 0b00010001); + uni_vpxorq(v_aux_0, v_aux_0, v_src_0); + uni_vpxorq(v_dst_0, v_dst_0, v_aux_0); + } else { + // 0 + vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); + uni_vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); + // 1 + vextracti128(xmm_src_1, v_src_0, 0x1); + vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_loop, 
0b00000000); + vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); + } + + sub(r64_work_amount, get_vlen()); + jge(l_fold_loop, T_NEAR); + } + add(r64_work_amount, get_vlen()); + + if (m_jcp.type == SINGLE_THREAD) { + if (is_vpclmulqdq) { + vextracti128(xmm_dst_1, v_dst_0, 0x1); + } + vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_dst_0); + } else { + if (is_vpclmulqdq) { + uni_vmovdqu64(ptr[r64_dst_ptr], v_dst_0); + } else { + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 0lu], xmm_dst_0); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 1lu], xmm_dst_1); + } + } + + L(l_end); +} + +template <> +void ComputeHash::join(const Vmm& v_dst) { + if (m_jcp.type != FINAL_FOLD) { + return; + } + + mov(r64_aux, ptr[r64_params + GET_OFF(intermediate_ptr)]); + prefetcht0(ptr[r64_aux + 1024]); + + auto xmm_src_0 = getXmm(); + auto xmm_src_last = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_aux_0 = getXmm(); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + + uni_vmovdqu64(xmm_src_last, ptr[r64_aux + xmm_len * 7]); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_14_15_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_14_15_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_12_13_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_12_13_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 2lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + 
K_10_11_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_10_11_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 3lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_8_9_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_8_9_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 4lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 5lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 6lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); +} + +template +void ComputeHash::join(const Vmm& v_dst) { + if (m_jcp.type != FINAL_FOLD) { + return; + } + + mov(r64_aux, ptr[r64_params + GET_OFF(intermediate_ptr)]); + prefetcht0(ptr[r64_aux + 1024]); + + auto xmm_src_0 = getXmm(); + auto xmm_src_last = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_aux_0 = getXmm(); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + + uni_vmovdqu64(xmm_src_last, ptr[r64_aux + xmm_len * 3]); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 0lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 
0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 1lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 2lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); +} + +template +void ComputeHash::fold_to_128(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FINAL_FOLD) { + return; + } + Xbyak::Label l_fold_loop, l_end; + cmp(r64_work_amount, xmm_len); + jl(l_end, T_NEAR); + + auto xmm_src = getXmm(); + auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); + auto xmm_aux = getXmm(); + + L(l_fold_loop); { + uni_vmovdqu64(xmm_src, ptr[r64_src_ptr]); + vpshufb(xmm_src, xmm_src, xmm_shuf_mask); + + vpclmulqdq(xmm_aux, xmm_dst, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst, xmm_dst, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + uni_vpxorq(xmm_dst, xmm_dst, xmm_src); + + add(r64_src_ptr, xmm_len); + sub(r64_work_amount, xmm_len); + cmp(r64_work_amount, xmm_len); + jge(l_fold_loop, T_NEAR); + } + + L(l_end); +} + +template +void ComputeHash::fold_to_64(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FINAL_FOLD) { + return; + } + Xbyak::Label l_fold_to_64; + cmp(r64_work_amount, 0); + jle(l_fold_to_64, T_NEAR); + + auto xmm_src = getXmm(); + auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); + auto 
xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); + auto xmm_aux = getXmm(); + auto xmm_aux_1 = getXmm(); + auto xmm_aux_2 = getXmm(); + + partial_load(xmm_src, ptr[r64_src_ptr], r64_work_amount); + vpshufb(xmm_src, xmm_src, xmm_shuf_mask); + + vpclmulqdq(xmm_aux, xmm_dst, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst, xmm_dst, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_aux, xmm_aux, xmm_src); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + + L(l_fold_to_64); + + mov(r64_aux, K_2); + vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x0); + vpclmulqdq(xmm_aux, xmm_dst, xmm_aux, 0b00000001); + vpslldq(xmm_dst, xmm_dst, 0x8); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + + mov(r64_aux, P_1); + vpinsrq(xmm_aux_2, xmm_aux_2, r64_aux, 0x0); + vpclmulqdq(xmm_aux, xmm_dst, xmm_aux_2, 0b00000001); + mov(r64_aux, 0x0); + vpinsrq(xmm_aux_1, xmm_dst, r64_aux, 0x0); + uni_vpxorq(xmm_aux, xmm_aux, xmm_aux_1); + vpinsrq(xmm_aux_1, xmm_aux, r64_aux, 0x0); + + mov(r64_aux, P_2); + vpinsrq(xmm_aux_2, xmm_aux_2, r64_aux, 0x1); + vpclmulqdq(xmm_aux, xmm_aux, xmm_aux_2, 0b00010001); + uni_vpxorq(xmm_aux, xmm_aux, xmm_aux_1); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + + vpextrq(ptr[r64_dst_ptr], xmm_dst, 0x0); +} + +} // namespace jit +#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 + +size_t compute_hash(const void* src, size_t size) { +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) + if (Generator::mayiuse(avx2)) { + uint64_t result = 0lu; + + // Parallel section + constexpr size_t min_wa_per_thread = 131072lu; // 2^17 + if (size >= min_wa_per_thread * 2lu) { + static auto first_thr_kernel = Generator::mayiuse(avx512_core) ? + jit::ComputeHash::create({jit::FIRST_THREAD}) : jit::ComputeHash::create({jit::FIRST_THREAD}); + static auto n_thr_kernel = Generator::mayiuse(avx512_core) ? + jit::ComputeHash::create({jit::N_THREAD}) : jit::ComputeHash::create({jit::N_THREAD}); + static auto final_fold_kernel = Generator::mayiuse(avx512_core) ? 
+ jit::ComputeHash::create({jit::FINAL_FOLD}) : jit::ComputeHash::create({jit::FINAL_FOLD}); + + static const size_t max_thr_num = 2lu; + size_t thr_num = std::min(size / min_wa_per_thread, max_thr_num); + const uint64_t el_per_thread = first_thr_kernel->get_vlen() * ( (size / thr_num) / first_thr_kernel->get_vlen()); + std::vector intermediate(thr_num * first_thr_kernel->get_vlen()); + + parallel_nt_static(thr_num, [&](const int ithr, const int nthr) { + uint64_t start = ithr * el_per_thread; + if (start >= size) { + return; + } + uint64_t work_amount = (el_per_thread + start > size) ? size - start : el_per_thread; + + jit::ComputeHashCallArgs args; + + args.src_ptr = reinterpret_cast(src) + first_thr_kernel->get_vlen() * ithr; + args.dst_ptr = &(intermediate[ithr * first_thr_kernel->get_vlen()]); + args.k_ptr = jit::K_PULL; + args.work_amount = work_amount; + args.size = size; + args.threads_num = thr_num; + + if (ithr == 0) { + (*first_thr_kernel)(&args); + } else { + (*n_thr_kernel)(&args); + } + }); + + jit::ComputeHashCallArgs args; + args.src_ptr = reinterpret_cast(src) + size - args.work_amount; + args.dst_ptr = &result; + args.k_ptr = jit::K_PULL; + args.work_amount = size - el_per_thread * thr_num; + args.size = size; + args.intermediate_ptr = intermediate.data(); + + (*final_fold_kernel)(&args); + } else { + static auto single_thr_kernel = Generator::mayiuse(avx512_core) + ? 
jit::ComputeHash::create({jit::SINGLE_THREAD}) : jit::ComputeHash::create({jit::SINGLE_THREAD}); + + jit::ComputeHashCallArgs args; + args.src_ptr = src; + args.dst_ptr = &result; + args.k_ptr = jit::K_PULL; + args.work_amount = size; + args.size = size; + + (*single_thr_kernel)(&args); + } + + return result; + } + +#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 + + constexpr auto cel_size = sizeof(size_t); + size_t seed = size; + const auto data = static_cast(src); + const auto d_end = std::next(data, size / cel_size); + // The constant value used as a magic number has been + // traditionally used e.g. in boost library's hash_combine. + // It happens to be derived from the golden ratio. + for (auto d = data; d != d_end; ++d) { + seed ^= *d + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + size_t last_bytes{0}; + std::memcpy(&last_bytes, d_end, size % cel_size); + seed ^= last_bytes + 0x9e3779b9 + (seed << 6) + (seed >> 2); + + return seed; +} + +} // namespace runtime +} // namespace ov