diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index bc42ffca8a3cf6..eb95a04337a7f8 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -49,6 +49,9 @@ target_include_directories(openvino_core_dev INTERFACE $ $) +target_include_directories(openvino_core_dev SYSTEM INTERFACE + $:$>>) + target_link_libraries(openvino_core_dev INTERFACE openvino::itt openvino::util) set_target_properties(openvino_core_dev PROPERTIES EXPORT_NAME core::dev) diff --git a/src/core/dev_api/openvino/runtime/compute_hash.hpp b/src/core/dev_api/openvino/runtime/compute_hash.hpp new file mode 100644 index 00000000000000..47a90d589be4ee --- /dev/null +++ b/src/core/dev_api/openvino/runtime/compute_hash.hpp @@ -0,0 +1,20 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace runtime { + +/** + * @brief Computes the hash value for the input data + * @param src A pointer to the input data + * @param size The length of the input data in bytes + */ +size_t compute_hash(const void* src, size_t size); + +} // namespace runtime +} // namespace ov diff --git a/src/core/reference/CMakeLists.txt b/src/core/reference/CMakeLists.txt index f7874964233cf5..24c74ede3b5fba 100644 --- a/src/core/reference/CMakeLists.txt +++ b/src/core/reference/CMakeLists.txt @@ -39,8 +39,6 @@ ov_build_target_faster(${TARGET_NAME} ov_set_threading_interface_for(${TARGET_NAME}) -target_compile_definitions(${TARGET_NAME} PRIVATE XBYAK_NO_OP_NAMES XBYAK64) - if(NOT BUILD_SHARED_LIBS) target_compile_definitions(${TARGET_NAME} PUBLIC OPENVINO_STATIC_LIBRARY) endif() @@ -50,9 +48,6 @@ target_include_directories(${TARGET_NAME} PUBLIC $ $) -target_include_directories(${TARGET_NAME} SYSTEM PRIVATE - $:$>>) - find_package(Threads REQUIRED) target_link_libraries(${TARGET_NAME} PRIVATE Threads::Threads openvino::core::dev) diff --git a/src/core/reference/include/openvino/reference/utils/combine_hash.hpp b/src/core/reference/include/openvino/reference/utils/combine_hash.hpp deleted file mode 100644 index 9f1cfdea812494..00000000000000 --- a/src/core/reference/include/openvino/reference/utils/combine_hash.hpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -namespace ov { -namespace runtime { - -size_t combine_hash(const void* src, size_t size); - -} // namespace runtime -} // namespace ov diff --git a/src/core/reference/include/openvino/reference/utils/jit_generator.hpp b/src/core/reference/include/openvino/reference/utils/jit_generator.hpp index 49c5cb6e0e959e..a8f5def4197275 100644 --- a/src/core/reference/include/openvino/reference/utils/jit_generator.hpp +++ b/src/core/reference/include/openvino/reference/utils/jit_generator.hpp @@ -5,94 +5,97 @@ #pragma once #if defined _WIN32 && !defined NOMINMAX -#define NOMINMAX +# define NOMINMAX #endif -#include +#define XBYAK64 +#define XBYAK_NO_OP_NAMES #include +#include + namespace ov { namespace reference { namespace jit { -#ifdef XBYAK64 - static const Xbyak::Operand::Code abi_save_gpr_regs[] = { - Xbyak::Operand::RBX, - Xbyak::Operand::RBP, - Xbyak::Operand::R12, - Xbyak::Operand::R13, - Xbyak::Operand::R14, - Xbyak::Operand::R15, +static const Xbyak::Operand::Code abi_save_gpr_regs[] = { + Xbyak::Operand::RBX, + Xbyak::Operand::RBP, + Xbyak::Operand::R12, + Xbyak::Operand::R13, + Xbyak::Operand::R14, + Xbyak::Operand::R15, #ifdef _WIN32 - Xbyak::Operand::RDI, - Xbyak::Operand::RSI, + Xbyak::Operand::RDI, + Xbyak::Operand::RSI, #endif - }; +}; #ifdef _WIN32 -#define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX +# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX #else -#define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI +# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI #endif -#endif // XBYAK64 - - typedef enum { - isa_any, - sse42, - avx, - avx2, - avx512_common, - avx512_core, - avx512_core_vnni, - avx512_mic, - avx512_mic_4ops, - avx512_core_bf16, - avx512_vpopcnt, - fp16, - pclmulqdq, - vpclmulqdq - } cpu_isa_t; - - class Generator : public Xbyak::CodeGenerator - { + +typedef enum { + isa_any, + sse42, + avx, + avx2, + avx512_common, + avx512_core, + avx512_core_vnni, + avx512_mic, + avx512_mic_4ops, + avx512_core_bf16, + avx512_vpopcnt, + fp16, + pclmulqdq, + vpclmulqdq +} cpu_isa_t; + +class Generator : public Xbyak::CodeGenerator { #ifdef _WIN32 - static constexpr size_t xmm_to_preserve_start = 6; - static constexpr size_t xmm_to_preserve = 10; + static constexpr size_t xmm_to_preserve_start = 6llu; + static constexpr size_t xmm_to_preserve = 10llu; #else - static constexpr size_t xmm_to_preserve_start = 0; - static constexpr size_t xmm_to_preserve = 0; + static constexpr size_t xmm_to_preserve_start = 0lu; + static constexpr size_t xmm_to_preserve = 0lu; #endif - static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); - const size_t size_of_abi_save_regs; + static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); + const size_t size_of_abi_save_regs; + + const Xbyak::Reg64 reg_EVEX_max_8b_offt; + static constexpr int EVEX_max_8b_offt = 0x200; + size_t m_vlen = ymm_len; - const Xbyak::Reg64 reg_EVEX_max_8b_offt; - static constexpr int EVEX_max_8b_offt = 0x200; +public: + static constexpr size_t xmm_len = 16lu; + static constexpr size_t ymm_len = 32lu; + static constexpr size_t zmm_len = 64lu; - public: - static constexpr size_t xmm_len = 16; - static constexpr size_t ymm_len = 32; - static constexpr size_t zmm_len = 64; + const Xbyak::Reg64 param = abi_param1; - const Xbyak::Reg64 param = abi_param1; + static bool mayiuse(const cpu_isa_t cpu_isa); + static bool is_x64(); - static bool mayiuse(const cpu_isa_t cpu_isa); - static bool is_x64(); + Generator(cpu_isa_t isa = avx2, void* code_ptr = nullptr, size_t code_size = 16lu * 1024lu); + void preamble(); + void postamble(); - Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024); - void preamble(); - void postamble(); + void foreach (const Xbyak::Reg64& idx, + size_t step, + const Xbyak::Reg64& end, + std::function && fn); - void foreach (const Xbyak::Reg64& idx, - size_t step, - const Xbyak::Reg64& end, - std::function && fn); + template + void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); - template - void copy(const Xbyak::Reg64& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size); - }; + size_t get_vlen() { + return m_vlen; + } +}; } // namespace jit -} // namespace reference +} // namespace reference } // namespace ov diff --git a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp index 59ddd11596b980..1ac29ba7c6ab83 100644 --- a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp +++ b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp @@ -4,15 +4,14 @@ #pragma once -#include "jit_generator.hpp" -#include "openvino/core/except.hpp" - #include #include #include +#include "openvino/core/except.hpp" +#include "openvino/reference/utils/jit_generator.hpp" namespace ov { -namespace runtime { +namespace reference { namespace jit { class RegistersPool { @@ -21,25 +20,44 @@ class RegistersPool { using WeakPtr = std::weak_ptr; static constexpr int anyIdx = -1; - template + template class Reg { friend class RegistersPool; + public: Reg() {} - Reg(const RegistersPool::Ptr& regPool) { initialize(regPool); } - Reg(const RegistersPool::Ptr& regPool, int requestedIdx) { initialize(regPool, requestedIdx); } - ~Reg() { release(); } - Reg& operator=(Reg&& other) noexcept { + Reg(const RegistersPool::Ptr& regPool) { + initialize(regPool); + } + Reg(const RegistersPool::Ptr& regPool, int requestedIdx) { + initialize(regPool, requestedIdx); + } + ~Reg() { + release(); + } + Reg& operator=(Reg&& other) noexcept { release(); reg = other.reg; regPool = std::move(other.regPool); return *this; } - Reg(Reg&& other) noexcept : reg(other.reg), regPool(std::move(other.regPool)) {} - operator TReg&() { ensureValid(); return reg; } - operator const TReg&() const { ensureValid(); return reg; } - operator Xbyak::RegExp() const { ensureValid(); return reg; } - int getIdx() const { ensureValid(); return reg.getIdx(); } + Reg(Reg&& other) noexcept : reg(other.reg), regPool(std::move(other.regPool)) {} + operator TReg&() { + ensureValid(); + return reg; + } + operator const TReg&() const { + ensureValid(); + return reg; + } + operator Xbyak::RegExp() const { + ensureValid(); + return reg; + } + int getIdx() const { + ensureValid(); + return reg.getIdx(); + } friend Xbyak::RegExp operator+(const Reg& lhs, const Xbyak::RegExp& rhs) { lhs.ensureValid(); return lhs.operator Xbyak::RegExp() + rhs; @@ -50,7 +68,9 @@ class RegistersPool { regPool.reset(); } } - bool isInitialized() const { return !regPool.expired(); } + bool isInitialized() const { + return !regPool.expired(); + } private: void ensureValid() const { @@ -74,12 +94,12 @@ class RegistersPool { checkUniqueAndUpdate(false); } - template + template static Ptr create(std::initializer_list regsToExclude); static Ptr create(cpu_isa_t isa, std::initializer_list regsToExclude); - template + template size_t countFree() const { if (std::is_base_of::value) { return simdSet.countUnused(); @@ -158,12 +178,17 @@ class RegistersPool { std::vector isFreeIndexVector; }; - virtual int getFreeOpmask(int requestedIdx) { OPENVINO_THROW("getFreeOpmask: The Opmask is not supported in current instruction set"); } - virtual void returnOpmaskToPool(int idx) { OPENVINO_THROW("returnOpmaskToPool: The Opmask is not supported in current instruction set"); } - virtual size_t countUnusedOpmask() const { OPENVINO_THROW("countUnusedOpmask: The Opmask is not supported in current instruction set"); } + virtual int getFreeOpmask(int requestedIdx) { + OPENVINO_THROW("getFreeOpmask: The Opmask is not supported in current instruction set"); + } + virtual void returnOpmaskToPool(int idx) { + OPENVINO_THROW("returnOpmaskToPool: The Opmask is not supported in current instruction set"); + } + virtual size_t countUnusedOpmask() const { + OPENVINO_THROW("countUnusedOpmask: The Opmask is not supported in current instruction set"); + } - RegistersPool(int simdRegistersNumber) - : simdSet(simdRegistersNumber) { + RegistersPool(int simdRegistersNumber) : simdSet(simdRegistersNumber) { checkUniqueAndUpdate(); generalSet.exclude(Xbyak::Reg64(Xbyak::Operand::RSP)); generalSet.exclude(Xbyak::Reg64(Xbyak::Operand::RAX)); @@ -173,7 +198,7 @@ class RegistersPool { } RegistersPool(std::initializer_list regsToExclude, int simdRegistersNumber) - : simdSet(simdRegistersNumber) { + : simdSet(simdRegistersNumber) { checkUniqueAndUpdate(); for (auto& reg : regsToExclude) { if (reg.isXMM() || reg.isYMM() || reg.isZMM()) { @@ -186,7 +211,7 @@ class RegistersPool { } private: - template + template int getFree(int requestedIdx) { if (std::is_base_of::value) { auto idx = simdSet.getUnused(requestedIdx); @@ -202,7 +227,7 @@ class RegistersPool { } } - template + template void returnToPool(const TReg& reg) { if (std::is_base_of::value) { simdSet.setAsUnused(reg.getIdx()); @@ -226,7 +251,7 @@ class RegistersPool { } } - PhysicalSet generalSet {16}; + PhysicalSet generalSet{16}; PhysicalSet simdSet; }; @@ -240,11 +265,11 @@ template <> class IsaRegistersPool : public RegistersPool { public: IsaRegistersPool() : RegistersPool(32) { - opmaskSet.exclude(Xbyak::Opmask(0)); // the Opmask(0) has special meaning for some instructions, like gather instruction + opmaskSet.exclude( + Xbyak::Opmask(0)); // the Opmask(0) has special meaning for some instructions, like gather instruction } - IsaRegistersPool(std::initializer_list regsToExclude) - : RegistersPool(regsToExclude, 32) { + IsaRegistersPool(std::initializer_list regsToExclude) : RegistersPool(regsToExclude, 32) { for (auto& reg : regsToExclude) { if (reg.isOPMASK()) { opmaskSet.exclude(reg); @@ -267,7 +292,7 @@ class IsaRegistersPool : public RegistersPool { } protected: - PhysicalSet opmaskSet {8}; + PhysicalSet opmaskSet{8}; }; template <> @@ -289,9 +314,10 @@ RegistersPool::Ptr RegistersPool::create(std::initializer_list regsT return std::make_shared>(regsToExclude); } -inline -RegistersPool::Ptr RegistersPool::create(cpu_isa_t isa, std::initializer_list regsToExclude) { -#define ISA_SWITCH_CASE(isa) case isa: return std::make_shared>(regsToExclude); +inline RegistersPool::Ptr RegistersPool::create(cpu_isa_t isa, std::initializer_list regsToExclude) { +#define ISA_SWITCH_CASE(isa) \ + case isa: \ + return std::make_shared>(regsToExclude); switch (isa) { ISA_SWITCH_CASE(sse42) ISA_SWITCH_CASE(avx) @@ -299,14 +325,14 @@ RegistersPool::Ptr RegistersPool::create(cpu_isa_t isa, std::initializer_list>(regsToExclude); - default: - OPENVINO_THROW("Invalid isa argument in RegistersPool::create(): ", isa); - } - OPENVINO_THROW("Invalid isa argument in RegistersPool::create()"); + case avx512_vpopcnt: + return std::make_shared>(regsToExclude); + default: + OPENVINO_THROW("Invalid isa argument in RegistersPool::create(): ", isa); + } #undef ISA_SWITCH_CASE } -} // namespace jit -} // namespace runtime -} // namespace ov +} // namespace jit +} // namespace reference +} // namespace ov diff --git a/src/core/reference/src/op/jit_generator.hpp b/src/core/reference/src/op/jit_generator.hpp deleted file mode 100644 index b4b9cd7a60c23f..00000000000000 --- a/src/core/reference/src/op/jit_generator.hpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if defined _WIN32 && !defined NOMINMAX -# define NOMINMAX -#endif - -#include - -#include - -namespace ov { -namespace reference { -namespace jit { -#ifdef XBYAK64 -static const Xbyak::Operand::Code abi_save_gpr_regs[] = { - Xbyak::Operand::RBX, - Xbyak::Operand::RBP, - Xbyak::Operand::R12, - Xbyak::Operand::R13, - Xbyak::Operand::R14, - Xbyak::Operand::R15, -# ifdef _WIN32 - Xbyak::Operand::RDI, - Xbyak::Operand::RSI, -# endif -}; - -# ifdef _WIN32 -# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX -# else -# define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI -# endif -#endif // XBYAK64 - -class Generator : public Xbyak::CodeGenerator { - static constexpr size_t xmm_len = 16; - -#ifdef _WIN32 - static constexpr size_t xmm_to_preserve_start = 6; - static constexpr size_t xmm_to_preserve = 10; -#else - static constexpr size_t xmm_to_preserve_start = 0; - static constexpr size_t xmm_to_preserve = 0; -#endif - - static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); - const size_t size_of_abi_save_regs; - - const Xbyak::Reg64 reg_EVEX_max_8b_offt; - static constexpr int EVEX_max_8b_offt = 0x200; - -public: - const Xbyak::Reg64 param = abi_param1; - - typedef enum { - isa_any, - sse42, - avx, - avx2, - avx512_common, - avx512_core, - avx512_core_vnni, - avx512_mic, - avx512_mic_4ops, - avx512_core_bf16, - avx512_vpopcnt, - fp16 - } cpu_isa_t; - - static bool mayiuse(const cpu_isa_t cpu_isa); - static bool is_x64(); - - Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024); - void preamble(); - void postamble(); - - void foreach (const Xbyak::Reg64& idx, - size_t step, - const Xbyak::Reg64& end, - std::function && fn); - - template - void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); -}; -} // namespace jit -} // namespace reference -} // namespace ov diff --git a/src/core/reference/src/op/utils/combine_hash.cpp b/src/core/reference/src/op/utils/combine_hash.cpp deleted file mode 100644 index 1835155becf711..00000000000000 --- a/src/core/reference/src/op/utils/combine_hash.cpp +++ /dev/null @@ -1,666 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -// The CRC computation is used for x86. -// The calculations were taken from the article -// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel (December, 2009)". - -#include "openvino/core/visibility.hpp" -#include "openvino/core/parallel.hpp" -#include "openvino/reference/utils/combine_hash.hpp" - -#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) -# include "openvino/reference/utils/registers_pool.hpp" -#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 - -#include - -namespace ov { -namespace runtime { - -#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) -namespace jit { - -#define GET_OFF(field) offsetof(CombineHashCallArgs, field) -#define getReg64() RegistersPool::Reg(registersPool) -#define getVmm() RegistersPool::Reg(registersPool) -#define getXmm() RegistersPool::Reg(registersPool) - -struct CombineHashCompileParams { -}; - -struct CombineHashCallArgs { - const void* src_ptr; - void* dst_ptr; - uint64_t work_amount = 0lu; - uint64_t make_64_fold = 0lu; -}; - -typedef void (*fn_t)(const CombineHashCallArgs*); - -template -class CombineHash : public Generator { -public: - explicit CombineHash(const CombineHashCompileParams& jcp) : - m_jcp(jcp) { - if (isa == avx512_core) { - vlen = zmm_len; - } else if (isa == avx2) { - vlen = ymm_len; - } else { - OPENVINO_THROW("Unsupported isa: ", isa); - } - if (!mayiuse(cpu_isa_t::pclmulqdq)) { - OPENVINO_THROW("The current CPU does not support pclmulqdq instruction, which is required for the CRC algorithm."); - } - if (mayiuse(cpu_isa_t::vpclmulqdq)) { - is_vpclmulqdq = true; - } - - generate(); - } - - void generate() { - this->preamble(); - registersPool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0}); - - r64_src = getReg64(); - r64_dst = getReg64(); - r64_work_amount = getReg64(); - r64_make_64_fold = getReg64(); - - mov(r64_src, ptr[r64_params + GET_OFF(src_ptr)]); - mov(r64_dst, ptr[r64_params + GET_OFF(dst_ptr)]); - mov(r64_work_amount, ptr[r64_params + GET_OFF(work_amount)]); - mov(r64_make_64_fold, ptr[r64_params + GET_OFF(make_64_fold)]); - - initVectors(); - bulkFold(v_dst); - restFold(v_dst); - tailFold(v_dst); - - registersPool.reset(); - this->postamble(); - } - - static fn_t get() { - static const CombineHashCompileParams params; - static CombineHash kernel(params); - - return (fn_t)kernel.getCode(); - } - - void fillRestWorkMask(const Xbyak::Opmask& k_dst_mask, - const Xbyak::Reg64& r64_work_rest) { - Xbyak::Label l_mv_mask; - auto rOnes = getReg64(); - - mov(rOnes, 0xFFFFFFFFFFFFFFFF); - cmp(r64_work_rest, 0x3f); - jg(l_mv_mask); - - shlx(rOnes, rOnes, r64_work_rest); - not_(rOnes); - - L(l_mv_mask); - kmovq(k_dst_mask, rOnes); - } - - void partialLoad(const Xbyak::Xmm& xmm_dst, - const Xbyak::Address& src_addr, - const Xbyak::Reg64& r64_load_num) { - Xbyak::Label l_partial, l_end; - - cmp(r64_load_num, xmm_len); - jl(l_partial, T_NEAR); - vmovdqu(xmm_dst, ptr[src_addr.getRegExp()]); - jmp(l_end, T_NEAR); - - L(l_partial); { - size_t offset = xmm_len; - - for (size_t j = 0lu; j < xmm_len - 1; j++) { - pinsrb(xmm_dst, ptr[src_addr.getRegExp() + offset], j); - cmp(r64_load_num, ++offset); - jle(l_end, T_NEAR); - } - } - - L(l_end); - } - - void partialLoad(const Xbyak::Ymm& ymm_dst, - const Xbyak::Address& src_addr, - const Xbyak::Reg64& r64_load_num) { - Xbyak::Label l_xmm, l_partial, l_end; - auto xmm_dst = Xbyak::Xmm(ymm_dst.getIdx()); - - cmp(r64_load_num, ymm_len); - jl(l_xmm, T_NEAR); - vmovdqu(ymm_dst, ptr[src_addr.getRegExp()]); - jmp(l_end, T_NEAR); - - L(l_xmm); - vpxorq(ymm_dst, ymm_dst, ymm_dst); - cmp(r64_load_num, xmm_len); - jl(l_partial, T_NEAR); - vmovdqu(xmm_dst, ptr[src_addr.getRegExp()]); - je(l_end, T_NEAR); - - { - Xbyak::Label l_rest_loop, l_perm; - size_t offset = xmm_len; - - vperm2f128(ymm_dst, ymm_dst, ymm_dst, 0x1); - for (size_t j = 0lu; j < xmm_len - 1; j++) { - pinsrb(xmm_dst, ptr[src_addr.getRegExp() + offset], j); - cmp(r64_load_num, ++offset); - jle(l_perm, T_NEAR); - } - L(l_perm); - vperm2f128(ymm_dst, ymm_dst, ymm_dst, 0x1); - } - jmp(l_end, T_NEAR); - - L(l_partial); { - size_t offset = xmm_len; - - for (size_t j = 0lu; j < xmm_len - 1; j++) { - pinsrb(xmm_dst, ptr[src_addr.getRegExp() + offset], j); - cmp(r64_load_num, ++offset); - jle(l_end, T_NEAR); - } - } - - L(l_end); - } - -private: - static constexpr uint64_t CHUNK_SIZE = 32; - static const uint64_t CRC_VAL; - static const uint64_t CONST_K[12]; - static const uint8_t SHUF_MASK[16]; - - using Vmm = typename std::conditional::type; - size_t vlen = xmm_len; - bool is_vpclmulqdq = false; - - CombineHashCompileParams m_jcp; - RegistersPool::Ptr registersPool; - - RegistersPool::Reg r64_src; - RegistersPool::Reg r64_dst; - RegistersPool::Reg r64_work_amount; - RegistersPool::Reg r64_make_64_fold; - - const Xbyak::Reg64 r64_params = abi_param1; - - // Vector registers - RegistersPool::Reg v_dst; - RegistersPool::Reg v_k_1_2; - RegistersPool::Reg v_k_4_5; - RegistersPool::Reg v_k_8_9; - RegistersPool::Reg v_k_16_17; - RegistersPool::Reg v_shuf_mask; - - size_t getVlen() { - return vlen; - } - - void initVectors(); - - void bulkFold(const Vmm& v_dst); - - void restFold(const Vmm& v_dst) { - Xbyak::Label l_fold_loop, l_end; - cmp(r64_work_amount, xmm_len); - jl(l_end, T_NEAR); - - auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); - auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx()); - auto xmm_src = getXmm(); - auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); - auto xmm_aux = getXmm(); - - L(l_fold_loop); { - vmovdqu64(xmm_src, ptr[r64_src]); - vpshufb(xmm_src, xmm_src, xmm_shuf_mask); - - vpclmulqdq(xmm_aux, xmm_dst, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst, xmm_dst, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - vpxorq(xmm_dst, xmm_dst, xmm_src); - - add(r64_src, xmm_len); - sub(r64_work_amount, xmm_len); - cmp(r64_work_amount, xmm_len); - jge(l_fold_loop, T_NEAR); - } - - L(l_end); - } - - void tailFold(const Vmm& v_dst); -}; - -template <> -void CombineHash::initVectors() { - auto r64_aux = getReg64(); - - v_k_1_2 = getVmm(); - mov(r64_aux, reinterpret_cast(CONST_K)); - vbroadcasti64x2(v_k_1_2, ptr[r64_aux]); - v_k_8_9 = getVmm(); - mov(r64_aux, reinterpret_cast(CONST_K + 6)); - vbroadcasti64x2(v_k_8_9, ptr[r64_aux]); - - v_shuf_mask = getVmm(); - mov(r64_aux, reinterpret_cast(SHUF_MASK)); - vbroadcasti64x2(v_shuf_mask, ptr[r64_aux]); - - v_dst = getVmm(); - auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); - auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); - auto xmm_aux = getXmm(); - auto k_rest_mask = RegistersPool::Reg(registersPool); - // Initial CRC - mov(r64_aux, CRC_VAL); - vpxorq(v_dst, v_dst, v_dst); - vpinsrq(xmm_dst, xmm_dst, r64_work_amount, 0x0); - vpinsrq(xmm_dst, xmm_dst, r64_aux, 0x1); - // First xor with source - fillRestWorkMask(k_rest_mask, r64_work_amount); - vmovdqu8(Xbyak::Xmm(xmm_aux.getIdx()) | k_rest_mask | T_z, ptr[r64_src]); - vpshufb(xmm_aux, xmm_aux, xmm_shuf_mask); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - sub(r64_work_amount, xmm_len); - add(r64_src, xmm_len); -} - -template -void CombineHash::initVectors() { - auto r64_aux = getReg64(); - - v_k_1_2 = getVmm(); - mov(r64_aux, reinterpret_cast(CONST_K)); - vbroadcasti128(v_k_1_2, ptr[r64_aux]); - v_k_8_9 = getVmm(); - mov(r64_aux, reinterpret_cast(CONST_K + 6)); - vbroadcasti128(v_k_8_9, ptr[r64_aux]); - - v_shuf_mask = getVmm(); - mov(r64_aux, reinterpret_cast(SHUF_MASK)); - vbroadcasti128(v_shuf_mask, ptr[r64_aux]); - - v_dst = getVmm(); - auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); - auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); - auto xmm_aux = getXmm(); - auto k_rest_mask = RegistersPool::Reg(registersPool); - // Initial CRC - mov(r64_aux, CRC_VAL); - vpxorq(v_dst, v_dst, v_dst); - vpinsrq(xmm_dst, xmm_dst, r64_aux, 0x1); - // First xor with source - partialLoad(xmm_aux, ptr[r64_src], r64_work_amount); - vpshufb(xmm_aux, xmm_aux, xmm_shuf_mask); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - sub(r64_work_amount, xmm_len); -} - -template <> -void CombineHash::bulkFold(const Vmm& v_dst) { - Xbyak::Label l_fold_loop, l_end; - cmp(r64_work_amount, zmm_len + 3 * xmm_len); - jl(l_end, T_NEAR); - - auto r64_aux = getReg64(); - - auto v_src_0 = getVmm(); - auto v_dst_0 = getVmm(); - auto v_dst_1 = getVmm(); - auto v_dst_2 = getVmm(); - auto& v_dst_3 = v_dst; - auto v_aux_0 = getVmm(); - - auto xmm_k_8_9 = Xbyak::Xmm(v_k_8_9.getIdx()); - auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx()); - auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); - auto xmm_src_1 = getXmm(); - auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); - auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); - auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx()); - auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx()); - auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); - - vmovdqu64(v_dst_0, v_dst_3); - - if (!is_vpclmulqdq) { - prefetchnta(ptr[r64_src + 3 * xmm_len]); - vmovdqu64(xmm_dst_1, ptr[r64_src + 0 * xmm_len]); - vmovdqu64(xmm_dst_2, ptr[r64_src + 1 * xmm_len]); - vmovdqu64(xmm_dst_3, ptr[r64_src + 2 * xmm_len]); - } - - add(r64_src, 3 * xmm_len); - sub(r64_work_amount, zmm_len + 3 * xmm_len); - - L(l_fold_loop); { - vmovdqu64(v_src_0, ptr[r64_src]); - vpshufb(v_src_0, v_src_0, v_shuf_mask); - - if (is_vpclmulqdq) { - vpclmulqdq(v_aux_0, v_dst_0, v_k_8_9, 0b00000000); - vpclmulqdq(v_dst_0, v_dst_0, v_k_8_9, 0b00010001); - vpxorq(v_aux_0, v_aux_0, v_src_0); - vpxorq(v_dst_0, v_dst_0, v_aux_0); - } else { - // 0 - vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_8_9, 0b00000000); - vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_8_9, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); - vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); - // 1 - vextracti64x2(xmm_src_1, v_src_0, 0x1); - vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_8_9, 0b00000000); - vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_8_9, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); - vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); - // 2 - vextracti64x2(xmm_src_1, v_src_0, 0x2); - vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_8_9, 0b00000000); - vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_8_9, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); - vpxorq(xmm_dst_2, xmm_dst_2, xmm_aux_0); - // 3 - vextracti64x2(xmm_src_1, v_src_0, 0x3); - vpclmulqdq(xmm_aux_0, xmm_dst_3, xmm_k_8_9, 0b00000000); - vpclmulqdq(xmm_dst_3, xmm_dst_3, xmm_k_8_9, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - } - - add(r64_src, zmm_len); - sub(r64_work_amount, zmm_len); - jge(l_fold_loop, T_NEAR); - } - add(r64_work_amount, zmm_len); - - if (is_vpclmulqdq) { - auto ymm_dst_0 = Xbyak::Ymm(v_dst_0.getIdx()); - auto ymm_dst_1 = Xbyak::Ymm(v_dst_1.getIdx()); - auto ymm_aux_0 = Xbyak::Ymm(v_aux_0.getIdx()); - - vextracti64x4(ymm_dst_1, v_dst_0, 0x1); - mov(r64_aux, reinterpret_cast(CONST_K + 2)); - vpclmulqdq(ymm_aux_0, ymm_dst_0, ptr[r64_aux], 0b00000000); - vpclmulqdq(ymm_dst_0, ymm_dst_0, ptr[r64_aux], 0b00010001); - vpxorq(ymm_dst_1, ymm_dst_1, ymm_aux_0); - vpxorq(ymm_dst_0, ymm_dst_0, ymm_dst_1); - - vextracti64x2(xmm_dst_3, ymm_dst_0, 0x1); - vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0); - } else { - mov(r64_aux, reinterpret_cast(CONST_K + 4)); - vpclmulqdq(xmm_aux_0, xmm_dst_0, ptr[r64_aux], 0b00000000); - vpclmulqdq(xmm_dst_0, xmm_dst_0, ptr[r64_aux], 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0); - - mov(r64_aux, reinterpret_cast(CONST_K + 2)); - vpclmulqdq(xmm_aux_0, xmm_dst_1, ptr[r64_aux], 0b00000000); - vpclmulqdq(xmm_dst_1, xmm_dst_1, ptr[r64_aux], 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_1); - - vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2); - } - - L(l_end); -} - -template <> -void CombineHash::bulkFold(const Vmm& v_dst) { - Xbyak::Label l_fold_loop, l_end; - cmp(r64_work_amount, 2 * vlen - xmm_len); - jl(l_end, T_NEAR); - - auto r64_aux = getReg64(); - - auto v_src_0 = getVmm(); - auto v_dst_0 = getVmm(); - auto v_dst_1 = getVmm(); - auto v_dst_2 = getVmm(); - auto& v_dst_3 = v_dst; - auto v_aux_0 = getVmm(); - - auto xmm_k_4_5 = Xbyak::Xmm(v_k_4_5.getIdx()); - auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx()); - auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); - auto xmm_src_1 = getXmm(); - auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); - auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); - auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx()); - auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx()); - auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); - - if (!is_vpclmulqdq) { - vmovdqu64(xmm_dst_1, ptr[r64_src + 0 * xmm_len]); - } - - add(r64_src, vlen - xmm_len); - sub(r64_work_amount, 2 * vlen - xmm_len); - - L(l_fold_loop); { - vmovdqu64(v_src_0, ptr[r64_src]); - vpshufb(v_src_0, v_src_0, v_shuf_mask); - - if (is_vpclmulqdq) { - vpclmulqdq(v_aux_0, v_dst_0, v_k_4_5, 0b00000000); - vpclmulqdq(v_dst_0, v_dst_0, v_k_4_5, 0b00010001); - vpxorq(v_aux_0, v_aux_0, v_src_0); - vpxorq(v_dst_0, v_dst_0, v_aux_0); - } else { - // 0 - vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_4_5, 0b00000000); - vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_4_5, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); - vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); - // 1 - vextracti128(xmm_src_1, v_src_0, 0x1); - vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_4_5, 0b00000000); - vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_4_5, 0b00010001); - vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); - vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); - } - - add(r64_src, vlen); - sub(r64_work_amount, vlen); - jge(l_fold_loop, T_NEAR); - } - add(r64_work_amount, vlen); - - if (is_vpclmulqdq) { - auto ymm_dst_0 = Xbyak::Ymm(v_dst_0.getIdx()); - auto ymm_dst_1 = Xbyak::Ymm(v_dst_1.getIdx()); - auto ymm_aux_0 = Xbyak::Ymm(v_aux_0.getIdx()); - - vextracti128(xmm_dst_3, ymm_dst_0, 0x1); - vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0); - } else { - vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_1_2, 0b00010001); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); - vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2); - } - - L(l_end); -} - - -template <> -void CombineHash::tailFold(const Vmm& v_dst) { - Xbyak::Label l_fold_to_64, l_save_128, l_end; - cmp(r64_work_amount, 0); - jle(l_fold_to_64, T_NEAR); - - auto r64_aux = getReg64(); - auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); - auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx()); - auto xmm_src = getXmm(); - auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); - auto xmm_aux = getXmm(); - auto xmm_aux_1 = getXmm(); - auto xmm_aux_2 = getXmm(); - auto k_rest_mask = RegistersPool::Reg(registersPool); - - fillRestWorkMask(k_rest_mask, r64_work_amount); - - vpxorq(xmm_src, xmm_src, xmm_src); - vmovdqu8(Xbyak::Xmm(xmm_src.getIdx()) | k_rest_mask | T_z, ptr[r64_src]); - vpshufb(xmm_src, xmm_src, xmm_shuf_mask); - - vpclmulqdq(xmm_aux, xmm_dst, xmm_k_1_2, 0b00000000); - vpclmulqdq(xmm_dst, xmm_dst, xmm_k_1_2, 0b00010001); - vpxorq(xmm_aux, xmm_aux, xmm_src); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - - L(l_fold_to_64); - cmp(r64_make_64_fold, 0); - je(l_save_128, T_NEAR); - - mov(r64_aux, reinterpret_cast(CONST_K + 8)); - vpclmulqdq(xmm_aux, xmm_dst, ptr[r64_aux], 0b00000001); - vpslldq(xmm_dst, xmm_dst, 0x8); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - - mov(r64_aux, reinterpret_cast(CONST_K + 10)); - vmovdqu64(xmm_aux_2, ptr[r64_aux]); - vpclmulqdq(xmm_aux, xmm_dst, xmm_aux_2, 0b00000001); - mov(r64_aux, 0x0); - vpinsrq(xmm_aux_1, xmm_dst, r64_aux, 0x0); - vpxorq(xmm_aux, xmm_aux, xmm_aux_1); - vpinsrq(xmm_aux_1, xmm_aux, r64_aux, 0x0); - vpclmulqdq(xmm_aux, xmm_aux, xmm_aux_2, 0b00010001); - vpxorq(xmm_aux, xmm_aux, xmm_aux_1); - vpxorq(xmm_dst, xmm_dst, xmm_aux); - - vpextrq(ptr[r64_dst], xmm_dst, 0x0); - jmp(l_end, T_NEAR); - - - L(l_save_128); - vmovdqu64(ptr[r64_dst], xmm_dst); - - L(l_end); -} - -template <> -void CombineHash::tailFold(const Vmm& v_dst) { -} - -template -const uint64_t CombineHash::CRC_VAL = 0xffffffffffffffff; - -// P(x) = 0x42F0E1EBA9EA3693 -template -const uint64_t CombineHash::CONST_K[12] = { 0x05f5c3c7eb52fab6, 0x4eb938a7d257740e, // x^(64*1), x^(64*2) - 0x571bee0a227ef92b, 0x44bef2a201b5200c, // x^(64*3), x^(64*4) - 0x54819d8713758b2c, 0x4a6b90073eb0af5a, // x^(64*5), x^(64*6) - 0x5f6843ca540df020, 0xddf4b6981205b83f, // x^(64*7), x^(64*8) - 0x05f5c3c7eb52fab6, 0x0000000000000000, // x^(64*1), x^(64*1) mod P(x) - 0x578d29d06cc4f872, 0x42f0e1eba9ea3693 // floor(x^128/P(x)) - x^64, P(x) - x^64 - }; - -template -const uint8_t CombineHash::SHUF_MASK[] = { 0b00001111, 0b00001110, 0b00001101, 0b00001100, 0b00001011, 0b00001010, 0b00001001, 0b00001000, - 0b00000111, 0b00000110, 0b00000101, 0b00000100, 0b00000011, 0b00000010, 0b00000001, 0b00000000 }; - -} // namespace jit -#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 - -size_t combine_hash(const void* src, size_t size) { -#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) - jit::fn_t kernel; - - if (jit::Generator::mayiuse(jit::avx512_core)) { - kernel = jit::CombineHash::get(); - } else if (jit::Generator::mayiuse(jit::avx2)) { - kernel = jit::CombineHash::get(); - } - - if (kernel) { - size_t res = 0lu; - - static const size_t block_size = 2lu * jit::Generator::zmm_len; - // There is no sense to perform parallel execution if there are less than 2 blocks. - if (size >= 2lu * block_size) { - const auto nthr = parallel_get_max_threads() / 2; // TODO: WA for Hyper Threading - std::vector intermediate(nthr * 2); // xmm_len * nthr - const uint64_t blocks = size / block_size; - const uint64_t el_per_thread = block_size * ((blocks + nthr - 1) / nthr); - - parallel_nt(nthr, [&](const int ithr, const int nthr) { - uint64_t start = ithr * el_per_thread; - if (start >= size) { - return; - } - uint64_t work_amount = (el_per_thread + start > size) ? size - start : el_per_thread; - - size_t res = 0lu; - jit::CombineHashCallArgs args; - - args.src_ptr = reinterpret_cast(src) + start; - args.dst_ptr = &intermediate[ithr * 2]; - args.work_amount = work_amount; - args.make_64_fold = 0lu; - kernel(&args); - }); - - - jit::CombineHashCallArgs args; - args.src_ptr = intermediate.data(); - args.dst_ptr = &res; - args.work_amount = ((size + el_per_thread - 1) / el_per_thread) * jit::Generator::xmm_len; - args.make_64_fold = 1lu; - kernel(&args); - } else { - jit::CombineHashCallArgs args; - args.src_ptr = src; - args.dst_ptr = &res; - args.work_amount = size; - args.make_64_fold = 1lu; - kernel(&args); - } - return res; - } -#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 - - constexpr auto cel_size = sizeof(size_t); - auto seed = static_cast(size); - const auto data = static_cast(src); - const auto d_end = std::next(data, size / cel_size); - // The constant value used as a magic number has been - // traditionally used e.g. in boost library's hash_combine. - // It happens to be derived from the golden ratio. - for (auto d = data; d != d_end; ++d) { - seed ^= *d + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - size_t last_bytes{0}; - std::memcpy(&last_bytes, d_end, size % cel_size); - seed ^= last_bytes + 0x9e3779b9 + (seed << 6) + (seed >> 2); - return seed; -} - -} // namespace runtime -} // namespace ov diff --git a/src/core/reference/src/op/utils/jit_generator.cpp b/src/core/reference/src/utils/jit_generator.cpp similarity index 94% rename from src/core/reference/src/op/utils/jit_generator.cpp rename to src/core/reference/src/utils/jit_generator.cpp index 174cbb9242acc4..39dc31c0033f9f 100644 --- a/src/core/reference/src/op/utils/jit_generator.cpp +++ b/src/core/reference/src/utils/jit_generator.cpp @@ -11,9 +11,10 @@ # endif # include -# include "openvino/reference/utils/jit_generator.hpp" +# include "openvino/core/except.hpp" # include "openvino/core/type/bfloat16.hpp" # include "openvino/core/type/float16.hpp" +# include "openvino/reference/utils/jit_generator.hpp" namespace ov { namespace reference { @@ -64,10 +65,18 @@ bool Generator::mayiuse(const cpu_isa_t cpu_isa) { bool Generator::is_x64() { return sizeof(void*) == 8; } -Generator::Generator(void* code_ptr, size_t code_size) +Generator::Generator(cpu_isa_t isa, void* code_ptr, size_t code_size) : Xbyak::CodeGenerator(code_size, code_ptr), size_of_abi_save_regs(num_abi_save_gpr_regs * rax.getBit() / 8 + xmm_to_preserve * xmm_len), - reg_EVEX_max_8b_offt(rbp) {} + reg_EVEX_max_8b_offt(rbp) { + if (isa == avx512_core) { + m_vlen = zmm_len; + } else if (isa == avx2) { + m_vlen = ymm_len; + } else { + OPENVINO_THROW("Unsupported isa: ", isa); + } +} void Generator::preamble() { if (xmm_to_preserve) { diff --git a/src/core/src/pass/serialize.cpp b/src/core/src/pass/serialize.cpp index c36b681d9e034d..c182b13594b74d 100644 --- a/src/core/src/pass/serialize.cpp +++ b/src/core/src/pass/serialize.cpp @@ -22,8 +22,8 @@ #include "openvino/opsets/opset1.hpp" #include "openvino/pass/constant_folding.hpp" #include "openvino/reference/convert.hpp" -#include "openvino/reference/utils/combine_hash.hpp" #include "openvino/runtime/aligned_buffer.hpp" +#include "openvino/runtime/compute_hash.hpp" #include "openvino/runtime/string_aligned_buffer.hpp" #include "openvino/util/file_util.hpp" #include "pugixml.hpp" @@ -76,9 +76,10 @@ class ConstantWriter { using HashValue = size_t; using ConstWritePositions = std::multimap>; - ConstantWriter(std::ostream& bin_data, bool enable_compression = true) + ConstantWriter(std::ostream& bin_data, bool enable_compression = true, bool write_hash_value = false) : m_binary_output(bin_data), m_enable_compression(enable_compression), + m_write_hash_value(write_hash_value), m_blob_offset(bin_data.tellp()) {} FilePosition write(const char* ptr, @@ -116,18 +117,24 @@ class ConstantWriter { // the same hash for {2, 2} and {0, 128} arrays. // But even strong hashing algorithms sometimes give collisions. // Therefore we always have to compare values when finding a match in the hash multimap. - const HashValue hash = ov::runtime::combine_hash(ptr_to_write, *new_size); + const HashValue hash = ov::runtime::compute_hash(ptr_to_write, *new_size); + auto found = m_hash_to_file_positions.find(hash); // iterate over all matches of the key in the multimap while (found != m_hash_to_file_positions.end()) { - if (memcmp(ptr, found->second.second, size) == 0) + if (memcmp(ptr, found->second.second, size) == 0) { return found->second.first; + } found++; } // Since fp16_compressed data will be disposed at exit point and since we cannot reread it from the ostream, // we store pointer to the original uncompressed blob. m_hash_to_file_positions.insert({hash, {offset, static_cast(ptr)}}); - m_binary_output.write(ptr_to_write, *new_size); + if (m_write_hash_value) { + m_binary_output.write(reinterpret_cast(&hash), sizeof(uint64_t)); + } else { + m_binary_output.write(ptr_to_write, *new_size); + } } return offset; } @@ -172,6 +179,7 @@ class ConstantWriter { ConstWritePositions m_hash_to_file_positions; std::ostream& m_binary_output; bool m_enable_compression; + bool m_write_hash_value; FilePosition m_blob_offset; // blob offset inside output stream }; @@ -1205,7 +1213,7 @@ void serializeFunc(std::ostream& xml_file, std::string name = "net"; pugi::xml_document xml_doc; pugi::xml_node net_node = xml_doc.append_child(name.c_str()); - ConstantWriter constant_write_handler(bin_file); + ConstantWriter constant_write_handler(bin_file, true, true); XmlSerializer visitor(net_node, name, constant_write_handler, version, deterministic); visitor.on_attribute(name, model); @@ -1377,10 +1385,19 @@ bool pass::StreamSerialize::run_on_model(const std::shared_ptr& model /// -------- Hash calculation pass ------------- namespace { -template -static uint64_t hash_combine(uint64_t seed, const T& a) { - // Hash combine formula from boost - return seed ^ (std::hash()(a) + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +// Hash combine formula from boost for uint64_t. +inline uint64_t hash_combine(uint64_t h, uint64_t k) { + constexpr uint64_t m = 0xc6a4a7935bd1e995; + constexpr int r = 47; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + + return h + 0xe6546b64; } class OstreamHashWrapper final : public std::streambuf { @@ -1392,19 +1409,23 @@ class OstreamHashWrapper final : public std::streambuf { } std::streamsize xsputn(const char* s, std::streamsize n) override { - // Reinterpret data as uint32_t and accumulate in uint64_t to avoid overflow fluctuations in parallel_sum. - auto* int_sum = reinterpret_cast(s); - const uint64_t n32 = n / sizeof(uint32_t); + uint64_t h = ov::runtime::compute_hash(s, n); + m_res = hash_combine(m_res, h); + + return n; + } +}; - m_res += parallel_sum(n32, uint64_t(0lu), [&](size_t k) -> uint32_t { - return int_sum[k]; - }); +class OstreamHashWrapperBin final : public std::streambuf { + uint64_t m_res = 0lu; - const uint64_t rest = n % sizeof(uint32_t); - for (uint64_t i = 0lu; i < rest; i++) { - m_res += s[n - rest + i]; - } +public: + uint64_t getResult() const { + return m_res; + } + std::streamsize xsputn(const char* s, std::streamsize n) override { + m_res = hash_combine(m_res, *reinterpret_cast(s)); return n; } }; @@ -1413,7 +1434,7 @@ class OstreamHashWrapper final : public std::streambuf { bool pass::Hash::run_on_model(const std::shared_ptr& model) { RUN_ON_MODEL_SCOPE(Hash); OstreamHashWrapper xmlHash; - OstreamHashWrapper binHash; + OstreamHashWrapperBin binHash; std::ostream xml(&xmlHash); std::ostream bin(&binHash); diff --git a/src/core/src/runtime/compute_hash.cpp b/src/core/src/runtime/compute_hash.cpp new file mode 100644 index 00000000000000..395873c86d90f9 --- /dev/null +++ b/src/core/src/runtime/compute_hash.cpp @@ -0,0 +1,922 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// The CRC computation is used for x86. +// The calculations were taken from the article +// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel (December, 2009)". + +#include "openvino/runtime/compute_hash.hpp" + +#include +#include + +#include "openvino/core/visibility.hpp" + +#if !defined(OS_CHROMEOS) && (defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)) +# define OV_CORE_USE_XBYAK_JIT +#endif + +#ifdef OV_CORE_USE_XBYAK_JIT +# include "openvino/core/parallel.hpp" +# include "openvino/reference/utils/registers_pool.hpp" +#endif // OV_CORE_USE_XBYAK_JIT + +namespace ov { +namespace runtime { + +#ifdef OV_CORE_USE_XBYAK_JIT + +using namespace ov::reference::jit; + +namespace jit { + +# define GET_OFF(field) offsetof(ComputeHashCallArgs, field) +# define getReg64() RegistersPool::Reg(m_registers_pool) +# define getVmm() RegistersPool::Reg(m_registers_pool) +# define getXmm() RegistersPool::Reg(m_registers_pool) + +enum KernelType { SINGLE_THREAD = 0, FIRST_THREAD, N_THREAD, FINAL_FOLD }; + +struct ComputeHashCompileParams { + KernelType type; +}; + +struct ComputeHashCallArgs { + const void* src_ptr = nullptr; + void* dst_ptr = nullptr; + const void* k_ptr = nullptr; + void* intermediate_ptr = nullptr; + uint64_t work_amount = 0lu; + uint64_t size = 0lu; + uint64_t threads_num = 1lu; +}; + +typedef void (*hash_kernel)(const ComputeHashCallArgs*); + +static const uint8_t SHUF_MASK[16] = {0b00001111, + 0b00001110, + 0b00001101, + 0b00001100, + 0b00001011, + 0b00001010, + 0b00001001, + 0b00001000, + 0b00000111, + 0b00000110, + 0b00000101, + 0b00000100, + 0b00000011, + 0b00000010, + 0b00000001, + 0b00000000}; + +constexpr uint64_t CRC_VAL = 0xffffffffffffffff; + +// POLYNOM(x) = 0x42F0E1EBA9EA3693 +constexpr uint64_t K_2 = 0x05f5c3c7eb52fab6; +constexpr uint64_t P_1 = 0x578d29d06cc4f872; +constexpr uint64_t P_2 = 0x42f0e1eba9ea3693; +static const uint64_t K_PULL[] = { + K_2, // x^(64*2) + 0x0000000000000000, // x^(64*1) mod P(x) + P_1, // floor(x^128/P(x))-x^64 + P_2, // P(x)-x^64 + K_2, // x^(64*2) + 0x4eb938a7d257740e, // x^(64*3) + 0x571bee0a227ef92b, // x^(64*4) + 0x44bef2a201b5200c, // x^(64*5) + 0x54819d8713758b2c, // x^(64*6) + 0x4a6b90073eb0af5a, // x^(64*7) + 0x5f6843ca540df020, // x^(64*8) + 0xddf4b6981205b83f, // x^(64*9) + 0x097c516e98bd2e73, // x^(64*10) + 0x0b76477b31e22e7b, // x^(64*11) + 0x9af04e1eff82d0dd, // x^(64*12) + 0x6e82e609297f8fe8, // x^(64*13) + 0xe464f4df5fb60ac1, // x^(64*14) + 0xb649c5b35a759cf2, // x^(64*15) + 0x05cf79dea9ac37d6, // x^(64*16) + 0x001067e571d7d5c2 // x^(64*17) +}; + +constexpr uint64_t K_1_0_OFF = 0lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_P_P_OFF = 1lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_2_3_OFF = 2lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_4_5_OFF = 3lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_6_7_OFF = 4lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_8_9_OFF = 5lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_10_11_OFF = 6lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_12_13_OFF = 7lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_14_15_OFF = 8lu * 2lu * sizeof(uint64_t); +constexpr uint64_t K_16_17_OFF = 9lu * 2lu * sizeof(uint64_t); + +class HashBase : public Generator { +protected: + void (*ker_fn)(const ComputeHashCallArgs*); + +public: + HashBase(cpu_isa_t isa) : Generator(isa) {} + + virtual void generate() = 0; + + void operator()(const ComputeHashCallArgs* args) { + ker_fn(args); + } + + virtual void create_kernel() { + generate(); + ker_fn = (decltype(ker_fn))getCode(); + OPENVINO_ASSERT(ker_fn, "[ CORE ] Could not generate kernel code."); + } +}; + +template +class ComputeHash : public HashBase { +public: + explicit ComputeHash(const ComputeHashCompileParams& jcp) : HashBase(isa), m_jcp(jcp) { + if (!mayiuse(cpu_isa_t::pclmulqdq)) { + OPENVINO_THROW( + "The current CPU does not support pclmulqdq instruction, which is required for the CRC algorithm."); + } + if (mayiuse(cpu_isa_t::vpclmulqdq)) { + is_vpclmulqdq = true; + } + } + + void generate() override { + m_registers_pool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0}); + + r64_src_ptr = getReg64(); + r64_dst_ptr = getReg64(); + r64_work_amount = getReg64(); + r64_k_ptr = getReg64(); + r64_aux = getReg64(); + v_k_2_3 = getVmm(); + v_shuf_mask = getVmm(); + auto v_dst = getVmm(); + + this->preamble(); + + initialize(v_dst); + bulk_fold(v_dst); + join(v_dst); + fold_to_128(v_dst); + fold_to_64(v_dst); + + this->postamble(); + m_registers_pool.reset(); + } + + static std::shared_ptr create(const ComputeHashCompileParams& params) { + auto kernel = std::make_shared(params); + OPENVINO_ASSERT(kernel, "[ CORE ] Could not create ComputeHash kernel."); + kernel->create_kernel(); + + return kernel; + } + +private: + using Vmm = typename std::conditional::type; + bool is_vpclmulqdq = false; + + ComputeHashCompileParams m_jcp; + RegistersPool::Ptr m_registers_pool; + + const Xbyak::Reg64 r64_params = abi_param1; + + RegistersPool::Reg r64_src_ptr; + RegistersPool::Reg r64_dst_ptr; + RegistersPool::Reg r64_work_amount; + RegistersPool::Reg r64_k_ptr; + RegistersPool::Reg r64_aux; + + // Vector registers + RegistersPool::Reg v_k_2_3; + RegistersPool::Reg v_shuf_mask; + + void initialize(const Vmm& v_dst); + + void bulk_fold(const Vmm& v_dst); + + void join(const Vmm& v_dst); + + void fold_to_128(const Vmm& v_dst); + + void fold_to_64(const Vmm& v_dst); + + void uni_vpxorq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src_0, const Xbyak::Xmm& v_src_1); + + void uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0); + + void uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0); + + void uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0); + + void partial_load(const Xbyak::Xmm& xmm_dst, const Xbyak::Address& src_addr, const Xbyak::Reg64& r64_load_num); + + void partial_load(const Xbyak::Ymm& ymm_dst, const Xbyak::Address& src_addr, const Xbyak::Reg64& r64_load_num); +}; + +template <> +void ComputeHash::uni_vpxorq(const Xbyak::Xmm& v_dst, + const Xbyak::Xmm& v_src_0, + const Xbyak::Xmm& v_src_1) { + vpxorq(v_dst, v_src_0, v_src_1); +} +template +void ComputeHash::uni_vpxorq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src_0, const Xbyak::Xmm& v_src_1) { + vpxor(v_dst, v_src_0, v_src_1); +} +template <> +void ComputeHash::uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0) { + vmovdqu64(v_dst, v_src_0); +} +template +void ComputeHash::uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0) { + vmovdqu(v_dst, v_src_0); +} +template <> +void ComputeHash::uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0) { + vmovdqu64(v_dst, v_src_0); +} +template +void ComputeHash::uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0) { + vmovdqu(v_dst, v_src_0); +} +template <> +void ComputeHash::uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0) { + vbroadcasti64x2(v_dst, v_src_0); +} +template +void ComputeHash::uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0) { + vbroadcasti128(v_dst, v_src_0); +} +template <> +void ComputeHash::partial_load(const Xbyak::Xmm& xmm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + Xbyak::Label l_mv_mask; + auto rOnes = getReg64(); + auto k_load_mask = RegistersPool::Reg(m_registers_pool); + + mov(rOnes, 0xFFFFFFFFFFFFFFFF); + cmp(r64_load_num, 0x3f); + jg(l_mv_mask); + + shlx(rOnes, rOnes, r64_load_num); + not_(rOnes); + + L(l_mv_mask); + kmovq(k_load_mask, rOnes); + + vmovdqu8(Vmm(xmm_dst.getIdx()) | k_load_mask | T_z, ptr[r64_src_ptr]); +} +template +void ComputeHash::partial_load(const Xbyak::Xmm& xmm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + Xbyak::Label l_partial, l_end; + + cmp(r64_load_num, xmm_len); + jl(l_partial, T_NEAR); + uni_vmovdqu64(xmm_dst, ptr[src_addr.getRegExp()]); + jmp(l_end, T_NEAR); + + L(l_partial); + { + uni_vpxorq(xmm_dst, xmm_dst, xmm_dst); + for (size_t j = 0lu; j < xmm_len - 1; j++) { + cmp(r64_load_num, j); + jle(l_end, T_NEAR); + pinsrb(xmm_dst, ptr[src_addr.getRegExp() + j], j); + } + } + + L(l_end); +} +template <> +void ComputeHash::partial_load(const Xbyak::Ymm& xmm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + partial_load(Xbyak::Xmm(xmm_dst.getIdx()), src_addr, r64_load_num); +} +template +void ComputeHash::partial_load(const Xbyak::Ymm& ymm_dst, + const Xbyak::Address& src_addr, + const Xbyak::Reg64& r64_load_num) { + Xbyak::Label l_xmm, l_partial, l_end; + auto xmm_dst = Xbyak::Xmm(ymm_dst.getIdx()); + + cmp(r64_load_num, ymm_len); + jl(l_xmm, T_NEAR); + uni_vmovdqu64(ymm_dst, ptr[src_addr.getRegExp()]); + jmp(l_end, T_NEAR); + + L(l_xmm); + uni_vpxorq(ymm_dst, ymm_dst, ymm_dst); + cmp(r64_load_num, xmm_len); + jl(l_partial, T_NEAR); + uni_vmovdqu64(xmm_dst, ptr[src_addr.getRegExp()]); + je(l_end, T_NEAR); + + { + Xbyak::Label l_rest_loop, l_perm; + + vperm2i128(ymm_dst, ymm_dst, ymm_dst, 0x1); + for (size_t j = 0lu; j < xmm_len - 1; j++) { + cmp(r64_load_num, xmm_len + j); + jle(l_perm, T_NEAR); + pinsrb(xmm_dst, ptr[src_addr.getRegExp() + xmm_len + j], j); + } + L(l_perm); + vperm2i128(ymm_dst, ymm_dst, ymm_dst, 0x1); + } + jmp(l_end, T_NEAR); + + L(l_partial); + { + for (size_t j = 0lu; j < xmm_len - 1; j++) { + cmp(r64_load_num, j); + jle(l_end, T_NEAR); + pinsrb(xmm_dst, ptr[src_addr.getRegExp() + j], j); + } + } + + L(l_end); +} + +template +void ComputeHash::initialize(const Vmm& v_dst) { + mov(r64_src_ptr, ptr[r64_params + GET_OFF(src_ptr)]); + mov(r64_dst_ptr, ptr[r64_params + GET_OFF(dst_ptr)]); + mov(r64_k_ptr, ptr[r64_params + GET_OFF(k_ptr)]); + mov(r64_work_amount, ptr[r64_params + GET_OFF(work_amount)]); + + uni_vbroadcasti64x2(v_k_2_3, ptr[r64_k_ptr + K_2_3_OFF]); + + mov(r64_aux, reinterpret_cast(SHUF_MASK)); + uni_vbroadcasti64x2(v_shuf_mask, ptr[r64_aux]); + + if (m_jcp.type == SINGLE_THREAD || m_jcp.type == FIRST_THREAD) { + auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_aux = getXmm(); + + // Initial CRC + mov(r64_aux, ptr[r64_params + GET_OFF(size)]); + vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x0); + mov(r64_aux, CRC_VAL); + vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x1); + + // First xor with source. + partial_load(v_dst, ptr[r64_src_ptr], r64_work_amount); + vpshufb(v_dst, v_dst, v_shuf_mask); + pxor(xmm_dst, xmm_aux); // The SSE version is used to avoid zeroing out the rest of the Vmm. + if (m_jcp.type == SINGLE_THREAD) { + add(r64_src_ptr, xmm_len); + } + } else if (m_jcp.type == N_THREAD) { + uni_vmovdqu64(v_dst, ptr[r64_src_ptr]); + vpshufb(v_dst, v_dst, v_shuf_mask); + } + if (m_jcp.type == SINGLE_THREAD || m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + sub(r64_work_amount, xmm_len); + } +} + +template <> +void ComputeHash::bulk_fold(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FIRST_THREAD && m_jcp.type != N_THREAD) { + return; + } + Xbyak::Label l_fold_loop, l_end; + cmp(r64_work_amount, 2 * get_vlen() - xmm_len); + jl(l_end, T_NEAR); + + auto v_src_0 = getVmm(); + auto v_dst_0 = getVmm(); + auto v_dst_1 = getVmm(); + auto v_dst_2 = getVmm(); + auto& v_dst_3 = v_dst; + auto v_k_loop = getVmm(); + auto v_aux_0 = getVmm(); + + auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); + auto xmm_src_1 = getXmm(); + auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); + auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); + auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx()); + auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx()); + auto xmm_k_loop = Xbyak::Xmm(v_k_loop.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); + + RegistersPool::Reg r64_bulk_step; + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + r64_bulk_step = getReg64(); + mov(r64_bulk_step, ptr[r64_params + GET_OFF(threads_num)]); + sal(r64_bulk_step, static_cast(std::log2(get_vlen()))); // * vlen + } + + if (m_jcp.type == SINGLE_THREAD) { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_8_9_OFF]); + } else { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_16_17_OFF]); + } + + uni_vmovdqu64(v_dst_0, v_dst); + + if (!is_vpclmulqdq) { + vextracti64x2(xmm_dst_1, v_dst_0, 0x1); + vextracti64x2(xmm_dst_2, v_dst_0, 0x2); + vextracti64x2(xmm_dst_3, v_dst_0, 0x3); + } + + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + add(r64_src_ptr, r64_bulk_step); + prefetcht2(ptr[r64_src_ptr + 16384]); + } else { + add(r64_src_ptr, get_vlen() - xmm_len); + prefetcht2(ptr[r64_src_ptr + 4096]); + } + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + sub(r64_work_amount, 2 * get_vlen() - xmm_len); + + L(l_fold_loop); + { + uni_vmovdqu64(v_src_0, ptr[r64_src_ptr]); + vpshufb(v_src_0, v_src_0, v_shuf_mask); + + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + add(r64_src_ptr, r64_bulk_step); + prefetcht2(ptr[r64_src_ptr + 16384]); + } else { + add(r64_src_ptr, get_vlen()); + prefetcht2(ptr[r64_src_ptr + 4096]); + } + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + if (is_vpclmulqdq) { + vpclmulqdq(v_aux_0, v_dst_0, v_k_loop, 0b00000000); + vpclmulqdq(v_dst_0, v_dst_0, v_k_loop, 0b00010001); + uni_vpxorq(v_aux_0, v_aux_0, v_src_0); + uni_vpxorq(v_dst_0, v_dst_0, v_aux_0); + } else { + // 0 + vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); + uni_vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); + + // 1 + vextracti64x2(xmm_src_1, v_src_0, 0x1); + vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); + + // 2 + vextracti64x2(xmm_src_1, v_src_0, 0x2); + vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_2, xmm_dst_2, xmm_aux_0); + + // 3 + vextracti64x2(xmm_src_1, v_src_0, 0x3); + vpclmulqdq(xmm_aux_0, xmm_dst_3, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_3, xmm_dst_3, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + } + + sub(r64_work_amount, get_vlen()); + jge(l_fold_loop, T_NEAR); + } + add(r64_work_amount, get_vlen()); + + if (m_jcp.type == SINGLE_THREAD) { + if (is_vpclmulqdq) { + vextracti64x2(xmm_dst_1, v_dst_0, 0x1); + vextracti64x2(xmm_dst_2, v_dst_0, 0x2); + vextracti64x2(xmm_dst_3, v_dst_0, 0x3); + } + + vpclmulqdq(xmm_aux_0, xmm_dst_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0); + + vpclmulqdq(xmm_aux_0, xmm_dst_1, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000); + vpclmulqdq(xmm_dst_1, xmm_dst_1, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_1); + + vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0); + uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2); + } else { + if (is_vpclmulqdq) { + uni_vmovdqu64(ptr[r64_dst_ptr], v_dst_0); + } else { + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 0lu], xmm_dst_0); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 1lu], xmm_dst_1); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 2lu], xmm_dst_2); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 3lu], xmm_dst_3); + } + } + + L(l_end); +} + +template +void ComputeHash::bulk_fold(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FIRST_THREAD && m_jcp.type != N_THREAD) { + return; + } + Xbyak::Label l_fold_loop, l_end; + cmp(r64_work_amount, 2 * get_vlen() - xmm_len); + jl(l_end, T_NEAR); + + auto v_src_0 = getVmm(); + auto v_dst_0 = getVmm(); + auto& v_dst_1 = v_dst; + auto v_aux_0 = getVmm(); + auto v_k_loop = getVmm(); + + auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx()); + auto xmm_src_1 = getXmm(); + auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx()); + auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx()); + auto xmm_k_loop = Xbyak::Xmm(v_k_loop.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx()); + + RegistersPool::Reg r64_bulk_step; + if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) { + r64_bulk_step = getReg64(); + mov(r64_bulk_step, ptr[r64_params + GET_OFF(threads_num)]); + sal(r64_bulk_step, static_cast(std::log2(get_vlen()))); // * vlen + } + + if (m_jcp.type == SINGLE_THREAD) { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_4_5_OFF]); + } else { + uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_8_9_OFF]); + } + + uni_vmovdqu64(v_dst_0, v_dst); + + if (!is_vpclmulqdq) { + vextracti128(xmm_dst_1, v_dst_0, 0x1); + } + + if (m_jcp.type == SINGLE_THREAD) { + add(r64_src_ptr, get_vlen() - xmm_len); + } else { + add(r64_src_ptr, r64_bulk_step); + } + prefetcht2(ptr[r64_src_ptr + 4096]); + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + sub(r64_work_amount, 2 * get_vlen() - xmm_len); + + L(l_fold_loop); + { + uni_vmovdqu64(v_src_0, ptr[r64_src_ptr]); + vpshufb(v_src_0, v_src_0, v_shuf_mask); + + if (m_jcp.type == SINGLE_THREAD) { + add(r64_src_ptr, get_vlen()); + } else { + add(r64_src_ptr, r64_bulk_step); + } + prefetcht2(ptr[r64_src_ptr + 4096]); + prefetcht1(ptr[r64_src_ptr + 1024]); + prefetcht0(ptr[r64_src_ptr + 64]); + + if (is_vpclmulqdq) { + vpclmulqdq(v_aux_0, v_dst_0, v_k_loop, 0b00000000); + vpclmulqdq(v_dst_0, v_dst_0, v_k_loop, 0b00010001); + uni_vpxorq(v_aux_0, v_aux_0, v_src_0); + uni_vpxorq(v_dst_0, v_dst_0, v_aux_0); + } else { + // 0 + vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0); + uni_vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0); + // 1 + vextracti128(xmm_src_1, v_src_0, 0x1); + vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_loop, 0b00000000); + vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_loop, 0b00010001); + uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); + } + + sub(r64_work_amount, get_vlen()); + jge(l_fold_loop, T_NEAR); + } + add(r64_work_amount, get_vlen()); + + if (m_jcp.type == SINGLE_THREAD) { + if (is_vpclmulqdq) { + vextracti128(xmm_dst_1, v_dst_0, 0x1); + } + vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0); + uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_dst_0); + } else { + if (is_vpclmulqdq) { + uni_vmovdqu64(ptr[r64_dst_ptr], v_dst_0); + } else { + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 0lu], xmm_dst_0); + uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 1lu], xmm_dst_1); + } + } + + L(l_end); +} + +template <> +void ComputeHash::join(const Vmm& v_dst) { + if (m_jcp.type != FINAL_FOLD) { + return; + } + + mov(r64_aux, ptr[r64_params + GET_OFF(intermediate_ptr)]); + prefetcht0(ptr[r64_aux + 1024]); + + auto xmm_src_0 = getXmm(); + auto xmm_src_last = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_aux_0 = getXmm(); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + + uni_vmovdqu64(xmm_src_last, ptr[r64_aux + xmm_len * 7]); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_14_15_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_14_15_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_12_13_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_12_13_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 2lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_10_11_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_10_11_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 3lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_8_9_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_8_9_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 4lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 5lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 6lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); +} + +template +void ComputeHash::join(const Vmm& v_dst) { + if (m_jcp.type != FINAL_FOLD) { + return; + } + + mov(r64_aux, ptr[r64_params + GET_OFF(intermediate_ptr)]); + prefetcht0(ptr[r64_aux + 1024]); + + auto xmm_src_0 = getXmm(); + auto xmm_src_last = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_aux_0 = getXmm(); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + + uni_vmovdqu64(xmm_src_last, ptr[r64_aux + xmm_len * 3]); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 0lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 1lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); + + uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 2lu]); + vpclmulqdq(xmm_aux_0, xmm_src_0, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_src_0, xmm_src_0, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0); + uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0); +} + +template +void ComputeHash::fold_to_128(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FINAL_FOLD) { + return; + } + Xbyak::Label l_fold_loop, l_end; + cmp(r64_work_amount, xmm_len); + jl(l_end, T_NEAR); + + auto xmm_src = getXmm(); + auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); + auto xmm_aux = getXmm(); + + L(l_fold_loop); + { + uni_vmovdqu64(xmm_src, ptr[r64_src_ptr]); + vpshufb(xmm_src, xmm_src, xmm_shuf_mask); + + vpclmulqdq(xmm_aux, xmm_dst, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst, xmm_dst, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + uni_vpxorq(xmm_dst, xmm_dst, xmm_src); + + add(r64_src_ptr, xmm_len); + sub(r64_work_amount, xmm_len); + cmp(r64_work_amount, xmm_len); + jge(l_fold_loop, T_NEAR); + } + + L(l_end); +} + +template +void ComputeHash::fold_to_64(const Vmm& v_dst) { + if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FINAL_FOLD) { + return; + } + Xbyak::Label l_fold_to_64; + cmp(r64_work_amount, 0); + jle(l_fold_to_64, T_NEAR); + + auto xmm_src = getXmm(); + auto xmm_dst = Xbyak::Xmm(v_dst.getIdx()); + auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx()); + auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx()); + auto xmm_aux = getXmm(); + auto xmm_aux_1 = getXmm(); + auto xmm_aux_2 = getXmm(); + + partial_load(xmm_src, ptr[r64_src_ptr], r64_work_amount); + vpshufb(xmm_src, xmm_src, xmm_shuf_mask); + + vpclmulqdq(xmm_aux, xmm_dst, xmm_k_2_3, 0b00000000); + vpclmulqdq(xmm_dst, xmm_dst, xmm_k_2_3, 0b00010001); + uni_vpxorq(xmm_aux, xmm_aux, xmm_src); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + + L(l_fold_to_64); + + mov(r64_aux, K_2); + vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x0); + vpclmulqdq(xmm_aux, xmm_dst, xmm_aux, 0b00000001); + vpslldq(xmm_dst, xmm_dst, 0x8); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + + mov(r64_aux, P_1); + vpinsrq(xmm_aux_2, xmm_aux_2, r64_aux, 0x0); + vpclmulqdq(xmm_aux, xmm_dst, xmm_aux_2, 0b00000001); + mov(r64_aux, 0x0); + vpinsrq(xmm_aux_1, xmm_dst, r64_aux, 0x0); + uni_vpxorq(xmm_aux, xmm_aux, xmm_aux_1); + vpinsrq(xmm_aux_1, xmm_aux, r64_aux, 0x0); + + mov(r64_aux, P_2); + vpinsrq(xmm_aux_2, xmm_aux_2, r64_aux, 0x1); + vpclmulqdq(xmm_aux, xmm_aux, xmm_aux_2, 0b00010001); + uni_vpxorq(xmm_aux, xmm_aux, xmm_aux_1); + uni_vpxorq(xmm_dst, xmm_dst, xmm_aux); + + vpextrq(ptr[r64_dst_ptr], xmm_dst, 0x0); +} + +} // namespace jit +#endif // OV_CORE_USE_XBYAK_JIT + +size_t compute_hash(const void* src, size_t size) { +#ifdef OV_CORE_USE_XBYAK_JIT + if (Generator::mayiuse(avx2)) { + uint64_t result = 0lu; + + // Parallel section + constexpr size_t min_wa_per_thread = 131072lu; // 2^17 + if (size >= min_wa_per_thread * 2lu) { + static auto first_thr_kernel = Generator::mayiuse(avx512_core) + ? jit::ComputeHash::create({jit::FIRST_THREAD}) + : jit::ComputeHash::create({jit::FIRST_THREAD}); + static auto n_thr_kernel = Generator::mayiuse(avx512_core) + ? jit::ComputeHash::create({jit::N_THREAD}) + : jit::ComputeHash::create({jit::N_THREAD}); + static auto final_fold_kernel = Generator::mayiuse(avx512_core) + ? jit::ComputeHash::create({jit::FINAL_FOLD}) + : jit::ComputeHash::create({jit::FINAL_FOLD}); + + static const size_t max_thr_num = 2lu; + size_t thr_num = std::min(size / min_wa_per_thread, max_thr_num); + const uint64_t el_per_thread = + first_thr_kernel->get_vlen() * ((size / thr_num) / first_thr_kernel->get_vlen()); + std::vector intermediate(thr_num * first_thr_kernel->get_vlen()); + + parallel_nt_static(thr_num, [&](const int ithr, const int nthr) { + uint64_t start = ithr * el_per_thread; + if (start >= size) { + return; + } + uint64_t work_amount = (el_per_thread + start > size) ? size - start : el_per_thread; + + jit::ComputeHashCallArgs args; + + args.src_ptr = reinterpret_cast(src) + first_thr_kernel->get_vlen() * ithr; + args.dst_ptr = &(intermediate[ithr * first_thr_kernel->get_vlen()]); + args.k_ptr = jit::K_PULL; + args.work_amount = work_amount; + args.size = size; + args.threads_num = thr_num; + + if (ithr == 0) { + (*first_thr_kernel)(&args); + } else { + (*n_thr_kernel)(&args); + } + }); + + jit::ComputeHashCallArgs args; + args.src_ptr = reinterpret_cast(src) + size - args.work_amount; + args.dst_ptr = &result; + args.k_ptr = jit::K_PULL; + args.work_amount = size - el_per_thread * thr_num; + args.size = size; + args.intermediate_ptr = intermediate.data(); + + (*final_fold_kernel)(&args); + } else { + static auto single_thr_kernel = Generator::mayiuse(avx512_core) + ? jit::ComputeHash::create({jit::SINGLE_THREAD}) + : jit::ComputeHash::create({jit::SINGLE_THREAD}); + + jit::ComputeHashCallArgs args; + args.src_ptr = src; + args.dst_ptr = &result; + args.k_ptr = jit::K_PULL; + args.work_amount = size; + args.size = size; + + (*single_thr_kernel)(&args); + } + + return result; + } + +#endif // OV_CORE_USE_XBYAK_JIT + + constexpr auto cel_size = sizeof(size_t); + size_t seed = size; + const auto data = static_cast(src); + const auto d_end = std::next(data, size / cel_size); + // The constant value used as a magic number has been + // traditionally used e.g. in boost library's hash_combine. + // It happens to be derived from the golden ratio. + for (auto d = data; d != d_end; ++d) { + seed ^= *d + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + size_t last_bytes{0}; + std::memcpy(&last_bytes, d_end, size % cel_size); + seed ^= last_bytes + 0x9e3779b9 + (seed << 6) + (seed >> 2); + + return seed; +} + +} // namespace runtime +} // namespace ov