diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index bc42ffca8a3cf6..eb95a04337a7f8 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -49,6 +49,9 @@ target_include_directories(openvino_core_dev INTERFACE
     $<BUILD_INTERFACE:${OpenVINO_SOURCE_DIR}/src/common/transformations/include>
     $<BUILD_INTERFACE:${OpenVINO_SOURCE_DIR}/src/common/low_precision_transformations/include>)
 
+target_include_directories(openvino_core_dev SYSTEM INTERFACE
+    $<BUILD_INTERFACE:$<$<TARGET_EXISTS:xbyak::xbyak>:$<TARGET_PROPERTY:xbyak::xbyak,INTERFACE_INCLUDE_DIRECTORIES>>>)
+
 target_link_libraries(openvino_core_dev INTERFACE openvino::itt openvino::util)
 
 set_target_properties(openvino_core_dev PROPERTIES EXPORT_NAME core::dev)
diff --git a/src/core/dev_api/openvino/runtime/compute_hash.hpp b/src/core/dev_api/openvino/runtime/compute_hash.hpp
new file mode 100644
index 00000000000000..47a90d589be4ee
--- /dev/null
+++ b/src/core/dev_api/openvino/runtime/compute_hash.hpp
@@ -0,0 +1,20 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstddef>
+
+namespace ov {
+namespace runtime {
+
+/**
+ * @brief Computes and returns the hash value of the input data
+ * @param src  A pointer to the input data
+ * @param size The length of the input data in bytes
+ */
+size_t compute_hash(const void* src, size_t size);
+
+}  // namespace runtime
+}  // namespace ov
diff --git a/src/core/reference/CMakeLists.txt b/src/core/reference/CMakeLists.txt
index f7874964233cf5..24c74ede3b5fba 100644
--- a/src/core/reference/CMakeLists.txt
+++ b/src/core/reference/CMakeLists.txt
@@ -39,8 +39,6 @@ ov_build_target_faster(${TARGET_NAME}
 
 ov_set_threading_interface_for(${TARGET_NAME})
 
-target_compile_definitions(${TARGET_NAME} PRIVATE XBYAK_NO_OP_NAMES XBYAK64)
-
 if(NOT BUILD_SHARED_LIBS)
     target_compile_definitions(${TARGET_NAME} PUBLIC OPENVINO_STATIC_LIBRARY)
 endif()
@@ -50,9 +48,6 @@ target_include_directories(${TARGET_NAME} PUBLIC
     $<BUILD_INTERFACE:${OV_CORE_DEV_API_PATH}>
     $<BUILD_INTERFACE:${OV_CORE_INCLUDE_PATH}>)
 
-target_include_directories(${TARGET_NAME} SYSTEM PRIVATE
-    $<BUILD_INTERFACE:$<$<TARGET_EXISTS:xbyak::xbyak>:$<TARGET_PROPERTY:xbyak::xbyak,INTERFACE_INCLUDE_DIRECTORIES>>>)
-
 find_package(Threads REQUIRED)
 target_link_libraries(${TARGET_NAME} PRIVATE Threads::Threads openvino::core::dev)
 
diff --git a/src/core/reference/include/openvino/reference/utils/combine_hash.hpp b/src/core/reference/include/openvino/reference/utils/combine_hash.hpp
deleted file mode 100644
index 9f1cfdea812494..00000000000000
--- a/src/core/reference/include/openvino/reference/utils/combine_hash.hpp
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (C) 2018-2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <cstddef>
-
-namespace ov {
-namespace runtime {
-
-size_t combine_hash(const void* src, size_t size);
-
-}   // namespace runtime
-}   // namespace ov
diff --git a/src/core/reference/include/openvino/reference/utils/jit_generator.hpp b/src/core/reference/include/openvino/reference/utils/jit_generator.hpp
index 49c5cb6e0e959e..a8f5def4197275 100644
--- a/src/core/reference/include/openvino/reference/utils/jit_generator.hpp
+++ b/src/core/reference/include/openvino/reference/utils/jit_generator.hpp
@@ -5,94 +5,104 @@
 #pragma once
 
 #if defined _WIN32 && !defined NOMINMAX
-#define NOMINMAX
+#    define NOMINMAX
 #endif
 
-#include <functional>
+// xbyak is configured through macros that must be visible before <xbyak/xbyak.h> is included.
+// Guard each definition so a value already supplied by the build system (e.g. -DXBYAK64) is kept
+// and this header does not trigger a macro-redefinition diagnostic in such translation units.
+#ifndef XBYAK64
+#    define XBYAK64
+#endif
+#ifndef XBYAK_NO_OP_NAMES
+#    define XBYAK_NO_OP_NAMES
+#endif
 #include <xbyak/xbyak.h>
 
+#include <functional>
+
 namespace ov {
 namespace reference {
 namespace jit {
-#ifdef XBYAK64
-    static const Xbyak::Operand::Code abi_save_gpr_regs[] = {
-        Xbyak::Operand::RBX,
-        Xbyak::Operand::RBP,
-        Xbyak::Operand::R12,
-        Xbyak::Operand::R13,
-        Xbyak::Operand::R14,
-        Xbyak::Operand::R15,
+static const Xbyak::Operand::Code abi_save_gpr_regs[] = {
+    Xbyak::Operand::RBX,
+    Xbyak::Operand::RBP,
+    Xbyak::Operand::R12,
+    Xbyak::Operand::R13,
+    Xbyak::Operand::R14,
+    Xbyak::Operand::R15,
 #ifdef _WIN32
-        Xbyak::Operand::RDI,
-        Xbyak::Operand::RSI,
+    Xbyak::Operand::RDI,
+    Xbyak::Operand::RSI,
 #endif
-    };
+};
 
 #ifdef _WIN32
-#define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX) // RCX
+#    define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX)  // RCX
 #else
-#define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI) // RDI
+#    define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI)  // RDI
 #endif
-#endif  // XBYAK64
-
-    typedef enum {
-        isa_any,
-        sse42,
-        avx,
-        avx2,
-        avx512_common,
-        avx512_core,
-        avx512_core_vnni,
-        avx512_mic,
-        avx512_mic_4ops,
-        avx512_core_bf16,
-        avx512_vpopcnt,
-        fp16,
-        pclmulqdq,
-        vpclmulqdq
-    } cpu_isa_t;
-
-    class Generator : public Xbyak::CodeGenerator
-    {
+
+typedef enum {
+    isa_any,
+    sse42,
+    avx,
+    avx2,
+    avx512_common,
+    avx512_core,
+    avx512_core_vnni,
+    avx512_mic,
+    avx512_mic_4ops,
+    avx512_core_bf16,
+    avx512_vpopcnt,
+    fp16,
+    pclmulqdq,
+    vpclmulqdq
+} cpu_isa_t;
+
+class Generator : public Xbyak::CodeGenerator {
 #ifdef _WIN32
-        static constexpr size_t xmm_to_preserve_start = 6;
-        static constexpr size_t xmm_to_preserve = 10;
+    static constexpr size_t xmm_to_preserve_start = 6llu;
+    static constexpr size_t xmm_to_preserve = 10llu;
 #else
-        static constexpr size_t xmm_to_preserve_start = 0;
-        static constexpr size_t xmm_to_preserve = 0;
+    static constexpr size_t xmm_to_preserve_start = 0lu;
+    static constexpr size_t xmm_to_preserve = 0lu;
 #endif
 
-        static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]);
-        const size_t size_of_abi_save_regs;
+    static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]);
+    const size_t size_of_abi_save_regs;
+
+    const Xbyak::Reg64 reg_EVEX_max_8b_offt;
+    static constexpr int EVEX_max_8b_offt = 0x200;
+    size_t m_vlen = ymm_len;  // vector length reported by get_vlen(); defaults to ymm_len
 
-        const Xbyak::Reg64 reg_EVEX_max_8b_offt;
-        static constexpr int EVEX_max_8b_offt = 0x200;
+public:
+    static constexpr size_t xmm_len = 16lu;  // XMM register width in bytes
+    static constexpr size_t ymm_len = 32lu;  // YMM register width in bytes
+    static constexpr size_t zmm_len = 64lu;  // ZMM register width in bytes
 
-    public:
-        static constexpr size_t xmm_len = 16;
-        static constexpr size_t ymm_len = 32;
-        static constexpr size_t zmm_len = 64;
+    const Xbyak::Reg64 param = abi_param1;
 
-        const Xbyak::Reg64 param = abi_param1;
+    static bool mayiuse(const cpu_isa_t cpu_isa);
+    static bool is_x64();
 
-        static bool mayiuse(const cpu_isa_t cpu_isa);
-        static bool is_x64();
+    Generator(cpu_isa_t isa = avx2, void* code_ptr = nullptr, size_t code_size = 16lu * 1024lu);
+    void preamble();
+    void postamble();
 
-        Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024);
-        void preamble();
-        void postamble();
+    void foreach (const Xbyak::Reg64& idx,
+                  size_t step,
+                  const Xbyak::Reg64& end,
+                  std::function<void(const Xbyak::Reg64&)> && fn);
 
-        void foreach (const Xbyak::Reg64& idx,
-                        size_t step,
-                        const Xbyak::Reg64& end,
-                        std::function<void(const Xbyak::Reg64&)> && fn);
+    template <typename T>
+    void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size);
 
-        template <typename T>
-        void copy(const Xbyak::Reg64& dst,
-                    const Xbyak::Reg64& src,
-                    const Xbyak::Reg64& size);
-    };
+    size_t get_vlen() const {  // read-only accessor for m_vlen; const so it works on const Generator objects
+        return m_vlen;
+    }
+};
 
 }  // namespace jit
-}  // namespace reference 
+}  // namespace reference
 }  // namespace ov
diff --git a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp
index 59ddd11596b980..1ac29ba7c6ab83 100644
--- a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp
+++ b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp
@@ -4,15 +4,14 @@
 
 #pragma once
 
-#include "jit_generator.hpp"
-#include "openvino/core/except.hpp"
-
 #include <memory>
 #include <utility>
 #include <vector>
 
+#include "openvino/core/except.hpp"
+#include "openvino/reference/utils/jit_generator.hpp"
 namespace ov {
-namespace runtime {
+namespace reference {
 namespace jit {
 
 class RegistersPool {
@@ -21,25 +20,44 @@ class RegistersPool {
     using WeakPtr = std::weak_ptr<RegistersPool>;
     static constexpr int anyIdx = -1;
 
-    template<typename TReg>
+    template <typename TReg>
     class Reg {
         friend class RegistersPool;
+
     public:
         Reg() {}
-        Reg(const RegistersPool::Ptr& regPool) { initialize(regPool); }
-        Reg(const RegistersPool::Ptr& regPool, int requestedIdx) { initialize(regPool, requestedIdx); }
-        ~Reg() { release(); }
-        Reg& operator=(Reg&& other)  noexcept {
+        Reg(const RegistersPool::Ptr& regPool) {
+            initialize(regPool);
+        }
+        Reg(const RegistersPool::Ptr& regPool, int requestedIdx) {
+            initialize(regPool, requestedIdx);
+        }
+        ~Reg() {
+            release();
+        }
+        Reg& operator=(Reg&& other) noexcept {
             release();
             reg = other.reg;
             regPool = std::move(other.regPool);
             return *this;
         }
-        Reg(Reg&& other)  noexcept : reg(other.reg), regPool(std::move(other.regPool)) {}
-        operator TReg&() { ensureValid(); return reg; }
-        operator const TReg&() const { ensureValid(); return reg; }
-        operator Xbyak::RegExp() const { ensureValid(); return reg; }
-        int getIdx() const { ensureValid(); return reg.getIdx(); }
+        Reg(Reg&& other) noexcept : reg(other.reg), regPool(std::move(other.regPool)) {}
+        operator TReg&() {
+            ensureValid();
+            return reg;
+        }
+        operator const TReg&() const {
+            ensureValid();
+            return reg;
+        }
+        operator Xbyak::RegExp() const {
+            ensureValid();
+            return reg;
+        }
+        int getIdx() const {
+            ensureValid();
+            return reg.getIdx();
+        }
         friend Xbyak::RegExp operator+(const Reg& lhs, const Xbyak::RegExp& rhs) {
             lhs.ensureValid();
             return lhs.operator Xbyak::RegExp() + rhs;
@@ -50,7 +68,9 @@ class RegistersPool {
                 regPool.reset();
             }
         }
-        bool isInitialized() const { return !regPool.expired(); }
+        bool isInitialized() const {
+            return !regPool.expired();
+        }
 
     private:
         void ensureValid() const {
@@ -74,12 +94,12 @@ class RegistersPool {
         checkUniqueAndUpdate(false);
     }
 
-    template <ov::runtime::jit::cpu_isa_t isa>
+    template <ov::reference::jit::cpu_isa_t isa>
     static Ptr create(std::initializer_list<Xbyak::Reg> regsToExclude);
 
     static Ptr create(cpu_isa_t isa, std::initializer_list<Xbyak::Reg> regsToExclude);
 
-    template<typename TReg>
+    template <typename TReg>
     size_t countFree() const {
         if (std::is_base_of<Xbyak::Mmx, TReg>::value) {
             return simdSet.countUnused();
@@ -158,12 +178,17 @@ class RegistersPool {
         std::vector<bool> isFreeIndexVector;
     };
 
-    virtual int getFreeOpmask(int requestedIdx) { OPENVINO_THROW("getFreeOpmask: The Opmask is not supported in current instruction set"); }
-    virtual void returnOpmaskToPool(int idx) { OPENVINO_THROW("returnOpmaskToPool: The Opmask is not supported in current instruction set"); }
-    virtual size_t countUnusedOpmask() const { OPENVINO_THROW("countUnusedOpmask: The Opmask is not supported in current instruction set"); }
+    virtual int getFreeOpmask(int requestedIdx) {
+        OPENVINO_THROW("getFreeOpmask: The Opmask is not supported in current instruction set");
+    }
+    virtual void returnOpmaskToPool(int idx) {
+        OPENVINO_THROW("returnOpmaskToPool: The Opmask is not supported in current instruction set");
+    }
+    virtual size_t countUnusedOpmask() const {
+        OPENVINO_THROW("countUnusedOpmask: The Opmask is not supported in current instruction set");
+    }
 
-    RegistersPool(int simdRegistersNumber)
-            : simdSet(simdRegistersNumber) {
+    RegistersPool(int simdRegistersNumber) : simdSet(simdRegistersNumber) {
         checkUniqueAndUpdate();
         generalSet.exclude(Xbyak::Reg64(Xbyak::Operand::RSP));
         generalSet.exclude(Xbyak::Reg64(Xbyak::Operand::RAX));
@@ -173,7 +198,7 @@ class RegistersPool {
     }
 
     RegistersPool(std::initializer_list<Xbyak::Reg> regsToExclude, int simdRegistersNumber)
-            : simdSet(simdRegistersNumber) {
+        : simdSet(simdRegistersNumber) {
         checkUniqueAndUpdate();
         for (auto& reg : regsToExclude) {
             if (reg.isXMM() || reg.isYMM() || reg.isZMM()) {
@@ -186,7 +211,7 @@ class RegistersPool {
     }
 
 private:
-    template<typename TReg>
+    template <typename TReg>
     int getFree(int requestedIdx) {
         if (std::is_base_of<Xbyak::Mmx, TReg>::value) {
             auto idx = simdSet.getUnused(requestedIdx);
@@ -202,7 +227,7 @@ class RegistersPool {
         }
     }
 
-    template<typename TReg>
+    template <typename TReg>
     void returnToPool(const TReg& reg) {
         if (std::is_base_of<Xbyak::Mmx, TReg>::value) {
             simdSet.setAsUnused(reg.getIdx());
@@ -226,7 +251,7 @@ class RegistersPool {
         }
     }
 
-    PhysicalSet generalSet {16};
+    PhysicalSet generalSet{16};
     PhysicalSet simdSet;
 };
 
@@ -240,11 +265,11 @@ template <>
 class IsaRegistersPool<avx512_core> : public RegistersPool {
 public:
     IsaRegistersPool() : RegistersPool(32) {
-        opmaskSet.exclude(Xbyak::Opmask(0)); // the Opmask(0) has special meaning for some instructions, like gather instruction
+        // Opmask(0) has a special meaning for some instructions (e.g. gather), so it is excluded from the set.
+        opmaskSet.exclude(Xbyak::Opmask(0));
     }
 
-    IsaRegistersPool(std::initializer_list<Xbyak::Reg> regsToExclude)
-        : RegistersPool(regsToExclude, 32) {
+    IsaRegistersPool(std::initializer_list<Xbyak::Reg> regsToExclude) : RegistersPool(regsToExclude, 32) {
         for (auto& reg : regsToExclude) {
             if (reg.isOPMASK()) {
                 opmaskSet.exclude(reg);
@@ -267,7 +292,7 @@ class IsaRegistersPool<avx512_core> : public RegistersPool {
     }
 
 protected:
-    PhysicalSet opmaskSet {8};
+    PhysicalSet opmaskSet{8};
 };
 
 template <>
@@ -289,9 +314,10 @@ RegistersPool::Ptr RegistersPool::create(std::initializer_list<Xbyak::Reg> regsT
     return std::make_shared<IsaRegistersPool<isa>>(regsToExclude);
 }
 
-inline
-RegistersPool::Ptr RegistersPool::create(cpu_isa_t isa, std::initializer_list<Xbyak::Reg> regsToExclude) {
-#define ISA_SWITCH_CASE(isa) case isa: return std::make_shared<IsaRegistersPool<isa>>(regsToExclude);
+inline RegistersPool::Ptr RegistersPool::create(cpu_isa_t isa, std::initializer_list<Xbyak::Reg> regsToExclude) {
+#define ISA_SWITCH_CASE(isa) \
+    case isa:                \
+        return std::make_shared<IsaRegistersPool<isa>>(regsToExclude);
     switch (isa) {
         ISA_SWITCH_CASE(sse42)
         ISA_SWITCH_CASE(avx)
@@ -299,14 +325,14 @@ RegistersPool::Ptr RegistersPool::create(cpu_isa_t isa, std::initializer_list<Xb
         ISA_SWITCH_CASE(avx512_core)
         ISA_SWITCH_CASE(avx512_core_vnni)
         ISA_SWITCH_CASE(avx512_core_bf16)
-        case avx512_vpopcnt: return std::make_shared<IsaRegistersPool<avx512_core>>(regsToExclude);
-        default:
-            OPENVINO_THROW("Invalid isa argument in RegistersPool::create(): ", isa);
-        }
-    OPENVINO_THROW("Invalid isa argument in RegistersPool::create()");
+    case avx512_vpopcnt:
+        return std::make_shared<IsaRegistersPool<avx512_core>>(regsToExclude);
+    default:
+        OPENVINO_THROW("Invalid isa argument in RegistersPool::create(): ", isa);
+    }
 #undef ISA_SWITCH_CASE
 }
 
-} // namespace jit
-} // namespace runtime
-} // namespace ov
+}  // namespace jit
+}  // namespace reference
+}  // namespace ov
diff --git a/src/core/reference/src/op/jit_generator.hpp b/src/core/reference/src/op/jit_generator.hpp
deleted file mode 100644
index b4b9cd7a60c23f..00000000000000
--- a/src/core/reference/src/op/jit_generator.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright (C) 2018-2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#if defined _WIN32 && !defined NOMINMAX
-#    define NOMINMAX
-#endif
-
-#include <xbyak/xbyak.h>
-
-#include <functional>
-
-namespace ov {
-namespace reference {
-namespace jit {
-#ifdef XBYAK64
-static const Xbyak::Operand::Code abi_save_gpr_regs[] = {
-    Xbyak::Operand::RBX,
-    Xbyak::Operand::RBP,
-    Xbyak::Operand::R12,
-    Xbyak::Operand::R13,
-    Xbyak::Operand::R14,
-    Xbyak::Operand::R15,
-#    ifdef _WIN32
-    Xbyak::Operand::RDI,
-    Xbyak::Operand::RSI,
-#    endif
-};
-
-#    ifdef _WIN32
-#        define abi_param1 Xbyak::Reg64(Xbyak::Operand::RCX)  // RCX
-#    else
-#        define abi_param1 Xbyak::Reg64(Xbyak::Operand::RDI)  // RDI
-#    endif
-#endif  // XBYAK64
-
-class Generator : public Xbyak::CodeGenerator {
-    static constexpr size_t xmm_len = 16;
-
-#ifdef _WIN32
-    static constexpr size_t xmm_to_preserve_start = 6;
-    static constexpr size_t xmm_to_preserve = 10;
-#else
-    static constexpr size_t xmm_to_preserve_start = 0;
-    static constexpr size_t xmm_to_preserve = 0;
-#endif
-
-    static const size_t num_abi_save_gpr_regs = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]);
-    const size_t size_of_abi_save_regs;
-
-    const Xbyak::Reg64 reg_EVEX_max_8b_offt;
-    static constexpr int EVEX_max_8b_offt = 0x200;
-
-public:
-    const Xbyak::Reg64 param = abi_param1;
-
-    typedef enum {
-        isa_any,
-        sse42,
-        avx,
-        avx2,
-        avx512_common,
-        avx512_core,
-        avx512_core_vnni,
-        avx512_mic,
-        avx512_mic_4ops,
-        avx512_core_bf16,
-        avx512_vpopcnt,
-        fp16
-    } cpu_isa_t;
-
-    static bool mayiuse(const cpu_isa_t cpu_isa);
-    static bool is_x64();
-
-    Generator(void* code_ptr = nullptr, size_t code_size = 16 * 1024);
-    void preamble();
-    void postamble();
-
-    void foreach (const Xbyak::Reg64& idx,
-                  size_t step,
-                  const Xbyak::Reg64& end,
-                  std::function<void(const Xbyak::Reg64&)> && fn);
-
-    template <typename T>
-    void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size);
-};
-}  // namespace jit
-}  // namespace reference
-}  // namespace ov
diff --git a/src/core/reference/src/op/utils/combine_hash.cpp b/src/core/reference/src/op/utils/combine_hash.cpp
deleted file mode 100644
index 1835155becf711..00000000000000
--- a/src/core/reference/src/op/utils/combine_hash.cpp
+++ /dev/null
@@ -1,666 +0,0 @@
-// Copyright (C) 2018-2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-// The CRC computation is used for x86.
-// The calculations were taken from the article
-// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel (December, 2009)".
-
-#include "openvino/core/visibility.hpp"
-#include "openvino/core/parallel.hpp"
-#include "openvino/reference/utils/combine_hash.hpp"
-
-#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
-#    include "openvino/reference/utils/registers_pool.hpp"
-#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
-
-#include <cstring>
-
-namespace ov {
-namespace runtime {
-
-#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
-namespace jit {
-
-#define GET_OFF(field) offsetof(CombineHashCallArgs, field)
-#define getReg64() RegistersPool::Reg<Xbyak::Reg64>(registersPool)
-#define getVmm()   RegistersPool::Reg<Vmm>(registersPool)
-#define getXmm()   RegistersPool::Reg<Xbyak::Xmm>(registersPool)
-
-struct CombineHashCompileParams {
-};
-
-struct CombineHashCallArgs {
-    const void* src_ptr;
-    void* dst_ptr;
-    uint64_t work_amount = 0lu;
-    uint64_t make_64_fold = 0lu;
-};
-
-typedef void (*fn_t)(const CombineHashCallArgs*);
-
-template <cpu_isa_t isa>
-class CombineHash : public Generator {
-public:
-    explicit CombineHash(const CombineHashCompileParams& jcp) :
-            m_jcp(jcp) {
-        if (isa == avx512_core) {
-            vlen = zmm_len;
-        } else if (isa == avx2) {
-            vlen = ymm_len;
-        } else {
-            OPENVINO_THROW("Unsupported isa: ", isa);
-        }
-        if (!mayiuse(cpu_isa_t::pclmulqdq)) {
-            OPENVINO_THROW("The current CPU does not support pclmulqdq instruction, which is required for the CRC algorithm.");
-        }
-        if (mayiuse(cpu_isa_t::vpclmulqdq)) {
-            is_vpclmulqdq = true;
-        }
-
-        generate();
-    }
-
-    void generate() {
-        this->preamble();
-        registersPool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0});
-
-        r64_src = getReg64();
-        r64_dst = getReg64();
-        r64_work_amount  = getReg64();
-        r64_make_64_fold = getReg64();
-
-        mov(r64_src, ptr[r64_params + GET_OFF(src_ptr)]);
-        mov(r64_dst, ptr[r64_params + GET_OFF(dst_ptr)]);
-        mov(r64_work_amount, ptr[r64_params + GET_OFF(work_amount)]);
-        mov(r64_make_64_fold, ptr[r64_params + GET_OFF(make_64_fold)]);
-
-        initVectors();
-        bulkFold(v_dst);
-        restFold(v_dst);
-        tailFold(v_dst);
-
-        registersPool.reset();
-        this->postamble();
-    }
-
-    static fn_t get() {
-        static const CombineHashCompileParams params;
-        static CombineHash<isa> kernel(params);
-
-        return (fn_t)kernel.getCode();
-    }
-
-    void fillRestWorkMask(const Xbyak::Opmask& k_dst_mask,
-                          const Xbyak::Reg64& r64_work_rest) {
-        Xbyak::Label l_mv_mask;
-        auto rOnes = getReg64();
-
-        mov(rOnes, 0xFFFFFFFFFFFFFFFF);
-        cmp(r64_work_rest, 0x3f);
-        jg(l_mv_mask);
-
-        shlx(rOnes, rOnes, r64_work_rest);
-        not_(rOnes);
-
-        L(l_mv_mask);
-        kmovq(k_dst_mask, rOnes);
-    }
-
-    void partialLoad(const Xbyak::Xmm&     xmm_dst,
-                     const Xbyak::Address& src_addr,
-                     const Xbyak::Reg64&   r64_load_num) {
-        Xbyak::Label l_partial, l_end;
-
-        cmp(r64_load_num, xmm_len);
-        jl(l_partial, T_NEAR);
-        vmovdqu(xmm_dst, ptr[src_addr.getRegExp()]);
-        jmp(l_end, T_NEAR);
-
-        L(l_partial); {
-            size_t offset = xmm_len;
-
-            for (size_t j = 0lu; j < xmm_len - 1; j++) {
-                pinsrb(xmm_dst, ptr[src_addr.getRegExp() + offset], j);
-                cmp(r64_load_num, ++offset);
-                jle(l_end, T_NEAR);
-            }
-        }
-
-        L(l_end);
-    }
-
-    void partialLoad(const Xbyak::Ymm&     ymm_dst,
-                     const Xbyak::Address& src_addr,
-                     const Xbyak::Reg64&   r64_load_num) {
-        Xbyak::Label l_xmm, l_partial, l_end;
-        auto xmm_dst = Xbyak::Xmm(ymm_dst.getIdx());
-
-        cmp(r64_load_num, ymm_len);
-        jl(l_xmm, T_NEAR);
-        vmovdqu(ymm_dst, ptr[src_addr.getRegExp()]);
-        jmp(l_end, T_NEAR);
-
-        L(l_xmm);
-        vpxorq(ymm_dst, ymm_dst, ymm_dst);
-        cmp(r64_load_num, xmm_len);
-        jl(l_partial, T_NEAR);
-        vmovdqu(xmm_dst, ptr[src_addr.getRegExp()]);
-        je(l_end, T_NEAR);
-
-        {
-            Xbyak::Label l_rest_loop, l_perm;
-            size_t offset = xmm_len;
-
-            vperm2f128(ymm_dst, ymm_dst, ymm_dst, 0x1);
-            for (size_t j = 0lu; j < xmm_len - 1; j++) {
-                pinsrb(xmm_dst, ptr[src_addr.getRegExp() + offset], j);
-                cmp(r64_load_num, ++offset);
-                jle(l_perm, T_NEAR);
-            }
-            L(l_perm);
-            vperm2f128(ymm_dst, ymm_dst, ymm_dst, 0x1);
-        }
-        jmp(l_end, T_NEAR);
-
-        L(l_partial); {
-            size_t offset = xmm_len;
-
-            for (size_t j = 0lu; j < xmm_len - 1; j++) {
-                pinsrb(xmm_dst, ptr[src_addr.getRegExp() + offset], j);
-                cmp(r64_load_num, ++offset);
-                jle(l_end, T_NEAR);
-            }
-        }
-
-        L(l_end);
-    }
-
-private:
-    static constexpr uint64_t CHUNK_SIZE = 32;
-    static const uint64_t CRC_VAL;
-    static const uint64_t CONST_K[12];
-    static const uint8_t SHUF_MASK[16];
-
-    using Vmm = typename std::conditional<isa == avx512_core, Xbyak::Zmm, Xbyak::Ymm>::type;
-    size_t vlen = xmm_len;
-    bool is_vpclmulqdq = false;
-
-    CombineHashCompileParams m_jcp;
-    RegistersPool::Ptr registersPool;
-
-    RegistersPool::Reg<Xbyak::Reg64> r64_src;
-    RegistersPool::Reg<Xbyak::Reg64> r64_dst;
-    RegistersPool::Reg<Xbyak::Reg64> r64_work_amount;
-    RegistersPool::Reg<Xbyak::Reg64> r64_make_64_fold;
-
-    const Xbyak::Reg64 r64_params = abi_param1;
-
-    // Vector registers
-    RegistersPool::Reg<Vmm> v_dst;
-    RegistersPool::Reg<Vmm> v_k_1_2;
-    RegistersPool::Reg<Vmm> v_k_4_5;
-    RegistersPool::Reg<Vmm> v_k_8_9;
-    RegistersPool::Reg<Vmm> v_k_16_17;
-    RegistersPool::Reg<Vmm> v_shuf_mask;
-
-    size_t getVlen() {
-        return vlen;
-    }
-
-    void initVectors();
-
-    void bulkFold(const Vmm& v_dst);
-
-    void restFold(const Vmm& v_dst) {
-        Xbyak::Label l_fold_loop, l_end;
-        cmp(r64_work_amount, xmm_len);
-        jl(l_end, T_NEAR);
-
-        auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx());
-        auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx());
-        auto xmm_src = getXmm();
-        auto xmm_dst = Xbyak::Xmm(v_dst.getIdx());
-        auto xmm_aux = getXmm();
-
-        L(l_fold_loop); {
-            vmovdqu64(xmm_src, ptr[r64_src]);
-            vpshufb(xmm_src, xmm_src, xmm_shuf_mask);
-
-            vpclmulqdq(xmm_aux, xmm_dst, xmm_k_1_2, 0b00000000);
-            vpclmulqdq(xmm_dst, xmm_dst, xmm_k_1_2, 0b00010001);
-            vpxorq(xmm_dst, xmm_dst, xmm_aux);
-            vpxorq(xmm_dst, xmm_dst, xmm_src);
-
-            add(r64_src, xmm_len);
-            sub(r64_work_amount, xmm_len);
-            cmp(r64_work_amount, xmm_len);
-            jge(l_fold_loop, T_NEAR);
-        }
-
-        L(l_end);
-    }
-
-    void tailFold(const Vmm& v_dst);
-};
-
-template <>
-void CombineHash<avx512_core>::initVectors() {
-    auto r64_aux = getReg64();
-
-    v_k_1_2 = getVmm();
-    mov(r64_aux, reinterpret_cast<uintptr_t>(CONST_K));
-    vbroadcasti64x2(v_k_1_2, ptr[r64_aux]);
-    v_k_8_9 = getVmm();
-    mov(r64_aux, reinterpret_cast<uintptr_t>(CONST_K + 6));
-    vbroadcasti64x2(v_k_8_9, ptr[r64_aux]);
-
-    v_shuf_mask = getVmm();
-    mov(r64_aux, reinterpret_cast<uintptr_t>(SHUF_MASK));
-    vbroadcasti64x2(v_shuf_mask, ptr[r64_aux]);
-
-    v_dst = getVmm();
-    auto xmm_dst = Xbyak::Xmm(v_dst.getIdx());
-    auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx());
-    auto xmm_aux = getXmm();
-    auto k_rest_mask = RegistersPool::Reg<Xbyak::Opmask>(registersPool);
-    // Initial CRC
-    mov(r64_aux, CRC_VAL);
-    vpxorq(v_dst, v_dst, v_dst);
-    vpinsrq(xmm_dst, xmm_dst, r64_work_amount, 0x0);
-    vpinsrq(xmm_dst, xmm_dst, r64_aux, 0x1);
-    // First xor with source
-    fillRestWorkMask(k_rest_mask, r64_work_amount);
-    vmovdqu8(Xbyak::Xmm(xmm_aux.getIdx()) | k_rest_mask | T_z, ptr[r64_src]);
-    vpshufb(xmm_aux, xmm_aux, xmm_shuf_mask);
-    vpxorq(xmm_dst, xmm_dst, xmm_aux);
-    sub(r64_work_amount, xmm_len);
-    add(r64_src, xmm_len);
-}
-
-template <cpu_isa_t isa>
-void CombineHash<isa>::initVectors() {
-    auto r64_aux = getReg64();
-
-    v_k_1_2 = getVmm();
-    mov(r64_aux, reinterpret_cast<uintptr_t>(CONST_K));
-    vbroadcasti128(v_k_1_2, ptr[r64_aux]);
-    v_k_8_9 = getVmm();
-    mov(r64_aux, reinterpret_cast<uintptr_t>(CONST_K + 6));
-    vbroadcasti128(v_k_8_9, ptr[r64_aux]);
-
-    v_shuf_mask = getVmm();
-    mov(r64_aux, reinterpret_cast<uintptr_t>(SHUF_MASK));
-    vbroadcasti128(v_shuf_mask, ptr[r64_aux]);
-
-    v_dst = getVmm();
-    auto xmm_dst = Xbyak::Xmm(v_dst.getIdx());
-    auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx());
-    auto xmm_aux = getXmm();
-    auto k_rest_mask = RegistersPool::Reg<Xbyak::Opmask>(registersPool);
-    // Initial CRC
-    mov(r64_aux, CRC_VAL);
-    vpxorq(v_dst, v_dst, v_dst);
-    vpinsrq(xmm_dst, xmm_dst, r64_aux, 0x1);
-    // First xor with source
-    partialLoad(xmm_aux, ptr[r64_src], r64_work_amount);
-    vpshufb(xmm_aux, xmm_aux, xmm_shuf_mask);
-    vpxorq(xmm_dst, xmm_dst, xmm_aux);
-    sub(r64_work_amount, xmm_len);
-}
-
-template <>
-void CombineHash<avx512_core>::bulkFold(const Vmm& v_dst) {
-    Xbyak::Label l_fold_loop, l_end;
-    cmp(r64_work_amount, zmm_len + 3 * xmm_len);
-    jl(l_end, T_NEAR);
-
-    auto r64_aux = getReg64();
-
-    auto v_src_0 = getVmm();
-    auto v_dst_0 = getVmm();
-    auto v_dst_1 = getVmm();
-    auto v_dst_2 = getVmm();
-    auto& v_dst_3 = v_dst;
-    auto v_aux_0 = getVmm();
-
-    auto xmm_k_8_9 = Xbyak::Xmm(v_k_8_9.getIdx());
-    auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx());
-    auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx());
-    auto xmm_src_1 = getXmm();
-    auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx());
-    auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx());
-    auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx());
-    auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx());
-    auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx());
-
-    vmovdqu64(v_dst_0, v_dst_3);
-
-    if (!is_vpclmulqdq) {
-        prefetchnta(ptr[r64_src + 3 * xmm_len]);
-        vmovdqu64(xmm_dst_1, ptr[r64_src + 0 * xmm_len]);
-        vmovdqu64(xmm_dst_2, ptr[r64_src + 1 * xmm_len]);
-        vmovdqu64(xmm_dst_3, ptr[r64_src + 2 * xmm_len]);
-    }
-
-    add(r64_src, 3 * xmm_len);
-    sub(r64_work_amount, zmm_len + 3 * xmm_len);
-
-    L(l_fold_loop); {
-        vmovdqu64(v_src_0, ptr[r64_src]);
-        vpshufb(v_src_0, v_src_0, v_shuf_mask);
-
-        if (is_vpclmulqdq) {
-            vpclmulqdq(v_aux_0, v_dst_0, v_k_8_9, 0b00000000);
-            vpclmulqdq(v_dst_0, v_dst_0, v_k_8_9, 0b00010001);
-            vpxorq(v_aux_0, v_aux_0, v_src_0);
-            vpxorq(v_dst_0, v_dst_0, v_aux_0);
-        } else {
-            // 0
-            vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_8_9, 0b00000000);
-            vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_8_9, 0b00010001);
-            vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0);
-            vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0);
-            // 1
-            vextracti64x2(xmm_src_1, v_src_0, 0x1);
-            vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_8_9, 0b00000000);
-            vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_8_9, 0b00010001);
-            vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1);
-            vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0);
-            // 2
-            vextracti64x2(xmm_src_1, v_src_0, 0x2);
-            vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_8_9, 0b00000000);
-            vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_8_9, 0b00010001);
-            vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1);
-            vpxorq(xmm_dst_2, xmm_dst_2, xmm_aux_0);
-            // 3
-            vextracti64x2(xmm_src_1, v_src_0, 0x3);
-            vpclmulqdq(xmm_aux_0, xmm_dst_3, xmm_k_8_9, 0b00000000);
-            vpclmulqdq(xmm_dst_3, xmm_dst_3, xmm_k_8_9, 0b00010001);
-            vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1);
-            vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
-        }
-
-        add(r64_src, zmm_len);
-        sub(r64_work_amount, zmm_len);
-        jge(l_fold_loop, T_NEAR);
-    }
-    add(r64_work_amount, zmm_len);
-
-    if (is_vpclmulqdq) {
-        auto ymm_dst_0 = Xbyak::Ymm(v_dst_0.getIdx());
-        auto ymm_dst_1 = Xbyak::Ymm(v_dst_1.getIdx());
-        auto ymm_aux_0 = Xbyak::Ymm(v_aux_0.getIdx());
-
-        vextracti64x4(ymm_dst_1, v_dst_0, 0x1);
-        mov(r64_aux, reinterpret_cast<uintptr_t>(CONST_K + 2));
-        vpclmulqdq(ymm_aux_0, ymm_dst_0, ptr[r64_aux], 0b00000000);
-        vpclmulqdq(ymm_dst_0, ymm_dst_0, ptr[r64_aux], 0b00010001);
-        vpxorq(ymm_dst_1, ymm_dst_1, ymm_aux_0);
-        vpxorq(ymm_dst_0, ymm_dst_0, ymm_dst_1);
-
-        vextracti64x2(xmm_dst_3, ymm_dst_0, 0x1);
-        vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_1_2, 0b00000000);
-        vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_1_2, 0b00010001);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0);
-    } else {
-        mov(r64_aux, reinterpret_cast<uintptr_t>(CONST_K + 4));
-        vpclmulqdq(xmm_aux_0, xmm_dst_0, ptr[r64_aux], 0b00000000);
-        vpclmulqdq(xmm_dst_0, xmm_dst_0, ptr[r64_aux], 0b00010001);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0);
-
-        mov(r64_aux, reinterpret_cast<uintptr_t>(CONST_K + 2));
-        vpclmulqdq(xmm_aux_0, xmm_dst_1, ptr[r64_aux], 0b00000000);
-        vpclmulqdq(xmm_dst_1, xmm_dst_1, ptr[r64_aux], 0b00010001);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_1);
-
-        vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_1_2, 0b00000000);
-        vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_1_2, 0b00010001);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2);
-    }
-
-    L(l_end);
-}
-
-template <>
-void CombineHash<avx2>::bulkFold(const Vmm& v_dst) {
-    Xbyak::Label l_fold_loop, l_end;
-    cmp(r64_work_amount, 2 * vlen - xmm_len);
-    jl(l_end, T_NEAR);
-
-    auto r64_aux = getReg64();
-
-    auto v_src_0 = getVmm();
-    auto v_dst_0 = getVmm();
-    auto v_dst_1 = getVmm();
-    auto v_dst_2 = getVmm();
-    auto& v_dst_3 = v_dst;
-    auto v_aux_0 = getVmm();
-
-    auto xmm_k_4_5 = Xbyak::Xmm(v_k_4_5.getIdx());
-    auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx());
-    auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx());
-    auto xmm_src_1 = getXmm();
-    auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx());
-    auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx());
-    auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx());
-    auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx());
-    auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx());
-
-    if (!is_vpclmulqdq) {
-        vmovdqu64(xmm_dst_1, ptr[r64_src + 0 * xmm_len]);
-    }
-
-    add(r64_src, vlen - xmm_len);
-    sub(r64_work_amount, 2 * vlen - xmm_len);
-
-    L(l_fold_loop); {
-        vmovdqu64(v_src_0, ptr[r64_src]);
-        vpshufb(v_src_0, v_src_0, v_shuf_mask);
-
-        if (is_vpclmulqdq) {
-            vpclmulqdq(v_aux_0, v_dst_0, v_k_4_5, 0b00000000);
-            vpclmulqdq(v_dst_0, v_dst_0, v_k_4_5, 0b00010001);
-            vpxorq(v_aux_0, v_aux_0, v_src_0);
-            vpxorq(v_dst_0, v_dst_0, v_aux_0);
-        } else {
-            // 0
-            vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_4_5, 0b00000000);
-            vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_4_5, 0b00010001);
-            vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0);
-            vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0);
-            // 1
-            vextracti128(xmm_src_1, v_src_0, 0x1);
-            vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_4_5, 0b00000000);
-            vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_4_5, 0b00010001);
-            vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1);
-            vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0);
-        }
-
-        add(r64_src, vlen);
-        sub(r64_work_amount, vlen);
-        jge(l_fold_loop, T_NEAR);
-    }
-    add(r64_work_amount, vlen);
-
-    if (is_vpclmulqdq) {
-        auto ymm_dst_0 = Xbyak::Ymm(v_dst_0.getIdx());
-        auto ymm_dst_1 = Xbyak::Ymm(v_dst_1.getIdx());
-        auto ymm_aux_0 = Xbyak::Ymm(v_aux_0.getIdx());
-
-        vextracti128(xmm_dst_3, ymm_dst_0, 0x1);
-        vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_1_2, 0b00000000);
-        vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_1_2, 0b00010001);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0);
-    } else {
-        vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_1_2, 0b00000000);
-        vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_1_2, 0b00010001);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
-        vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2);
-    }
-
-    L(l_end);
-}
-
-
-template <>
-void CombineHash<avx512_core>::tailFold(const Vmm& v_dst) {
-    Xbyak::Label l_fold_to_64, l_save_128, l_end;
-    cmp(r64_work_amount, 0);
-    jle(l_fold_to_64, T_NEAR);
-
-    auto r64_aux = getReg64();
-    auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx());
-    auto xmm_k_1_2 = Xbyak::Xmm(v_k_1_2.getIdx());
-    auto xmm_src = getXmm();
-    auto xmm_dst = Xbyak::Xmm(v_dst.getIdx());
-    auto xmm_aux = getXmm();
-    auto xmm_aux_1 = getXmm();
-    auto xmm_aux_2 = getXmm();
-    auto k_rest_mask = RegistersPool::Reg<Xbyak::Opmask>(registersPool);
-
-    fillRestWorkMask(k_rest_mask, r64_work_amount);
-
-    vpxorq(xmm_src, xmm_src, xmm_src);
-    vmovdqu8(Xbyak::Xmm(xmm_src.getIdx()) | k_rest_mask | T_z, ptr[r64_src]);
-    vpshufb(xmm_src, xmm_src, xmm_shuf_mask);
-
-    vpclmulqdq(xmm_aux, xmm_dst, xmm_k_1_2, 0b00000000);
-    vpclmulqdq(xmm_dst, xmm_dst, xmm_k_1_2, 0b00010001);
-    vpxorq(xmm_aux, xmm_aux, xmm_src);
-    vpxorq(xmm_dst, xmm_dst, xmm_aux);
-
-    L(l_fold_to_64);
-    cmp(r64_make_64_fold, 0);
-    je(l_save_128, T_NEAR);
-
-    mov(r64_aux, reinterpret_cast<uintptr_t>(CONST_K + 8));
-    vpclmulqdq(xmm_aux, xmm_dst, ptr[r64_aux], 0b00000001);
-    vpslldq(xmm_dst, xmm_dst, 0x8);
-    vpxorq(xmm_dst, xmm_dst, xmm_aux);
-
-    mov(r64_aux, reinterpret_cast<uintptr_t>(CONST_K + 10));
-    vmovdqu64(xmm_aux_2, ptr[r64_aux]);
-    vpclmulqdq(xmm_aux, xmm_dst, xmm_aux_2, 0b00000001);
-    mov(r64_aux, 0x0);
-    vpinsrq(xmm_aux_1, xmm_dst, r64_aux, 0x0);
-    vpxorq(xmm_aux, xmm_aux, xmm_aux_1);
-    vpinsrq(xmm_aux_1, xmm_aux, r64_aux, 0x0);
-    vpclmulqdq(xmm_aux, xmm_aux, xmm_aux_2, 0b00010001);
-    vpxorq(xmm_aux, xmm_aux, xmm_aux_1);
-    vpxorq(xmm_dst, xmm_dst, xmm_aux);
-
-    vpextrq(ptr[r64_dst], xmm_dst, 0x0);
-    jmp(l_end, T_NEAR);
-
-
-    L(l_save_128);
-    vmovdqu64(ptr[r64_dst], xmm_dst);
-
-    L(l_end);
-}
-
-template <>
-void CombineHash<avx2>::tailFold(const Vmm& v_dst) {
-}
-
-template <cpu_isa_t isa>
-const uint64_t CombineHash<isa>::CRC_VAL = 0xffffffffffffffff;
-
-// P(x) = 0x42F0E1EBA9EA3693
-template <cpu_isa_t isa>
-const uint64_t CombineHash<isa>::CONST_K[12] = { 0x05f5c3c7eb52fab6, 0x4eb938a7d257740e,  // x^(64*1), x^(64*2)
-                                                 0x571bee0a227ef92b, 0x44bef2a201b5200c,  // x^(64*3), x^(64*4)
-                                                 0x54819d8713758b2c, 0x4a6b90073eb0af5a,  // x^(64*5), x^(64*6)
-                                                 0x5f6843ca540df020, 0xddf4b6981205b83f,  // x^(64*7), x^(64*8)
-                                                 0x05f5c3c7eb52fab6, 0x0000000000000000,  // x^(64*1), x^(64*1) mod P(x)
-                                                 0x578d29d06cc4f872, 0x42f0e1eba9ea3693   // floor(x^128/P(x)) - x^64, P(x) - x^64
-                                                };
-
-template <cpu_isa_t isa>
-const uint8_t CombineHash<isa>::SHUF_MASK[] = { 0b00001111, 0b00001110, 0b00001101, 0b00001100, 0b00001011, 0b00001010, 0b00001001, 0b00001000,
-                                                0b00000111, 0b00000110, 0b00000101, 0b00000100, 0b00000011, 0b00000010, 0b00000001, 0b00000000 };
-
-} // namespace jit
-#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
-
-size_t combine_hash(const void* src, size_t size) {
-#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
-    jit::fn_t kernel;
-
-    if (jit::Generator::mayiuse(jit::avx512_core)) {
-        kernel = jit::CombineHash<jit::avx512_core>::get();
-    } else if (jit::Generator::mayiuse(jit::avx2)) {
-        kernel = jit::CombineHash<jit::avx2>::get();
-    }
-
-    if (kernel) {
-        size_t res = 0lu;
-
-        static const size_t block_size = 2lu * jit::Generator::zmm_len;
-        // There is no sense to perform parallel execution if there are less than 2 blocks.
-        if (size >= 2lu * block_size) {
-            const auto nthr = parallel_get_max_threads() / 2; // TODO: WA for Hyper Threading
-            std::vector<uint64_t> intermediate(nthr * 2); // xmm_len * nthr
-            const uint64_t blocks = size / block_size;
-            const uint64_t el_per_thread = block_size * ((blocks + nthr - 1) / nthr);
-
-            parallel_nt(nthr, [&](const int ithr, const int nthr) {
-                uint64_t start = ithr * el_per_thread;
-                if (start >= size) {
-                    return;
-                }
-                uint64_t work_amount = (el_per_thread + start > size) ? size - start : el_per_thread;
-
-                size_t res = 0lu;
-                jit::CombineHashCallArgs args;
-
-                args.src_ptr = reinterpret_cast<const uint8_t *>(src) + start;
-                args.dst_ptr = &intermediate[ithr * 2];
-                args.work_amount = work_amount;
-                args.make_64_fold = 0lu;
-                kernel(&args);
-            });
-
-
-            jit::CombineHashCallArgs args;
-            args.src_ptr = intermediate.data();
-            args.dst_ptr = &res;
-            args.work_amount = ((size + el_per_thread - 1) / el_per_thread) * jit::Generator::xmm_len;
-            args.make_64_fold = 1lu;
-            kernel(&args);
-        } else {
-            jit::CombineHashCallArgs args;
-            args.src_ptr = src;
-            args.dst_ptr = &res;
-            args.work_amount = size;
-            args.make_64_fold = 1lu;
-            kernel(&args);
-        }
-        return res;
-    }
-#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
-
-    constexpr auto cel_size = sizeof(size_t);
-    auto seed = static_cast<size_t>(size);
-    const auto data = static_cast<const size_t*>(src);
-    const auto d_end = std::next(data, size / cel_size);
-    // The constant value used as a magic number has been
-    // traditionally used e.g. in boost library's hash_combine.
-    // It happens to be derived from the golden ratio.
-    for (auto d = data; d != d_end; ++d) {
-        seed ^= *d + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-    }
-    size_t last_bytes{0};
-    std::memcpy(&last_bytes, d_end, size % cel_size);
-    seed ^= last_bytes + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-    return seed;
-}
-
-}   // namespace runtime
-}   // namespace ov
diff --git a/src/core/reference/src/op/utils/jit_generator.cpp b/src/core/reference/src/utils/jit_generator.cpp
similarity index 94%
rename from src/core/reference/src/op/utils/jit_generator.cpp
rename to src/core/reference/src/utils/jit_generator.cpp
index 174cbb9242acc4..39dc31c0033f9f 100644
--- a/src/core/reference/src/op/utils/jit_generator.cpp
+++ b/src/core/reference/src/utils/jit_generator.cpp
@@ -11,9 +11,10 @@
 #    endif
 #    include <xbyak/xbyak_util.h>
 
-#    include "openvino/reference/utils/jit_generator.hpp"
+#    include "openvino/core/except.hpp"
 #    include "openvino/core/type/bfloat16.hpp"
 #    include "openvino/core/type/float16.hpp"
+#    include "openvino/reference/utils/jit_generator.hpp"
 
 namespace ov {
 namespace reference {
@@ -64,10 +65,18 @@ bool Generator::mayiuse(const cpu_isa_t cpu_isa) {
 bool Generator::is_x64() {
     return sizeof(void*) == 8;
 }
-Generator::Generator(void* code_ptr, size_t code_size)
+Generator::Generator(cpu_isa_t isa, void* code_ptr, size_t code_size)
     : Xbyak::CodeGenerator(code_size, code_ptr),
       size_of_abi_save_regs(num_abi_save_gpr_regs * rax.getBit() / 8 + xmm_to_preserve * xmm_len),
-      reg_EVEX_max_8b_offt(rbp) {}
+      reg_EVEX_max_8b_offt(rbp) {
+    if (isa == avx512_core) {  // m_vlen is set to zmm_len or ymm_len according to the requested ISA.
+        m_vlen = zmm_len;
+    } else if (isa == avx2) {
+        m_vlen = ymm_len;
+    } else {
+        OPENVINO_THROW("Unsupported isa: ", isa);  // Any other ISA value is rejected.
+    }
+}
 
 void Generator::preamble() {
     if (xmm_to_preserve) {
diff --git a/src/core/src/pass/serialize.cpp b/src/core/src/pass/serialize.cpp
index c36b681d9e034d..c182b13594b74d 100644
--- a/src/core/src/pass/serialize.cpp
+++ b/src/core/src/pass/serialize.cpp
@@ -22,8 +22,8 @@
 #include "openvino/opsets/opset1.hpp"
 #include "openvino/pass/constant_folding.hpp"
 #include "openvino/reference/convert.hpp"
-#include "openvino/reference/utils/combine_hash.hpp"
 #include "openvino/runtime/aligned_buffer.hpp"
+#include "openvino/runtime/compute_hash.hpp"
 #include "openvino/runtime/string_aligned_buffer.hpp"
 #include "openvino/util/file_util.hpp"
 #include "pugixml.hpp"
@@ -76,9 +76,10 @@ class ConstantWriter {
     using HashValue = size_t;
     using ConstWritePositions = std::multimap<HashValue, std::pair<FilePosition, void const*>>;
 
-    ConstantWriter(std::ostream& bin_data, bool enable_compression = true)
+    ConstantWriter(std::ostream& bin_data, bool enable_compression = true, bool write_hash_value = false)
         : m_binary_output(bin_data),
           m_enable_compression(enable_compression),
+          m_write_hash_value(write_hash_value),
           m_blob_offset(bin_data.tellp()) {}
 
     FilePosition write(const char* ptr,
@@ -116,18 +117,26 @@ class ConstantWriter {
             // the same hash for {2, 2} and {0, 128} arrays.
             // But even strong hashing algorithms sometimes give collisions.
             // Therefore we always have to compare values when finding a match in the hash multimap.
-            const HashValue hash = ov::runtime::combine_hash(ptr_to_write, *new_size);
+            const HashValue hash = ov::runtime::compute_hash(ptr_to_write, *new_size);
+
             auto found = m_hash_to_file_positions.find(hash);
             // iterate over all matches of the key in the multimap
             while (found != m_hash_to_file_positions.end()) {
-                if (memcmp(ptr, found->second.second, size) == 0)
+                if (memcmp(ptr, found->second.second, size) == 0) {
                     return found->second.first;
+                }
                 found++;
             }
             // Since fp16_compressed data will be disposed at exit point and since we cannot reread it from the ostream,
             // we store pointer to the original uncompressed blob.
             m_hash_to_file_positions.insert({hash, {offset, static_cast<void const*>(ptr)}});
-            m_binary_output.write(ptr_to_write, *new_size);
+            if (m_write_hash_value) {
+                // HashValue is size_t and may be narrower than 64 bits: widen it before writing 8 bytes.
+                const uint64_t hash_u64 = hash;
+                m_binary_output.write(reinterpret_cast<const char*>(&hash_u64), sizeof(hash_u64));
+            } else {
+                m_binary_output.write(ptr_to_write, *new_size);
+            }
         }
         return offset;
     }
@@ -172,6 +179,7 @@ class ConstantWriter {
     ConstWritePositions m_hash_to_file_positions;
     std::ostream& m_binary_output;
     bool m_enable_compression;
+    bool m_write_hash_value;
     FilePosition m_blob_offset;  // blob offset inside output stream
 };
 
@@ -1205,7 +1213,7 @@ void serializeFunc(std::ostream& xml_file,
     std::string name = "net";
     pugi::xml_document xml_doc;
     pugi::xml_node net_node = xml_doc.append_child(name.c_str());
-    ConstantWriter constant_write_handler(bin_file);
+    ConstantWriter constant_write_handler(bin_file, true, true);  // NOTE(review): hashes replace blob data for ALL callers - confirm.
     XmlSerializer visitor(net_node, name, constant_write_handler, version, deterministic);
     visitor.on_attribute(name, model);
 
 
@@ -1377,10 +1385,19 @@ bool pass::StreamSerialize::run_on_model(const std::shared_ptr<ov::Model>& model
 /// -------- Hash calculation pass -------------
 
 namespace {
-template <typename T>
-static uint64_t hash_combine(uint64_t seed, const T& a) {
-    // Hash combine formula from boost
-    return seed ^ (std::hash<T>()(a) + 0x9e3779b9 + (seed << 6) + (seed >> 2));
+// Folds the value k into the running hash h using the 64-bit hash combine formula from boost.
+inline uint64_t hash_combine(uint64_t h, uint64_t k) {
+    constexpr uint64_t m = 0xc6a4a7935bd1e995;  // Multiplier applied in every mixing step below.
+    constexpr int r = 47;                       // Right-shift distance used to mix k with itself.
+
+    k *= m;
+    k ^= k >> r;
+    k *= m;
+
+    h ^= k;
+    h *= m;
+
+    return h + 0xe6546b64;  // A constant offset is added after the final multiply.
 }
 
 class OstreamHashWrapper final : public std::streambuf {
@@ -1392,19 +1409,27 @@ class OstreamHashWrapper final : public std::streambuf {
     }
 
     std::streamsize xsputn(const char* s, std::streamsize n) override {
-        // Reinterpret data as uint32_t and accumulate in uint64_t to avoid overflow fluctuations in parallel_sum.
-        auto* int_sum = reinterpret_cast<const uint32_t*>(s);
-        const uint64_t n32 = n / sizeof(uint32_t);
+        uint64_t h = ov::runtime::compute_hash(s, n);
+        m_res = hash_combine(m_res, h);
+
+        return n;
+    }
+};
 
-        m_res += parallel_sum(n32, uint64_t(0lu), [&](size_t k) -> uint32_t {
-            return int_sum[k];
-        });
+class OstreamHashWrapperBin final : public std::streambuf {
+    uint64_t m_res = 0lu;
 
-        const uint64_t rest = n % sizeof(uint32_t);
-        for (uint64_t i = 0lu; i < rest; i++) {
-            m_res += s[n - rest + i];
-        }
+public:
+    uint64_t getResult() const {
+        return m_res;
+    }
 
+    std::streamsize xsputn(const char* s, std::streamsize n) override {
+        // Copy instead of dereferencing a casted pointer: `s` carries no alignment guarantee and may hold
+        // fewer than 8 bytes. At most sizeof(uint64_t) bytes are consumed; missing bytes count as zero.
+        uint64_t value = 0lu;
+        memcpy(&value, s, static_cast<size_t>(n) < sizeof(value) ? static_cast<size_t>(n) : sizeof(value));
+        m_res = hash_combine(m_res, value);
         return n;
     }
 };
@@ -1413,7 +1434,7 @@ class OstreamHashWrapper final : public std::streambuf {
 bool pass::Hash::run_on_model(const std::shared_ptr<ov::Model>& model) {
     RUN_ON_MODEL_SCOPE(Hash);
     OstreamHashWrapper xmlHash;
-    OstreamHashWrapper binHash;
+    OstreamHashWrapperBin binHash;
     std::ostream xml(&xmlHash);
     std::ostream bin(&binHash);
 
diff --git a/src/core/src/runtime/compute_hash.cpp b/src/core/src/runtime/compute_hash.cpp
new file mode 100644
index 00000000000000..395873c86d90f9
--- /dev/null
+++ b/src/core/src/runtime/compute_hash.cpp
@@ -0,0 +1,922 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// The CRC computation is used for x86.
+// The calculations were taken from the article
+// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel (December, 2009)".
+
+#include "openvino/runtime/compute_hash.hpp"
+
+#include <cmath>
+#include <cstring>
+
+#include "openvino/core/visibility.hpp"
+
+#if !defined(OS_CHROMEOS) && (defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64))
+#    define OV_CORE_USE_XBYAK_JIT
+#endif
+
+#ifdef OV_CORE_USE_XBYAK_JIT
+#    include "openvino/core/parallel.hpp"
+#    include "openvino/reference/utils/registers_pool.hpp"
+#endif  // OV_CORE_USE_XBYAK_JIT
+
+namespace ov {
+namespace runtime {
+
+#ifdef OV_CORE_USE_XBYAK_JIT
+
+using namespace ov::reference::jit;
+
+namespace jit {
+
+#    define GET_OFF(field) offsetof(ComputeHashCallArgs, field)  // Byte offset of a ComputeHashCallArgs member.
+#    define getReg64()     RegistersPool::Reg<Xbyak::Reg64>(m_registers_pool)  // Pool-managed 64-bit GPR handle.
+#    define getVmm()       RegistersPool::Reg<Vmm>(m_registers_pool)           // Pool-managed Vmm register handle.
+#    define getXmm()       RegistersPool::Reg<Xbyak::Xmm>(m_registers_pool)    // Pool-managed Xmm register handle.
+
+enum KernelType { SINGLE_THREAD = 0, FIRST_THREAD, N_THREAD, FINAL_FOLD };  // Kernel variant, chosen at JIT time.
+
+struct ComputeHashCompileParams {  // Parameters fixed at the moment the kernel code is generated.
+    KernelType type;  // Selects which code paths the generator emits.
+};
+
+struct ComputeHashCallArgs {  // Runtime arguments; the generated code reads members through GET_OFF offsets.
+    const void* src_ptr = nullptr;  // Address the kernel starts reading input bytes from.
+    void* dst_ptr = nullptr;  // Loaded into r64_dst_ptr by initialize().
+    const void* k_ptr = nullptr;  // Base of the constant table addressed with the K_*_OFF byte offsets.
+    void* intermediate_ptr = nullptr;  // NOTE(review): usage not confirmed - document once verified.
+    uint64_t work_amount = 0lu;  // Remaining amount to process; compared with and reduced by byte lengths.
+    uint64_t size = 0lu;  // Inserted into the initial CRC state (SINGLE_THREAD / FIRST_THREAD kernels).
+    uint64_t threads_num = 1lu;  // Multiplied by the vector length to form the multi-thread bulk step.
+};
+
+using hash_kernel = void (*)(const ComputeHashCallArgs*);  // Signature of a generated hash kernel entry point.
+
+static const uint8_t SHUF_MASK[16] = {0b00001111,  // vpshufb control: indices 15..0, i.e. byte reversal per 16 bytes.
+                                      0b00001110,
+                                      0b00001101,
+                                      0b00001100,
+                                      0b00001011,
+                                      0b00001010,
+                                      0b00001001,
+                                      0b00001000,
+                                      0b00000111,
+                                      0b00000110,
+                                      0b00000101,
+                                      0b00000100,
+                                      0b00000011,
+                                      0b00000010,
+                                      0b00000001,
+                                      0b00000000};
+
+constexpr uint64_t CRC_VAL = 0xffffffffffffffff;  // Placed in the upper qword of the initial CRC state.
+
+// Generator polynomial P(x) = 0x42F0E1EBA9EA3693 (the x^64 term is implicit, see "P(x)-x^64" below).
+constexpr uint64_t K_2 = 0x05f5c3c7eb52fab6;
+constexpr uint64_t P_1 = 0x578d29d06cc4f872;
+constexpr uint64_t P_2 = 0x42f0e1eba9ea3693;
+static const uint64_t K_PULL[] = {  // Pairs of 64-bit constants; pair i begins at byte offset i * 16.
+    K_2,                 // x^(64*2)
+    0x0000000000000000,  // x^(64*1) mod P(x)
+    P_1,                 // floor(x^128/P(x))-x^64
+    P_2,                 // P(x)-x^64
+    K_2,                 // x^(64*2)
+    0x4eb938a7d257740e,  // x^(64*3)
+    0x571bee0a227ef92b,  // x^(64*4)
+    0x44bef2a201b5200c,  // x^(64*5)
+    0x54819d8713758b2c,  // x^(64*6)
+    0x4a6b90073eb0af5a,  // x^(64*7)
+    0x5f6843ca540df020,  // x^(64*8)
+    0xddf4b6981205b83f,  // x^(64*9)
+    0x097c516e98bd2e73,  // x^(64*10)
+    0x0b76477b31e22e7b,  // x^(64*11)
+    0x9af04e1eff82d0dd,  // x^(64*12)
+    0x6e82e609297f8fe8,  // x^(64*13)
+    0xe464f4df5fb60ac1,  // x^(64*14)
+    0xb649c5b35a759cf2,  // x^(64*15)
+    0x05cf79dea9ac37d6,  // x^(64*16)
+    0x001067e571d7d5c2   // x^(64*17)
+};
+
+constexpr uint64_t K_1_0_OFF = 0lu * 2lu * sizeof(uint64_t);  // Byte offsets of consecutive 16-byte constant pairs;
+constexpr uint64_t K_P_P_OFF = 1lu * 2lu * sizeof(uint64_t);  // the kernel adds them to the k_ptr table base.
+constexpr uint64_t K_2_3_OFF = 2lu * 2lu * sizeof(uint64_t);
+constexpr uint64_t K_4_5_OFF = 3lu * 2lu * sizeof(uint64_t);
+constexpr uint64_t K_6_7_OFF = 4lu * 2lu * sizeof(uint64_t);
+constexpr uint64_t K_8_9_OFF = 5lu * 2lu * sizeof(uint64_t);
+constexpr uint64_t K_10_11_OFF = 6lu * 2lu * sizeof(uint64_t);
+constexpr uint64_t K_12_13_OFF = 7lu * 2lu * sizeof(uint64_t);
+constexpr uint64_t K_14_15_OFF = 8lu * 2lu * sizeof(uint64_t);
+constexpr uint64_t K_16_17_OFF = 9lu * 2lu * sizeof(uint64_t);
+
+class HashBase : public Generator {  // Common base of the JIT hash kernels; holds the generated entry point.
+protected:
+    hash_kernel ker_fn = nullptr;  // Null until create_kernel() has produced the code.
+
+public:
+    explicit HashBase(cpu_isa_t isa) : Generator(isa) {}
+
+    virtual void generate() = 0;  // Emits the kernel instructions; provided by the ISA-specific subclass.
+
+    void operator()(const ComputeHashCallArgs* args) {  // Runs the kernel; create_kernel() must be called first.
+        ker_fn(args);
+    }
+
+    virtual void create_kernel() {  // Generates the code and stores its entry point in ker_fn.
+        generate();
+        ker_fn = (decltype(ker_fn))getCode();
+        OPENVINO_ASSERT(ker_fn, "[ CORE ] Could not generate kernel code.");
+    }
+};
+
+template <cpu_isa_t isa>
+class ComputeHash : public HashBase {  // JIT generator of the CRC hash kernel for one ISA and one KernelType.
+public:
+    explicit ComputeHash(const ComputeHashCompileParams& jcp) : HashBase(isa), m_jcp(jcp) {
+        if (!mayiuse(cpu_isa_t::pclmulqdq)) {  // Carry-less multiplication is mandatory for this kernel.
+            OPENVINO_THROW(
+                "The current CPU does not support pclmulqdq instruction, which is required for the CRC algorithm.");
+        }
+        if (mayiuse(cpu_isa_t::vpclmulqdq)) {  // Remember whether the vpclmulqdq feature is reported.
+            is_vpclmulqdq = true;
+        }
+    }
+
+    void generate() override {  // Emits the whole kernel: register setup, preamble, stages, postamble.
+        m_registers_pool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0});
+
+        r64_src_ptr = getReg64();
+        r64_dst_ptr = getReg64();
+        r64_work_amount = getReg64();
+        r64_k_ptr = getReg64();
+        r64_aux = getReg64();
+        v_k_2_3 = getVmm();
+        v_shuf_mask = getVmm();
+        auto v_dst = getVmm();  // Accumulator register handed to every stage below.
+
+        this->preamble();
+
+        initialize(v_dst);  // The stages are emitted in this fixed order.
+        bulk_fold(v_dst);
+        join(v_dst);
+        fold_to_128(v_dst);
+        fold_to_64(v_dst);
+
+        this->postamble();
+        m_registers_pool.reset();  // The pool is released once emission is finished.
+    }
+
+    static std::shared_ptr<HashBase> create(const ComputeHashCompileParams& params) {  // Builds and compiles a kernel.
+        auto kernel = std::make_shared<ComputeHash>(params);
+        OPENVINO_ASSERT(kernel, "[ CORE ] Could not create ComputeHash kernel.");
+        kernel->create_kernel();
+
+        return kernel;
+    }
+
+private:
+    using Vmm = typename std::conditional<isa == avx512_core, Xbyak::Zmm, Xbyak::Ymm>::type;  // Zmm or Ymm by ISA.
+    bool is_vpclmulqdq = false;
+
+    ComputeHashCompileParams m_jcp;
+    RegistersPool::Ptr m_registers_pool;
+
+    const Xbyak::Reg64 r64_params = abi_param1;  // Base register for reading ComputeHashCallArgs members.
+
+    RegistersPool::Reg<Xbyak::Reg64> r64_src_ptr;
+    RegistersPool::Reg<Xbyak::Reg64> r64_dst_ptr;
+    RegistersPool::Reg<Xbyak::Reg64> r64_work_amount;
+    RegistersPool::Reg<Xbyak::Reg64> r64_k_ptr;
+    RegistersPool::Reg<Xbyak::Reg64> r64_aux;
+
+    // Vector registers
+    RegistersPool::Reg<Vmm> v_k_2_3;      // Receives the broadcast constant pair located at K_2_3_OFF.
+    RegistersPool::Reg<Vmm> v_shuf_mask;  // Receives the broadcast SHUF_MASK used with vpshufb.
+
+    void initialize(const Vmm& v_dst);
+
+    void bulk_fold(const Vmm& v_dst);
+
+    void join(const Vmm& v_dst);
+
+    void fold_to_128(const Vmm& v_dst);
+
+    void fold_to_64(const Vmm& v_dst);
+
+    void uni_vpxorq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src_0, const Xbyak::Xmm& v_src_1);
+
+    void uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0);
+
+    void uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0);
+
+    void uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0);
+
+    void partial_load(const Xbyak::Xmm& xmm_dst, const Xbyak::Address& src_addr, const Xbyak::Reg64& r64_load_num);
+
+    void partial_load(const Xbyak::Ymm& ymm_dst, const Xbyak::Address& src_addr, const Xbyak::Reg64& r64_load_num);
+};
+
+template <>  // uni_* helpers: the avx512_core specialization emits the AVX-512 mnemonic, other ISAs the AVX/AVX2 one.
+void ComputeHash<avx512_core>::uni_vpxorq(const Xbyak::Xmm& v_dst,
+                                          const Xbyak::Xmm& v_src_0,
+                                          const Xbyak::Xmm& v_src_1) {
+    vpxorq(v_dst, v_src_0, v_src_1);
+}
+template <cpu_isa_t isa>  // Bitwise XOR for the remaining ISAs.
+void ComputeHash<isa>::uni_vpxorq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src_0, const Xbyak::Xmm& v_src_1) {
+    vpxor(v_dst, v_src_0, v_src_1);
+}
+template <>  // Unaligned move into a vector register, AVX-512 form.
+void ComputeHash<avx512_core>::uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0) {
+    vmovdqu64(v_dst, v_src_0);
+}
+template <cpu_isa_t isa>  // Unaligned move into a vector register for the remaining ISAs.
+void ComputeHash<isa>::uni_vmovdqu64(const Xbyak::Xmm& v_dst, const Xbyak::Operand& v_src_0) {
+    vmovdqu(v_dst, v_src_0);
+}
+template <>  // Unaligned store from a vector register to memory, AVX-512 form.
+void ComputeHash<avx512_core>::uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0) {
+    vmovdqu64(v_dst, v_src_0);
+}
+template <cpu_isa_t isa>  // Unaligned store from a vector register to memory for the remaining ISAs.
+void ComputeHash<isa>::uni_vmovdqu64(const Xbyak::Address& v_dst, const Xbyak::Xmm& v_src_0) {
+    vmovdqu(v_dst, v_src_0);
+}
+template <>  // Broadcast of a 128-bit memory operand to the destination register, AVX-512 form.
+void ComputeHash<avx512_core>::uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0) {
+    vbroadcasti64x2(v_dst, v_src_0);
+}
+template <cpu_isa_t isa>  // Broadcast of a 128-bit memory operand for the remaining ISAs.
+void ComputeHash<isa>::uni_vbroadcasti64x2(const Xbyak::Ymm& v_dst, const Xbyak::Address& v_src_0) {
+    vbroadcasti128(v_dst, v_src_0);
+}
+template <>  // AVX-512: masked byte load from src_addr; bytes at index >= r64_load_num are zeroed (T_z).
+void ComputeHash<avx512_core>::partial_load(const Xbyak::Xmm& xmm_dst,
+                                            const Xbyak::Address& src_addr,
+                                            const Xbyak::Reg64& r64_load_num) {
+    Xbyak::Label l_mv_mask;
+    auto rOnes = getReg64();
+    auto k_load_mask = RegistersPool::Reg<Xbyak::Opmask>(m_registers_pool);
+
+    mov(rOnes, 0xFFFFFFFFFFFFFFFF);
+    cmp(r64_load_num, 0x3f);
+    jg(l_mv_mask);  // More than 63 bytes requested: keep the all-ones mask.
+
+    shlx(rOnes, rOnes, r64_load_num);
+    not_(rOnes);  // Now the low r64_load_num bits are set: one mask bit per byte to load.
+
+    L(l_mv_mask);
+    kmovq(k_load_mask, rOnes);
+
+    vmovdqu8(Vmm(xmm_dst.getIdx()) | k_load_mask | T_z, ptr[src_addr.getRegExp()]);  // Honors the src_addr argument.
+}
+template <cpu_isa_t isa>  // Generic path: one full 16-byte load, or byte-wise insertion when fewer bytes are requested.
+void ComputeHash<isa>::partial_load(const Xbyak::Xmm& xmm_dst,
+                                    const Xbyak::Address& src_addr,
+                                    const Xbyak::Reg64& r64_load_num) {
+    Xbyak::Label l_partial, l_end;
+
+    cmp(r64_load_num, xmm_len);
+    jl(l_partial, T_NEAR);  // Fewer than xmm_len bytes: take the byte-wise path.
+    uni_vmovdqu64(xmm_dst, ptr[src_addr.getRegExp()]);
+    jmp(l_end, T_NEAR);
+
+    L(l_partial);
+    {
+        uni_vpxorq(xmm_dst, xmm_dst, xmm_dst);  // Zero the destination so bytes that are not loaded stay 0.
+        for (size_t j = 0lu; j < xmm_len - 1; j++) {  // Unrolled at generation time: one check + insert per byte.
+            cmp(r64_load_num, j);
+            jle(l_end, T_NEAR);  // Stop as soon as j reaches the requested byte count.
+            pinsrb(xmm_dst, ptr[src_addr.getRegExp() + j], j);
+        }
+    }
+
+    L(l_end);
+}
+template <>  // AVX-512: the Ymm overload simply forwards to the masked-load Xmm overload with the same index.
+void ComputeHash<avx512_core>::partial_load(const Xbyak::Ymm& ymm_dst,
+                                            const Xbyak::Address& src_addr,
+                                            const Xbyak::Reg64& r64_load_num) {
+    partial_load(Xbyak::Xmm(ymm_dst.getIdx()), src_addr, r64_load_num);
+}
+template <cpu_isa_t isa>  // Generic Ymm path: full 32-byte load, or 16 bytes plus a byte-wise tail, or byte-wise only.
+void ComputeHash<isa>::partial_load(const Xbyak::Ymm& ymm_dst,
+                                    const Xbyak::Address& src_addr,
+                                    const Xbyak::Reg64& r64_load_num) {
+    Xbyak::Label l_xmm, l_partial, l_end;
+    auto xmm_dst = Xbyak::Xmm(ymm_dst.getIdx());
+
+    cmp(r64_load_num, ymm_len);
+    jl(l_xmm, T_NEAR);
+    uni_vmovdqu64(ymm_dst, ptr[src_addr.getRegExp()]);
+    jmp(l_end, T_NEAR);
+
+    L(l_xmm);
+    uni_vpxorq(ymm_dst, ymm_dst, ymm_dst);  // Zero everything first so bytes that are not loaded stay 0.
+    cmp(r64_load_num, xmm_len);
+    jl(l_partial, T_NEAR);
+    uni_vmovdqu64(xmm_dst, ptr[src_addr.getRegExp()]);
+    je(l_end, T_NEAR);  // Still tests the cmp above (the move leaves flags alone): exactly xmm_len bytes requested.
+
+    {
+        Xbyak::Label l_perm;
+
+        vperm2i128(ymm_dst, ymm_dst, ymm_dst, 0x1);  // Swap halves so the tail can be inserted via the low lane.
+        for (size_t j = 0lu; j < xmm_len - 1; j++) {
+            cmp(r64_load_num, xmm_len + j);
+            jle(l_perm, T_NEAR);
+            pinsrb(xmm_dst, ptr[src_addr.getRegExp() + xmm_len + j], j);
+        }
+        L(l_perm);
+        vperm2i128(ymm_dst, ymm_dst, ymm_dst, 0x1);  // Swap back: first 16 bytes low, inserted tail high.
+    }
+    jmp(l_end, T_NEAR);
+
+    L(l_partial);
+    {
+        for (size_t j = 0lu; j < xmm_len - 1; j++) {  // Fewer than xmm_len bytes: insert them one by one.
+            cmp(r64_load_num, j);
+            jle(l_end, T_NEAR);
+            pinsrb(xmm_dst, ptr[src_addr.getRegExp() + j], j);
+        }
+    }
+
+    L(l_end);
+}
+
+template <cpu_isa_t isa>  // Emits the prologue: loads call arguments and constants, then seeds v_dst per kernel type.
+void ComputeHash<isa>::initialize(const Vmm& v_dst) {
+    mov(r64_src_ptr, ptr[r64_params + GET_OFF(src_ptr)]);
+    mov(r64_dst_ptr, ptr[r64_params + GET_OFF(dst_ptr)]);
+    mov(r64_k_ptr, ptr[r64_params + GET_OFF(k_ptr)]);
+    mov(r64_work_amount, ptr[r64_params + GET_OFF(work_amount)]);
+
+    uni_vbroadcasti64x2(v_k_2_3, ptr[r64_k_ptr + K_2_3_OFF]);
+
+    mov(r64_aux, reinterpret_cast<uintptr_t>(SHUF_MASK));  // The mask address is embedded as an immediate.
+    uni_vbroadcasti64x2(v_shuf_mask, ptr[r64_aux]);
+
+    if (m_jcp.type == SINGLE_THREAD || m_jcp.type == FIRST_THREAD) {
+        auto xmm_dst = Xbyak::Xmm(v_dst.getIdx());
+        auto xmm_aux = getXmm();
+
+        // Initial CRC
+        mov(r64_aux, ptr[r64_params + GET_OFF(size)]);
+        vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x0);  // Low qword: the `size` call argument.
+        mov(r64_aux, CRC_VAL);
+        vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x1);  // High qword: CRC_VAL.
+
+        // First xor with source.
+        partial_load(v_dst, ptr[r64_src_ptr], r64_work_amount);
+        vpshufb(v_dst, v_dst, v_shuf_mask);
+        pxor(xmm_dst, xmm_aux);  // The SSE version is used to avoid zeroing out the rest of the Vmm.
+        if (m_jcp.type == SINGLE_THREAD) {  // Only the single-thread kernel advances the source pointer here.
+            add(r64_src_ptr, xmm_len);
+        }
+    } else if (m_jcp.type == N_THREAD) {  // N_THREAD starts from a full vector load without the CRC seed.
+        uni_vmovdqu64(v_dst, ptr[r64_src_ptr]);
+        vpshufb(v_dst, v_dst, v_shuf_mask);
+    }
+    if (m_jcp.type == SINGLE_THREAD || m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) {
+        sub(r64_work_amount, xmm_len);  // These kernel types reduce the work counter by xmm_len here.
+    }
+}
+
+/**
+ * @brief avx512_core specialization: emits the bulk folding loop that consumes the source in vlen-sized steps.
+ *
+ * Code is generated only for the SINGLE_THREAD, FIRST_THREAD and N_THREAD kernel types. At run time the
+ * whole section is skipped when fewer than (2 * vlen - xmm_len) bytes remain in r64_work_amount.
+ * SINGLE_THREAD kernels finish by reducing the four 128-bit lanes into the low lane of v_dst; the
+ * multi-thread kernels store their lanes to the buffer addressed by r64_dst_ptr instead.
+ *
+ * @param v_dst Accumulator register (presumably prepared by initialize() - confirm in the generator entry
+ *              point); it is also reused as the last 128-bit lane of the accumulator.
+ */
+template <>
+void ComputeHash<avx512_core>::bulk_fold(const Vmm& v_dst) {
+    if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FIRST_THREAD && m_jcp.type != N_THREAD) {
+        return;
+    }
+    Xbyak::Label l_fold_loop, l_end;
+    // Runtime guard: not enough data for a single bulk iteration -> skip the whole section.
+    // NOTE(review): on this path the multi-thread kernels do not write anything to r64_dst_ptr.
+    cmp(r64_work_amount, 2 * get_vlen() - xmm_len);
+    jl(l_end, T_NEAR);
+
+    auto v_src_0 = getVmm();
+    auto v_dst_0 = getVmm();
+    auto v_dst_1 = getVmm();
+    auto v_dst_2 = getVmm();
+    auto& v_dst_3 = v_dst;  // The caller's register doubles as the fourth lane.
+    auto v_k_loop = getVmm();
+    auto v_aux_0 = getVmm();
+
+    // 128-bit views of the registers above (same register indices).
+    auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx());
+    auto xmm_src_1 = getXmm();
+    auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx());
+    auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx());
+    auto xmm_dst_2 = Xbyak::Xmm(v_dst_2.getIdx());
+    auto xmm_dst_3 = Xbyak::Xmm(v_dst_3.getIdx());
+    auto xmm_k_loop = Xbyak::Xmm(v_k_loop.getIdx());
+    auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx());
+    auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx());
+
+    // In the multi-thread kernels the source pointer advances by threads_num * vlen per iteration.
+    RegistersPool::Reg<Xbyak::Reg64> r64_bulk_step;
+    if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) {
+        r64_bulk_step = getReg64();
+        mov(r64_bulk_step, ptr[r64_params + GET_OFF(threads_num)]);
+        sal(r64_bulk_step, static_cast<int>(std::log2(get_vlen())));  // * vlen
+    }
+
+    // The loop constant differs between the single-thread and the multi-thread kernels.
+    if (m_jcp.type == SINGLE_THREAD) {
+        uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_8_9_OFF]);
+    } else {
+        uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_16_17_OFF]);
+    }
+
+    uni_vmovdqu64(v_dst_0, v_dst);
+
+    // When is_vpclmulqdq is false the multiplications are issued per 128-bit lane,
+    // so the accumulator is split into four xmm registers up front.
+    if (!is_vpclmulqdq) {
+        vextracti64x2(xmm_dst_1, v_dst_0, 0x1);
+        vextracti64x2(xmm_dst_2, v_dst_0, 0x2);
+        vextracti64x2(xmm_dst_3, v_dst_0, 0x3);
+    }
+
+    // Move to the next chunk. In the single-thread kernel initialize() has already advanced the pointer
+    // by xmm_len, hence only (vlen - xmm_len) is added here.
+    if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) {
+        add(r64_src_ptr, r64_bulk_step);
+        prefetcht2(ptr[r64_src_ptr + 16384]);
+    } else {
+        add(r64_src_ptr, get_vlen() - xmm_len);
+        prefetcht2(ptr[r64_src_ptr + 4096]);
+    }
+    prefetcht1(ptr[r64_src_ptr + 1024]);
+    prefetcht0(ptr[r64_src_ptr + 64]);
+
+    // Pre-subtract so that the loop condition below reduces to a sign check of the counter.
+    sub(r64_work_amount, 2 * get_vlen() - xmm_len);
+
+    L(l_fold_loop);
+    {
+        // Load the next vlen bytes and apply the byte shuffle.
+        uni_vmovdqu64(v_src_0, ptr[r64_src_ptr]);
+        vpshufb(v_src_0, v_src_0, v_shuf_mask);
+
+        if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) {
+            add(r64_src_ptr, r64_bulk_step);
+            prefetcht2(ptr[r64_src_ptr + 16384]);
+        } else {
+            add(r64_src_ptr, get_vlen());
+            prefetcht2(ptr[r64_src_ptr + 4096]);
+        }
+        prefetcht1(ptr[r64_src_ptr + 1024]);
+        prefetcht0(ptr[r64_src_ptr + 64]);
+
+        // Fold step: dst = clmul(dst.lo, k.lo) ^ clmul(dst.hi, k.hi) ^ src, for every 128-bit lane.
+        if (is_vpclmulqdq) {
+            vpclmulqdq(v_aux_0, v_dst_0, v_k_loop, 0b00000000);
+            vpclmulqdq(v_dst_0, v_dst_0, v_k_loop, 0b00010001);
+            uni_vpxorq(v_aux_0, v_aux_0, v_src_0);
+            uni_vpxorq(v_dst_0, v_dst_0, v_aux_0);
+        } else {
+            // 0
+            vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_loop, 0b00000000);
+            vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_loop, 0b00010001);
+            uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0);
+            uni_vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0);
+
+            // 1
+            vextracti64x2(xmm_src_1, v_src_0, 0x1);
+            vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_loop, 0b00000000);
+            vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_loop, 0b00010001);
+            uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1);
+            uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0);
+
+            // 2
+            vextracti64x2(xmm_src_1, v_src_0, 0x2);
+            vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_loop, 0b00000000);
+            vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_loop, 0b00010001);
+            uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1);
+            uni_vpxorq(xmm_dst_2, xmm_dst_2, xmm_aux_0);
+
+            // 3
+            vextracti64x2(xmm_src_1, v_src_0, 0x3);
+            vpclmulqdq(xmm_aux_0, xmm_dst_3, xmm_k_loop, 0b00000000);
+            vpclmulqdq(xmm_dst_3, xmm_dst_3, xmm_k_loop, 0b00010001);
+            uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1);
+            uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
+        }
+
+        sub(r64_work_amount, get_vlen());
+        jge(l_fold_loop, T_NEAR);
+    }
+    // The loop exits with the counter one step below zero; restore the real remainder.
+    add(r64_work_amount, get_vlen());
+
+    if (m_jcp.type == SINGLE_THREAD) {
+        // Reduce the four 128-bit lanes into xmm_dst_3, i.e. the low lane of the caller's v_dst.
+        if (is_vpclmulqdq) {
+            vextracti64x2(xmm_dst_1, v_dst_0, 0x1);
+            vextracti64x2(xmm_dst_2, v_dst_0, 0x2);
+            vextracti64x2(xmm_dst_3, v_dst_0, 0x3);
+        }
+
+        // Lane 0 is folded with the constant at K_6_7_OFF.
+        vpclmulqdq(xmm_aux_0, xmm_dst_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000);
+        vpclmulqdq(xmm_dst_0, xmm_dst_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001);
+        uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
+        uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_0);
+
+        // Lane 1 is folded with the constant at K_4_5_OFF.
+        vpclmulqdq(xmm_aux_0, xmm_dst_1, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000);
+        vpclmulqdq(xmm_dst_1, xmm_dst_1, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001);
+        uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
+        uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_1);
+
+        // Lane 2 is folded with the already broadcast K_2_3 constant.
+        vpclmulqdq(xmm_aux_0, xmm_dst_2, xmm_k_2_3, 0b00000000);
+        vpclmulqdq(xmm_dst_2, xmm_dst_2, xmm_k_2_3, 0b00010001);
+        uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_aux_0);
+        uni_vpxorq(xmm_dst_3, xmm_dst_3, xmm_dst_2);
+    } else {
+        // Multi-thread kernels hand their partial result over through the buffer at r64_dst_ptr.
+        if (is_vpclmulqdq) {
+            uni_vmovdqu64(ptr[r64_dst_ptr], v_dst_0);
+        } else {
+            uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 0lu], xmm_dst_0);
+            uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 1lu], xmm_dst_1);
+            uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 2lu], xmm_dst_2);
+            uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 3lu], xmm_dst_3);
+        }
+    }
+
+    L(l_end);
+}
+
+/**
+ * @brief Generic variant of the bulk folding loop; the accumulator is treated as two 128-bit lanes.
+ *
+ * Code is generated only for the SINGLE_THREAD, FIRST_THREAD and N_THREAD kernel types. At run time the
+ * whole section is skipped when fewer than (2 * vlen - xmm_len) bytes remain in r64_work_amount.
+ * SINGLE_THREAD kernels finish by reducing both lanes into the low lane of v_dst; the multi-thread
+ * kernels store their lanes to the buffer addressed by r64_dst_ptr instead.
+ *
+ * @param v_dst Accumulator register (presumably prepared by initialize() - confirm in the generator entry
+ *              point); it is also reused as the second 128-bit lane of the accumulator.
+ */
+template <cpu_isa_t isa>
+void ComputeHash<isa>::bulk_fold(const Vmm& v_dst) {
+    if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FIRST_THREAD && m_jcp.type != N_THREAD) {
+        return;
+    }
+    Xbyak::Label l_fold_loop, l_end;
+    // Runtime guard: not enough data for a single bulk iteration -> skip the whole section.
+    // NOTE(review): on this path the multi-thread kernels do not write anything to r64_dst_ptr.
+    cmp(r64_work_amount, 2 * get_vlen() - xmm_len);
+    jl(l_end, T_NEAR);
+
+    auto v_src_0 = getVmm();
+    auto v_dst_0 = getVmm();
+    auto& v_dst_1 = v_dst;  // The caller's register doubles as the second lane.
+    auto v_aux_0 = getVmm();
+    auto v_k_loop = getVmm();
+
+    // 128-bit views of the registers above (same register indices).
+    auto xmm_src_0 = Xbyak::Xmm(v_src_0.getIdx());
+    auto xmm_src_1 = getXmm();
+    auto xmm_dst_0 = Xbyak::Xmm(v_dst_0.getIdx());
+    auto xmm_dst_1 = Xbyak::Xmm(v_dst_1.getIdx());
+    auto xmm_k_loop = Xbyak::Xmm(v_k_loop.getIdx());
+    auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx());
+    auto xmm_aux_0 = Xbyak::Xmm(v_aux_0.getIdx());
+
+    // In the multi-thread kernels the source pointer advances by threads_num * vlen per iteration.
+    RegistersPool::Reg<Xbyak::Reg64> r64_bulk_step;
+    if (m_jcp.type == FIRST_THREAD || m_jcp.type == N_THREAD) {
+        r64_bulk_step = getReg64();
+        mov(r64_bulk_step, ptr[r64_params + GET_OFF(threads_num)]);
+        sal(r64_bulk_step, static_cast<int>(std::log2(get_vlen())));  // * vlen
+    }
+
+    // The loop constant differs between the single-thread and the multi-thread kernels.
+    if (m_jcp.type == SINGLE_THREAD) {
+        uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_4_5_OFF]);
+    } else {
+        uni_vbroadcasti64x2(v_k_loop, ptr[r64_k_ptr + K_8_9_OFF]);
+    }
+
+    uni_vmovdqu64(v_dst_0, v_dst);
+
+    // When is_vpclmulqdq is false the multiplications are issued per 128-bit lane,
+    // so the upper lane is moved to its own xmm register up front.
+    if (!is_vpclmulqdq) {
+        vextracti128(xmm_dst_1, v_dst_0, 0x1);
+    }
+
+    // Move to the next chunk. In the single-thread kernel initialize() has already advanced the pointer
+    // by xmm_len, hence only (vlen - xmm_len) is added here.
+    if (m_jcp.type == SINGLE_THREAD) {
+        add(r64_src_ptr, get_vlen() - xmm_len);
+    } else {
+        add(r64_src_ptr, r64_bulk_step);
+    }
+    prefetcht2(ptr[r64_src_ptr + 4096]);
+    prefetcht1(ptr[r64_src_ptr + 1024]);
+    prefetcht0(ptr[r64_src_ptr + 64]);
+
+    // Pre-subtract so that the loop condition below reduces to a sign check of the counter.
+    sub(r64_work_amount, 2 * get_vlen() - xmm_len);
+
+    L(l_fold_loop);
+    {
+        // Load the next vlen bytes and apply the byte shuffle.
+        uni_vmovdqu64(v_src_0, ptr[r64_src_ptr]);
+        vpshufb(v_src_0, v_src_0, v_shuf_mask);
+
+        if (m_jcp.type == SINGLE_THREAD) {
+            add(r64_src_ptr, get_vlen());
+        } else {
+            add(r64_src_ptr, r64_bulk_step);
+        }
+        prefetcht2(ptr[r64_src_ptr + 4096]);
+        prefetcht1(ptr[r64_src_ptr + 1024]);
+        prefetcht0(ptr[r64_src_ptr + 64]);
+
+        // Fold step: dst = clmul(dst.lo, k.lo) ^ clmul(dst.hi, k.hi) ^ src, for every 128-bit lane.
+        if (is_vpclmulqdq) {
+            vpclmulqdq(v_aux_0, v_dst_0, v_k_loop, 0b00000000);
+            vpclmulqdq(v_dst_0, v_dst_0, v_k_loop, 0b00010001);
+            uni_vpxorq(v_aux_0, v_aux_0, v_src_0);
+            uni_vpxorq(v_dst_0, v_dst_0, v_aux_0);
+        } else {
+            // 0
+            vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_loop, 0b00000000);
+            vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_loop, 0b00010001);
+            uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_0);
+            uni_vpxorq(xmm_dst_0, xmm_dst_0, xmm_aux_0);
+            // 1
+            vextracti128(xmm_src_1, v_src_0, 0x1);
+            vpclmulqdq(xmm_aux_0, xmm_dst_1, xmm_k_loop, 0b00000000);
+            vpclmulqdq(xmm_dst_1, xmm_dst_1, xmm_k_loop, 0b00010001);
+            uni_vpxorq(xmm_aux_0, xmm_aux_0, xmm_src_1);
+            uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0);
+        }
+
+        sub(r64_work_amount, get_vlen());
+        jge(l_fold_loop, T_NEAR);
+    }
+    // The loop exits with the counter one step below zero; restore the real remainder.
+    add(r64_work_amount, get_vlen());
+
+    if (m_jcp.type == SINGLE_THREAD) {
+        // Reduce both lanes into xmm_dst_1, i.e. the low lane of the caller's v_dst.
+        if (is_vpclmulqdq) {
+            vextracti128(xmm_dst_1, v_dst_0, 0x1);
+        }
+        vpclmulqdq(xmm_aux_0, xmm_dst_0, xmm_k_2_3, 0b00000000);
+        vpclmulqdq(xmm_dst_0, xmm_dst_0, xmm_k_2_3, 0b00010001);
+        uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_aux_0);
+        uni_vpxorq(xmm_dst_1, xmm_dst_1, xmm_dst_0);
+    } else {
+        // Multi-thread kernels hand their partial result over through the buffer at r64_dst_ptr.
+        if (is_vpclmulqdq) {
+            uni_vmovdqu64(ptr[r64_dst_ptr], v_dst_0);
+        } else {
+            uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 0lu], xmm_dst_0);
+            uni_vmovdqu64(ptr[r64_dst_ptr + xmm_len * 1lu], xmm_dst_1);
+        }
+    }
+
+    L(l_end);
+}
+
+/**
+ * @brief avx512_core specialization: merges the partial results stored at intermediate_ptr into one
+ *        128-bit value. Code is generated for the FINAL_FOLD kernel type only.
+ *
+ * Eight 128-bit blocks are read from the intermediate buffer. Blocks 0..6 are each folded with their own
+ * constant (K_14_15 down to K_2_3) and xor-ed into block 7, which is kept in the low lane of v_dst.
+ *
+ * @param v_dst Register whose low 128 bits receive the merged value.
+ */
+template <>
+void ComputeHash<avx512_core>::join(const Vmm& v_dst) {
+    if (m_jcp.type != FINAL_FOLD) {
+        return;
+    }
+
+    mov(r64_aux, ptr[r64_params + GET_OFF(intermediate_ptr)]);
+    prefetcht0(ptr[r64_aux + 1024]);
+
+    auto xmm_src_0 = getXmm();
+    auto xmm_src_last = Xbyak::Xmm(v_dst.getIdx());
+    auto xmm_aux_0 = getXmm();
+    auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx());
+
+    // The last block is the base every other block is folded into.
+    uni_vmovdqu64(xmm_src_last, ptr[r64_aux + xmm_len * 7]);
+
+    // Block 0
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_14_15_OFF], 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_14_15_OFF], 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+
+    // Block 1
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_12_13_OFF], 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_12_13_OFF], 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+
+    // Block 2
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 2lu]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_10_11_OFF], 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_10_11_OFF], 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+
+    // Block 3
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 3lu]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_8_9_OFF], 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_8_9_OFF], 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+
+    // Block 4
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 4lu]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+
+    // Block 5
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 5lu]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+
+    // Block 6 uses the K_2_3 constant that is already held in a register.
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 6lu]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, xmm_k_2_3, 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, xmm_k_2_3, 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+}
+
+/**
+ * @brief Generic variant: merges the partial results stored at intermediate_ptr into one 128-bit value.
+ *        Code is generated for the FINAL_FOLD kernel type only.
+ *
+ * Four 128-bit blocks are read from the intermediate buffer. Blocks 0..2 are each folded with their own
+ * constant (K_6_7, K_4_5, K_2_3) and xor-ed into block 3, which is kept in the low lane of v_dst.
+ *
+ * @param v_dst Register whose low 128 bits receive the merged value.
+ */
+template <cpu_isa_t isa>
+void ComputeHash<isa>::join(const Vmm& v_dst) {
+    if (m_jcp.type != FINAL_FOLD) {
+        return;
+    }
+
+    mov(r64_aux, ptr[r64_params + GET_OFF(intermediate_ptr)]);
+    prefetcht0(ptr[r64_aux + 1024]);
+
+    auto xmm_src_0 = getXmm();
+    auto xmm_src_last = Xbyak::Xmm(v_dst.getIdx());
+    auto xmm_aux_0 = getXmm();
+    auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx());
+
+    // The last block is the base every other block is folded into.
+    uni_vmovdqu64(xmm_src_last, ptr[r64_aux + xmm_len * 3]);
+
+    // Block 0
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 0lu]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_6_7_OFF], 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+
+    // Block 1
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 1lu]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, ptr[r64_k_ptr + K_4_5_OFF], 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+
+    // Block 2 uses the K_2_3 constant that is already held in a register.
+    uni_vmovdqu64(xmm_src_0, ptr[r64_aux + xmm_len * 2lu]);
+    vpclmulqdq(xmm_aux_0, xmm_src_0, xmm_k_2_3, 0b00000000);
+    vpclmulqdq(xmm_src_0, xmm_src_0, xmm_k_2_3, 0b00010001);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_aux_0);
+    uni_vpxorq(xmm_src_last, xmm_src_last, xmm_src_0);
+}
+
+/**
+ * @brief Emits a loop that folds the remaining source data into the accumulator 16 bytes at a time.
+ *        Code is generated for the SINGLE_THREAD and FINAL_FOLD kernel types only.
+ *
+ * The loop runs while at least xmm_len bytes remain in r64_work_amount; it is skipped entirely when
+ * fewer bytes are left on entry.
+ *
+ * @param v_dst Register whose low 128 bits hold the running value and receive the result.
+ */
+template <cpu_isa_t isa>
+void ComputeHash<isa>::fold_to_128(const Vmm& v_dst) {
+    if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FINAL_FOLD) {
+        return;
+    }
+    Xbyak::Label l_fold_loop, l_end;
+    cmp(r64_work_amount, xmm_len);
+    jl(l_end, T_NEAR);
+
+    auto xmm_src = getXmm();
+    auto xmm_dst = Xbyak::Xmm(v_dst.getIdx());
+    auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx());
+    auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx());
+    auto xmm_aux = getXmm();
+
+    L(l_fold_loop);
+    {
+        // Next 16 bytes of the source, byte-shuffled like every other load.
+        uni_vmovdqu64(xmm_src, ptr[r64_src_ptr]);
+        vpshufb(xmm_src, xmm_src, xmm_shuf_mask);
+
+        // dst = clmul(dst.lo, k.lo) ^ clmul(dst.hi, k.hi) ^ src
+        vpclmulqdq(xmm_aux, xmm_dst, xmm_k_2_3, 0b00000000);
+        vpclmulqdq(xmm_dst, xmm_dst, xmm_k_2_3, 0b00010001);
+        uni_vpxorq(xmm_dst, xmm_dst, xmm_aux);
+        uni_vpxorq(xmm_dst, xmm_dst, xmm_src);
+
+        add(r64_src_ptr, xmm_len);
+        sub(r64_work_amount, xmm_len);
+        cmp(r64_work_amount, xmm_len);
+        jge(l_fold_loop, T_NEAR);
+    }
+
+    L(l_end);
+}
+
+/**
+ * @brief Emits the last stage: folds in the trailing bytes (if any), reduces the 128-bit accumulator and
+ *        stores the low 64 bits of the result to the address in r64_dst_ptr.
+ *        Code is generated for the SINGLE_THREAD and FINAL_FOLD kernel types only.
+ *
+ * NOTE(review): the K_2 / P_1 / P_2 sequence looks like the usual Barrett-style reduction used by
+ * PCLMULQDQ based CRC implementations - confirm against the definitions of these constants.
+ *
+ * @param v_dst Register whose low 128 bits hold the accumulated value on entry.
+ */
+template <cpu_isa_t isa>
+void ComputeHash<isa>::fold_to_64(const Vmm& v_dst) {
+    if (m_jcp.type != SINGLE_THREAD && m_jcp.type != FINAL_FOLD) {
+        return;
+    }
+    Xbyak::Label l_fold_to_64;
+    // Nothing left in the source -> go straight to the reduction.
+    cmp(r64_work_amount, 0);
+    jle(l_fold_to_64, T_NEAR);
+
+    auto xmm_src = getXmm();
+    auto xmm_dst = Xbyak::Xmm(v_dst.getIdx());
+    auto xmm_k_2_3 = Xbyak::Xmm(v_k_2_3.getIdx());
+    auto xmm_shuf_mask = Xbyak::Xmm(v_shuf_mask.getIdx());
+    auto xmm_aux = getXmm();
+    auto xmm_aux_1 = getXmm();
+    auto xmm_aux_2 = getXmm();
+
+    // Load the remaining r64_work_amount bytes and fold them in the same way as a full block.
+    partial_load(xmm_src, ptr[r64_src_ptr], r64_work_amount);
+    vpshufb(xmm_src, xmm_src, xmm_shuf_mask);
+
+    vpclmulqdq(xmm_aux, xmm_dst, xmm_k_2_3, 0b00000000);
+    vpclmulqdq(xmm_dst, xmm_dst, xmm_k_2_3, 0b00010001);
+    uni_vpxorq(xmm_aux, xmm_aux, xmm_src);
+    uni_vpxorq(xmm_dst, xmm_dst, xmm_aux);
+
+    L(l_fold_to_64);
+
+    // Multiply the high qword of the accumulator by K_2 (only the low qword of xmm_aux is consumed),
+    // then xor the product with the accumulator shifted left by 8 bytes.
+    mov(r64_aux, K_2);
+    vpinsrq(xmm_aux, xmm_aux, r64_aux, 0x0);
+    vpclmulqdq(xmm_aux, xmm_dst, xmm_aux, 0b00000001);
+    vpslldq(xmm_dst, xmm_dst, 0x8);
+    uni_vpxorq(xmm_dst, xmm_dst, xmm_aux);
+
+    // Multiply the high qword of the accumulator by P_1, xor the product with the accumulator whose
+    // low qword is cleared, and keep a copy of that sum (low qword cleared again) in xmm_aux_1.
+    mov(r64_aux, P_1);
+    vpinsrq(xmm_aux_2, xmm_aux_2, r64_aux, 0x0);
+    vpclmulqdq(xmm_aux, xmm_dst, xmm_aux_2, 0b00000001);
+    mov(r64_aux, 0x0);
+    vpinsrq(xmm_aux_1, xmm_dst, r64_aux, 0x0);
+    uni_vpxorq(xmm_aux, xmm_aux, xmm_aux_1);
+    vpinsrq(xmm_aux_1, xmm_aux, r64_aux, 0x0);
+
+    // Multiply the high qword of that sum by P_2 (placed in the high qword of xmm_aux_2) and merge
+    // everything back into the accumulator.
+    mov(r64_aux, P_2);
+    vpinsrq(xmm_aux_2, xmm_aux_2, r64_aux, 0x1);
+    vpclmulqdq(xmm_aux, xmm_aux, xmm_aux_2, 0b00010001);
+    uni_vpxorq(xmm_aux, xmm_aux, xmm_aux_1);
+    uni_vpxorq(xmm_dst, xmm_dst, xmm_aux);
+
+    // The low qword of the accumulator is the value handed back to the caller.
+    vpextrq(ptr[r64_dst_ptr], xmm_dst, 0x0);
+}
+
+}  // namespace jit
+#endif  // OV_CORE_USE_XBYAK_JIT
+
+/**
+ * @brief Computes a hash value for a raw memory buffer.
+ *
+ * When the JIT implementation is compiled in (OV_CORE_USE_XBYAK_JIT) and the CPU supports at least AVX2,
+ * the hash is produced by the generated kernels: buffers of at least 2 * 2^17 bytes are processed by two
+ * threads whose partial results are merged by a final-fold kernel, smaller buffers are handled by a single
+ * kernel call. Otherwise a portable hash_combine-style loop is used.
+ *
+ * @param src  A pointer to the input data. No particular alignment is required.
+ * @param size The length of the input data in bytes.
+ * @return The hash value of the buffer.
+ */
+size_t compute_hash(const void* src, size_t size) {
+#ifdef OV_CORE_USE_XBYAK_JIT
+    if (Generator::mayiuse(avx2)) {
+        uint64_t result = 0lu;
+
+        // Parallel section
+        constexpr size_t min_wa_per_thread = 131072lu;  // 2^17
+        if (size >= min_wa_per_thread * 2lu) {
+            // The kernels are generated once and shared by all subsequent calls.
+            static auto first_thr_kernel = Generator::mayiuse(avx512_core)
+                                               ? jit::ComputeHash<avx512_core>::create({jit::FIRST_THREAD})
+                                               : jit::ComputeHash<avx2>::create({jit::FIRST_THREAD});
+            static auto n_thr_kernel = Generator::mayiuse(avx512_core)
+                                           ? jit::ComputeHash<avx512_core>::create({jit::N_THREAD})
+                                           : jit::ComputeHash<avx2>::create({jit::N_THREAD});
+            static auto final_fold_kernel = Generator::mayiuse(avx512_core)
+                                                ? jit::ComputeHash<avx512_core>::create({jit::FINAL_FOLD})
+                                                : jit::ComputeHash<avx2>::create({jit::FINAL_FOLD});
+
+            static const size_t max_thr_num = 2lu;
+            size_t thr_num = std::min(size / min_wa_per_thread, max_thr_num);
+            // Every thread gets the same amount of work, rounded down to a whole number of vectors.
+            const uint64_t el_per_thread =
+                first_thr_kernel->get_vlen() * ((size / thr_num) / first_thr_kernel->get_vlen());
+            // One vector-sized slot per thread for the partial results.
+            std::vector<uint8_t> intermediate(thr_num * first_thr_kernel->get_vlen());
+
+            parallel_nt_static(thr_num, [&](const int ithr, const int nthr) {
+                uint64_t start = ithr * el_per_thread;
+                if (start >= size) {
+                    return;
+                }
+                uint64_t work_amount = (el_per_thread + start > size) ? size - start : el_per_thread;
+
+                jit::ComputeHashCallArgs args;
+
+                // The threads read interleaved vectors: thread i starts at vector i and the kernel then
+                // advances by threads_num vectors per iteration.
+                args.src_ptr = reinterpret_cast<const uint8_t*>(src) + first_thr_kernel->get_vlen() * ithr;
+                args.dst_ptr = &(intermediate[ithr * first_thr_kernel->get_vlen()]);
+                args.k_ptr = jit::K_PULL;
+                args.work_amount = work_amount;
+                args.size = size;
+                args.threads_num = thr_num;
+
+                if (ithr == 0) {
+                    (*first_thr_kernel)(&args);
+                } else {
+                    (*n_thr_kernel)(&args);
+                }
+            });
+
+            jit::ComputeHashCallArgs args;
+            // The tail is whatever the threads did not cover. Its size has to be known before the source
+            // pointer is derived from it, otherwise the final kernel would not start at the tail and
+            // could read past the end of the buffer.
+            args.work_amount = size - el_per_thread * thr_num;
+            args.src_ptr = reinterpret_cast<const uint8_t*>(src) + size - args.work_amount;
+            args.dst_ptr = &result;
+            args.k_ptr = jit::K_PULL;
+            args.size = size;
+            args.intermediate_ptr = intermediate.data();
+
+            (*final_fold_kernel)(&args);
+        } else {
+            static auto single_thr_kernel = Generator::mayiuse(avx512_core)
+                                                ? jit::ComputeHash<avx512_core>::create({jit::SINGLE_THREAD})
+                                                : jit::ComputeHash<avx2>::create({jit::SINGLE_THREAD});
+
+            jit::ComputeHashCallArgs args;
+            args.src_ptr = src;
+            args.dst_ptr = &result;
+            args.k_ptr = jit::K_PULL;
+            args.work_amount = size;
+            args.size = size;
+
+            (*single_thr_kernel)(&args);
+        }
+
+        return result;
+    }
+
+#endif  // OV_CORE_USE_XBYAK_JIT
+
+    // Portable fallback. The buffer is consumed in size_t-sized cells; every cell is copied out with
+    // memcpy because src is not guaranteed to be aligned for a direct size_t load.
+    constexpr auto cel_size = sizeof(size_t);
+    size_t seed = size;
+    const auto bytes = static_cast<const uint8_t*>(src);
+    const size_t full_cells = size / cel_size;
+    // The constant value used as a magic number has been
+    // traditionally used e.g. in boost library's hash_combine.
+    // It happens to be derived from the golden ratio.
+    for (size_t i = 0; i < full_cells; ++i) {
+        size_t cell{0};
+        std::memcpy(&cell, bytes + i * cel_size, cel_size);
+        seed ^= cell + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    }
+    // The trailing bytes are zero-extended to a full cell. The final mixing step is applied even when
+    // there is no tail, so the produced values match the previous implementation.
+    size_t last_bytes{0};
+    const size_t tail_size = size % cel_size;
+    if (tail_size != 0) {
+        std::memcpy(&last_bytes, bytes + full_cells * cel_size, tail_size);
+    }
+    seed ^= last_bytes + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+
+    return seed;
+}
+
+}  // namespace runtime
+}  // namespace ov