diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h b/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h index 706db9038c458a..2bd2e1335f9eea 100755 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h @@ -6,6 +6,8 @@ #include #include "ie_api.h" +#include +#include namespace ov { namespace intel_cpu { @@ -51,5 +53,20 @@ inline int cpu_memcpy_s(void* dst, size_t dst_size, const void* src, size_t coun return 0; } +inline void cpu_parallel_memcpy(void* dst, const void* src, size_t count) { + const size_t l2_cache_size = dnnl::utils::get_cache_size(2, true); + if (count >= l2_cache_size) { + auto src_int8 = static_cast(src); + auto dst_int8 = static_cast(dst); + parallel_nt(0, [&](const size_t ithr, const size_t nthr) { + size_t start = 0, end = 0; + splitter(count, nthr, ithr, start, end); + cpu_memcpy(dst_int8 + start, src_int8 + start, end - start); + }); + } else { + cpu_memcpy(dst, src, count); + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/unique.cpp b/src/plugins/intel_cpu/src/nodes/unique.cpp index 4ed2a0b9c4384e..1c0ff95b21c51a 100644 --- a/src/plugins/intel_cpu/src/nodes/unique.cpp +++ b/src/plugins/intel_cpu/src/nodes/unique.cpp @@ -6,6 +6,7 @@ #include "ie_parallel.hpp" #include +#include "common/cpu_memcpy.h" #include using namespace InferenceEngine; @@ -180,7 +181,7 @@ void Unique::flattenTensorExec() { uniqueLen = inputLen; if (sorted) { - std::memcpy(uniDataTmpPtr, srcDataPtr, inputLen * sizeof(T)); + cpu_parallel_memcpy(uniDataTmpPtr, srcDataPtr, inputLen * sizeof(T)); std::sort(uniDataTmpPtr, uniDataTmpPtr + inputLen); auto last = std::unique(uniDataTmpPtr, uniDataTmpPtr + inputLen); uniqueLen = last - uniDataTmpPtr; @@ -263,18 +264,18 @@ void Unique::flattenTensorExec() { redefineOutputMemory({ {uniqueLen}, {uniqueLen}, {inputLen}, {uniqueLen}}); T* uniDataPtr = reinterpret_cast(getChildEdgesAtPort(UNIQUE_DATA)[0]->getMemoryPtr()->GetPtr()); - memcpy(uniDataPtr, uniDataTmpPtr, uniqueLen * sizeof(T)); + cpu_parallel_memcpy(uniDataPtr, uniDataTmpPtr, uniqueLen * sizeof(T)); if (definedOutputs[FIRST_UNIQUE_IDX]) { int *firstPtr = reinterpret_cast(getChildEdgesAtPort(FIRST_UNIQUE_IDX)[0]->getMemoryPtr()->GetPtr()); - memcpy(firstPtr, firstUniTmp.data(), uniqueLen * sizeof(int)); + cpu_parallel_memcpy(firstPtr, firstUniTmp.data(), uniqueLen * sizeof(int)); } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { auto inToOutPtr = reinterpret_cast(getChildEdgesAtPort(INPUT_TO_UNIQ_IDX)[0]->getMemoryPtr()->GetPtr()); - memcpy(inToOutPtr, inToOutTmp.data(), inputLen * sizeof(int)); + cpu_parallel_memcpy(inToOutPtr, inToOutTmp.data(), inputLen * sizeof(int)); } if (definedOutputs[OCCURRENCES_NUM]) { auto occurPtr = reinterpret_cast(getChildEdgesAtPort(OCCURRENCES_NUM)[0]->getMemoryPtr()->GetPtr()); - memcpy(occurPtr, occurTmp.data(), uniqueLen * sizeof(int)); + cpu_parallel_memcpy(occurPtr, occurTmp.data(), uniqueLen * sizeof(int)); } } @@ -383,7 +384,7 @@ void Unique::slicedTensorExec() { auto first1 = srcDataPtr + uniqIdx[u] * innerLen; auto first2 = dstDataPtr + u * innerLen; for (int64_t p = 0lu; p < outerLen; p++) { - memcpy(first2, first1, innerSizeB); + cpu_memcpy(first2, first1, innerSizeB); first1 += srcOuterStep; first2 += dstOuterStep; } @@ -425,7 +426,7 @@ void Unique::slicedTensorExec() { auto src = dst1 + ot * dstOuterStep + colToSort[u].idx * innerLen; auto dst = dst2 + ot * dstOuterStep + u * innerLen; - memcpy(dst, src, innerSizeB); + cpu_memcpy(dst, src, innerSizeB); }); if (defined3outputs) { @@ -460,26 +461,26 @@ void Unique::slicedTensorExec() { } if (definedOutputs[UNIQUE_DATA] && dst1 != dstDataPtr) { - memcpy(dstDataPtr, dst1, dstUniDataLen * sizeof(T)); + cpu_parallel_memcpy(dstDataPtr, dst1, dstUniDataLen * sizeof(T)); } if (definedOutputs[FIRST_UNIQUE_IDX] && first2 != firstPtr) { - memcpy(firstPtr, first2, uniqueLenIB); + cpu_parallel_memcpy(firstPtr, first2, uniqueLenIB); } if (definedOutputs[INPUT_TO_UNIQ_IDX] && inToOut2 != inToOutPtr) { - memcpy(inToOutPtr, inToOut2, axisDim * sizeof(int)); + cpu_parallel_memcpy(inToOutPtr, inToOut2, axisDim * sizeof(int)); } if (definedOutputs[OCCURRENCES_NUM] && occurN2 != occurNPtr) { - memcpy(occurNPtr, occurN2, uniqueLenIB); + cpu_parallel_memcpy(occurNPtr, occurN2, uniqueLenIB); } } else { if (definedOutputs[FIRST_UNIQUE_IDX]) { - memcpy(firstPtr, firstUniTmp.data(), uniqueLenIB); + cpu_parallel_memcpy(firstPtr, firstUniTmp.data(), uniqueLenIB); } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - memcpy(inToOutPtr, inToOutTmp.data(), axisDim * sizeof(int)); + cpu_parallel_memcpy(inToOutPtr, inToOutTmp.data(), axisDim * sizeof(int)); } if (definedOutputs[OCCURRENCES_NUM]) { - memcpy(occurNPtr, occurTmp.data(), uniqueLenIB); + cpu_parallel_memcpy(occurNPtr, occurTmp.data(), uniqueLenIB); } } }