diff --git a/src/plugins/intel_cpu/src/nodes/unique.cpp b/src/plugins/intel_cpu/src/nodes/unique.cpp index 3687888fd9ff04..9ee7f7ae6058a4 100644 --- a/src/plugins/intel_cpu/src/nodes/unique.cpp +++ b/src/plugins/intel_cpu/src/nodes/unique.cpp @@ -2,11 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include - #include "unique.hpp" -#include + +#include "ie_parallel.hpp" +#include #include using namespace InferenceEngine; @@ -21,7 +20,7 @@ bool Unique::isSupportedOperation(const std::shared_ptr& op, std errorMessage = "Not supported Unique operation version. CPU plug-in supports only 10th version."; return false; } - if (op->get_input_size() > AXIS && !ov::is_type(op->get_input_node_ptr(AXIS))) { + if (op->get_input_size() > AXIS && !ov::is_type(op->get_input_node_ptr(AXIS))) { errorMessage = "CPU plug-in supports only constant Axis input."; return false; } @@ -46,15 +45,15 @@ Unique::Unique(const std::shared_ptr& op, const GraphContext::CPtr con definedOutputs[i] = !op->get_output_target_inputs(i).empty(); } - sorted = ov::as_type_ptr(op)->get_sorted(); + sorted = ov::as_type_ptr(op)->get_sorted(); if (op->get_input_size() > AXIS) { flattened = false; - axis = ov::as_type(op->get_input_node_ptr(AXIS))->cast_vector()[0]; + axis = ov::as_type(op->get_input_node_ptr(AXIS))->cast_vector()[0]; if (axis < 0) { axis += op->get_input_partial_shape(IN_DATA).rank().get_length(); } if (axis < 0 || axis >= op->get_input_partial_shape(IN_DATA).rank().get_length()) { - THROW_ERROR << "has invalid axis value: " << ov::as_type(op->get_input_node_ptr(AXIS))->cast_vector()[0]; + THROW_ERROR << "has invalid axis value: " << ov::as_type(op->get_input_node_ptr(AXIS))->cast_vector()[0]; } } else { flattened = true; @@ -281,10 +280,9 @@ void Unique::flattenTensorExec() { template void Unique::slicedTensorExec() { - const T* srcDataPtr = reinterpret_cast(getParentEdgeAt(IN_DATA)->getMemoryPtr()->GetPtr()); - const size_t inputLen = getParentEdgeAt(IN_DATA)->getMemoryPtr()->GetSize() / sizeof(T); - std::vector uniDataTmp(inputLen); - auto uniDataTmpPtr = uniDataTmp.data(); + auto inDataMemPtr = getParentEdgeAt(IN_DATA)->getMemoryPtr(); + auto srcDataPtr = reinterpret_cast(inDataMemPtr->GetPtr()); + const size_t inputLen = inDataMemPtr->GetSize() / sizeof(T); int *firstTmpPtr = nullptr, *inToOutTmpPtr = nullptr, *occurTmpPtr = nullptr; if (definedOutputs[FIRST_UNIQUE_IDX]) { firstTmpPtr = firstUniTmp.data(); @@ -296,19 +294,19 @@ void Unique::slicedTensorExec() { occurTmpPtr = occurTmp.data(); } - const auto& srcDataShape = getParentEdgeAt(IN_DATA)->getMemoryPtr()->getStaticDims(); + const auto& srcDataShape = inDataMemPtr->getStaticDims(); - const auto cmpBlNum = srcDataShape[axis]; // Blocks to compare. - int64_t partsInBl = 1; // Parts in block + const auto axisDim = srcDataShape[axis]; + int64_t outerLen = 1lu; if (axis > 0) { - partsInBl = std::accumulate(srcDataShape.begin(), srcDataShape.begin() + axis, 1, std::multiplies()); + outerLen = std::accumulate(srcDataShape.begin(), srcDataShape.begin() + axis, 1, std::multiplies()); } - int64_t elPerPart = 1; // Elements number in part. + int64_t innerLen = 1; if (static_cast(axis) < srcDataShape.size() - 1) { - elPerPart = std::accumulate(srcDataShape.begin() + axis + 1, srcDataShape.end(), 1, std::multiplies()); + innerLen = std::accumulate(srcDataShape.begin() + axis + 1, srcDataShape.end(), 1, std::multiplies()); } - const auto partLenB = elPerPart * dataPrecision.size(); - const auto partStep = elPerPart * cmpBlNum; + const auto innerSizeB = innerLen * sizeof(T); + const auto srcOuterStep = innerLen * axisDim; if (definedOutputs[FIRST_UNIQUE_IDX]) { firstTmpPtr[0] = 0; @@ -318,28 +316,29 @@ void Unique::slicedTensorExec() { } if (definedOutputs[OCCURRENCES_NUM]) { occurTmpPtr[0] = 1; - std::fill(occurTmpPtr, occurTmpPtr + cmpBlNum, 1); + std::fill(occurTmpPtr, occurTmpPtr + axisDim, 1); } - uniqueLen = 1; - std::vector uniqIdx(cmpBlNum, 0); - for (size_t b1 = 1; b1 < cmpBlNum; b1++) { - auto first1 = srcDataPtr + b1 * elPerPart; - auto last1 = srcDataPtr + (b1 + 1) * elPerPart; + uniqueLen = 1lu; + std::vector uniqIdx(axisDim, 0lu); + // Search for unique slices. + for (size_t a = 1lu; a < axisDim; a++) { + auto first1 = srcDataPtr + a * innerLen; + auto last1 = srcDataPtr + (a + 1lu) * innerLen; bool equal = true; - size_t b2 = 0; + size_t uIdx = 0lu; // Compare with unique blocks. - for (; b2 < uniqueLen; b2++) { - auto first2 = srcDataPtr + uniqIdx[b2] * elPerPart; + for (; uIdx < uniqueLen; uIdx++) { + auto first2 = srcDataPtr + uniqIdx[uIdx] * innerLen; equal = true; - for (int p = 0; p < partsInBl; p++) { + for (size_t o = 0lu; o < outerLen; o++) { equal = std::equal(first1, last1, first2); if (!equal) { break; } - first1 += partStep; - last1 += partStep; - first2 += partStep; + first1 += srcOuterStep; + last1 += srcOuterStep; + first2 += srcOuterStep; } if (equal) { break; @@ -347,149 +346,142 @@ void Unique::slicedTensorExec() { } if (!equal) { if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstTmpPtr[uniqueLen] = b1; + firstTmpPtr[uniqueLen] = a; } - uniqIdx[uniqueLen++] = b1; + uniqIdx[uniqueLen++] = a; } else { if (definedOutputs[OCCURRENCES_NUM]) { - occurTmpPtr[b2]++; + occurTmpPtr[uIdx]++; } } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - inToOutTmpPtr[b1] = b2; + inToOutTmpPtr[a] = uIdx; } } - const auto dstPrtStep = elPerPart * uniqueLen; - for (size_t b1 = 0; b1 < uniqueLen; b1++) { - auto first1 = srcDataPtr + uniqIdx[b1] * elPerPart; - auto first2 = uniDataTmpPtr + b1 * elPerPart; - for (int p = 0; p < partsInBl; p++) { - memcpy(first2, first1, partLenB); - first1 += partStep; - first2 += dstPrtStep; - } + // Redefinition of output shapes. + auto dstDataShape = srcDataShape; + dstDataShape[axis] = uniqueLen; + redefineOutputMemory({ dstDataShape, {uniqueLen}, {axisDim}, {uniqueLen}}); + + int *firstPtr = nullptr, *inToOutPtr = nullptr, *occurNPtr = nullptr; + if (definedOutputs[FIRST_UNIQUE_IDX]) { + firstPtr = reinterpret_cast(getChildEdgesAtPort(FIRST_UNIQUE_IDX)[0]->getMemoryPtr()->GetPtr()); + } + if (definedOutputs[INPUT_TO_UNIQ_IDX]) { + inToOutPtr = reinterpret_cast(getChildEdgesAtPort(INPUT_TO_UNIQ_IDX)[0]->getMemoryPtr()->GetPtr()); + } + if (definedOutputs[OCCURRENCES_NUM]) { + occurNPtr = reinterpret_cast(getChildEdgesAtPort(OCCURRENCES_NUM)[0]->getMemoryPtr()->GetPtr()); + } + + T* dstDataPtr = reinterpret_cast(getChildEdgesAtPort(UNIQUE_DATA)[0]->getMemoryPtr()->GetPtr()); + const auto dstOuterStep = innerLen * uniqueLen; + // Filling of the first output if needed. + if (sorted || definedOutputs[UNIQUE_DATA]) { + parallel_for(uniqueLen, [&](size_t u) { + auto first1 = srcDataPtr + uniqIdx[u] * innerLen; + auto first2 = dstDataPtr + u * innerLen; + for (size_t p = 0lu; p < outerLen; p++) { + memcpy(first2, first1, innerSizeB); + first1 += srcOuterStep; + first2 += dstOuterStep; + } + }); } + const auto uniqueLenIB = uniqueLen * sizeof(T); + if (sorted) { - const auto elInBl = elPerPart * partsInBl; + const auto dstUniDataLen = dstOuterStep * outerLen; + std::vector vDstBuff(dstUniDataLen); + auto dstBuff = vDstBuff.data(); + + const auto elInBl = innerLen * outerLen; struct OrdEl { T val; int64_t idx; }; std::vector colToSort(uniqueLen); - std::vector moveTo(uniqueLen); - for (size_t k = 0; k < uniqueLen; k++) { - moveTo[k] = k; - } - std::vector buff1(elPerPart); - std::vector buff2(elPerPart); - for (int64_t p = partsInBl - 1; p >= 0; p--) { - for (int64_t e = elPerPart - 1; e >= 0 ; e--) { - int64_t pos1 = p * dstPrtStep + e; - for (int64_t i = 0; i < static_cast(uniqueLen); i++) { - int64_t pos2 = i * elInBl + pos1; - colToSort[i] = {uniDataTmpPtr[pos2], i}; + T *dst1 = dstDataPtr, *dst2 = dstBuff; + int *first1 = firstPtr, *first2 = firstTmpPtr; + int *occurN1 = occurNPtr, *occurN2 = occurTmpPtr; + int *inToOut1 = inToOutPtr, *inToOut2 = inToOutTmpPtr; + + const bool defined3outputs = definedOutputs[FIRST_UNIQUE_IDX] || definedOutputs[OCCURRENCES_NUM] || definedOutputs[INPUT_TO_UNIQ_IDX]; + + for (int64_t o = outerLen - 1; o >= 0; o--) { // Backward loop through the outer block. + const int64_t pos1Lim = o * dstOuterStep; + int64_t pos1 = pos1Lim + innerLen - 1; + for (; pos1 >= pos1Lim ; pos1--) { // Backward loop through the inner block. + int64_t pos2 = pos1; + for (int64_t k = 0; k < static_cast(uniqueLen); k++, pos2 += innerLen) { + colToSort[k] = { dst1[pos2], k }; } std::stable_sort(colToSort.begin(), colToSort.end(), [](const OrdEl &el1, const OrdEl &el2) { return el1.val < el2.val; }); - for (size_t k = 0; k < uniqueLen; k++) { - moveTo[colToSort[k].idx] = k; - } - // perm - for (int64_t pb = 0; pb < partsInBl; pb++) { - auto currDst = uniDataTmpPtr + pb * dstPrtStep; - memcpy(buff1.data(), currDst, partLenB); - auto dstIdx = moveTo[0]; - for (size_t b = 0; b < uniqueLen; b++) { - if (dstIdx == moveTo[dstIdx]) { - dstIdx = moveTo[dstIdx + 1]; - continue; - } - T* dst = currDst + dstIdx * elPerPart; + // Permutation + parallel_for2d(outerLen, uniqueLen, [&](int64_t ot, size_t u) { + auto src = dst1 + ot * dstOuterStep + colToSort[u].idx * innerLen; + auto dst = dst2 + ot * dstOuterStep + u * innerLen; - auto& bSrc = b % 2 == 0 ? buff1 : buff2; - auto& bDst = b % 2 == 0 ? buff2 : buff1; - memcpy(bDst.data(), dst, partLenB); - memcpy(dst, bSrc.data(), partLenB); + memcpy(dst, src, innerSizeB); + }); - dstIdx = moveTo[dstIdx]; - } + if (defined3outputs) { + parallel_for(uniqueLen, [&](size_t u) { + if (definedOutputs[FIRST_UNIQUE_IDX]) { + first1[u] = first2[colToSort[u].idx]; + } + if (definedOutputs[OCCURRENCES_NUM]) { + occurN1[u] = occurN2[colToSort[u].idx]; + } + if (definedOutputs[INPUT_TO_UNIQ_IDX]) { + for (size_t ax = 0; ax < axisDim; ax++) { + if (inToOut2[ax] == colToSort[u].idx) { + inToOut1[ax] = u; + } + } + } + }); } - auto mPos = moveTo[0]; - int32_t firstSrc = 0, firstDst = 0, ocSrc = 0, ocDst = 0; + std::swap(dst1, dst2); if (definedOutputs[FIRST_UNIQUE_IDX]) { - firstSrc = firstTmpPtr[0]; + std::swap(first1, first2); } if (definedOutputs[OCCURRENCES_NUM]) { - ocSrc = occurTmpPtr[0]; + std::swap(occurN1, occurN2); } - for (size_t k = 0; k < uniqueLen; k++) { - if (mPos == moveTo[mPos]) { - mPos = moveTo[mPos + 1]; - continue; - } - - if (definedOutputs[FIRST_UNIQUE_IDX]) { - auto& fSrc = k % 2 == 0 ? firstSrc : firstDst; - auto& fDst = k % 2 == 0 ? firstDst : firstSrc; - fDst = firstTmpPtr[mPos]; - firstTmpPtr[mPos] = fSrc; - } - if (definedOutputs[OCCURRENCES_NUM]) { - auto& oSrc = k % 2 == 0 ? ocSrc : ocDst; - auto& oDst = k % 2 == 0 ? ocDst : ocSrc; - oDst = occurTmpPtr[mPos]; - occurTmpPtr[mPos] = oSrc; - } - - mPos = moveTo[mPos]; + if (definedOutputs[INPUT_TO_UNIQ_IDX]) { + std::swap(inToOut1, inToOut2); } } } + if (definedOutputs[UNIQUE_DATA] && dst1 != dstDataPtr) { + memcpy(dstDataPtr, dst1, dstUniDataLen * sizeof(T)); + } + if (definedOutputs[FIRST_UNIQUE_IDX] && first2 != firstPtr) { + memcpy(firstPtr, first2, uniqueLenIB); + } + if (definedOutputs[INPUT_TO_UNIQ_IDX] && inToOut2 != inToOutPtr) { + memcpy(inToOutPtr, inToOut2, axisDim * sizeof(int)); + } + if (definedOutputs[OCCURRENCES_NUM] && occurN2 != occurNPtr) { + memcpy(occurNPtr, occurN2, uniqueLenIB); + } + } else { + if (definedOutputs[FIRST_UNIQUE_IDX]) { + memcpy(firstPtr, firstUniTmp.data(), uniqueLenIB); + } if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - for (size_t b1 = 0; b1 < cmpBlNum; b1++) { - auto first1 = srcDataPtr + b1 * elPerPart; - auto last1 = srcDataPtr + (b1 + 1) * elPerPart; - bool equal = true; - for (size_t b2 = 0; b2 < uniqueLen; b2++) { - auto first2 = uniDataTmpPtr + b2 * elPerPart; - equal = true; - for (int p = 0; p < partsInBl; p++) { - equal = std::equal(first1, last1, first2); - if (!equal) { - break; - } - first2 += dstPrtStep; - } - if (equal) { - inToOutTmpPtr[b1] = b2; - } - } - } + memcpy(inToOutPtr, inToOutTmp.data(), axisDim * sizeof(int)); + } + if (definedOutputs[OCCURRENCES_NUM]) { + memcpy(occurNPtr, occurTmp.data(), uniqueLenIB); } - } - - auto dstDataShape = srcDataShape; - dstDataShape[axis] = uniqueLen; - redefineOutputMemory({ dstDataShape, {uniqueLen}, {cmpBlNum}, {uniqueLen}}); - - T* uniDataPtr = reinterpret_cast(getChildEdgesAtPort(UNIQUE_DATA)[0]->getMemoryPtr()->GetPtr()); - memcpy(uniDataPtr, uniDataTmpPtr, getChildEdgesAtPort(UNIQUE_DATA)[0]->getMemoryPtr()->GetSize()); - if (definedOutputs[FIRST_UNIQUE_IDX]) { - int *firstPtr = reinterpret_cast(getChildEdgesAtPort(FIRST_UNIQUE_IDX)[0]->getMemoryPtr()->GetPtr()); - memcpy(firstPtr, firstUniTmp.data(), uniqueLen * sizeof(int)); - } - if (definedOutputs[INPUT_TO_UNIQ_IDX]) { - auto inToOutPtr = reinterpret_cast(getChildEdgesAtPort(INPUT_TO_UNIQ_IDX)[0]->getMemoryPtr()->GetPtr()); - memcpy(inToOutPtr, inToOutTmp.data(), cmpBlNum * sizeof(int)); - } - if (definedOutputs[OCCURRENCES_NUM]) { - auto occurPtr = reinterpret_cast(getChildEdgesAtPort(OCCURRENCES_NUM)[0]->getMemoryPtr()->GetPtr()); - memcpy(occurPtr, occurTmp.data(), uniqueLen * sizeof(int)); } } diff --git a/src/plugins/intel_cpu/src/nodes/unique.hpp b/src/plugins/intel_cpu/src/nodes/unique.hpp index f76fe004c671c6..65b8636abe3d01 100644 --- a/src/plugins/intel_cpu/src/nodes/unique.hpp +++ b/src/plugins/intel_cpu/src/nodes/unique.hpp @@ -6,10 +6,6 @@ #include -#include -#include -#include - namespace ov { namespace intel_cpu { namespace node { @@ -50,8 +46,8 @@ class Unique : public Node { int axis = 0; bool definedOutputs[4] = { false, false, false, false }; InferenceEngine::Precision dataPrecision; - int64_t dataTypeSize = 1; - size_t uniqueLen = 1; + int64_t dataTypeSize = 1l; + size_t uniqueLen = 1lu; static constexpr size_t IN_DATA = 0; static constexpr size_t AXIS = 1; diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 363cb45861b13a..cdeee29aa7f03f 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -167,8 +167,6 @@ std::vector disabledTestPatterns() { // The kernel does not have such garbage. The diff 0.000000745 is taken into account in calculations and affects further type conversion. // Reorder->GridSample->Reorder also does not work here. Potential fix is to use nearest conversion instead of truncation. R"(.*GridSampleLayerTestCPU.*(BILINEAR|BICUBIC).*(i32|i8).*)", - // 98151. Not valid sorting for slices in reference. - R"(.*UniqueLayerTestCPU.*axis.*True.*)", // AUTO does not support import / export R"(.*smoke_Auto_BehaviorTests/OVCompiledGraphImportExportTest.*(mportExport|readFromV10IR).*/targetDevice=(AUTO).*)", // AdaptiveAvgPool is converted into Reduce op for suitable parameters. CPU Reduce impl doesn't support non planar layout for 3D case diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/unique.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/unique.cpp index 1cb772440eea0a..acfac9a31278a6 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/unique.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/unique.cpp @@ -160,6 +160,26 @@ std::vector getCPUInfo() { return resCPUParams; } +std::vector> statShapes1D = { + {{{}, {{1}}}}, // Static shapes + {{{}, {{5}}}}, // Static shapes + {{{}, {{8}}}}, // Static shapes + {{{}, {{16}}}}, // Static shapes + {{{}, {{32}}}}, // Static shapes + {{{}, {{64}}}}, // Static shapes + {{{}, {{99}}}}, // Static shapes +}; + +INSTANTIATE_TEST_SUITE_P(smoke_static_1D, UniqueLayerTestCPU, + ::testing::Combine( + ::testing::ValuesIn(statShapes1D), + ::testing::ValuesIn(std::vector>{{true, 0}, {false, 0}}), + ::testing::ValuesIn(sorted), + ::testing::ValuesIn(dataPrecisionSmoke), + ::testing::ValuesIn(getCPUInfo()), + ::testing::Values(additionalConfig[0])), + UniqueLayerTestCPU::getTestCaseName); + std::vector> getStaticShapes() { std::vector> result = { { { {}, { {1, 1, 1} } } }, // Static shapes