Skip to content

Commit

Permalink
[CPU] ONNX Unique tests failing on CPU
Browse files (browse the repository at this point in the history)
  • Loading branch information
nshchego committed Jun 16, 2023
1 parent f5dc8e7 commit 77d14ef
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 149 deletions.
274 changes: 133 additions & 141 deletions src/plugins/intel_cpu/src/nodes/unique.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,11 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//

#include <string>
#include <vector>

#include "unique.hpp"
#include <ngraph/opsets/opset1.hpp>

#include "ie_parallel.hpp"
#include <openvino/op/unique.hpp>
#include <utils/shape_inference/shape_inference_internal_dyn.hpp>

using namespace InferenceEngine;
Expand All @@ -21,7 +20,7 @@ bool Unique::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std
errorMessage = "Not supported Unique operation version. CPU plug-in supports only 10th version.";
return false;
}
if (op->get_input_size() > AXIS && !ov::is_type<ov::op::v0::Constant>(op->get_input_node_ptr(AXIS))) {
if (op->get_input_size() > AXIS && !ov::is_type<op::v0::Constant>(op->get_input_node_ptr(AXIS))) {
errorMessage = "CPU plug-in supports only constant Axis input.";
return false;
}
Expand All @@ -46,15 +45,15 @@ Unique::Unique(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr con
definedOutputs[i] = !op->get_output_target_inputs(i).empty();
}

sorted = ov::as_type_ptr<ov::op::v10::Unique>(op)->get_sorted();
sorted = ov::as_type_ptr<op::v10::Unique>(op)->get_sorted();
if (op->get_input_size() > AXIS) {
flattened = false;
axis = ov::as_type<ov::op::v0::Constant>(op->get_input_node_ptr(AXIS))->cast_vector<int>()[0];
axis = ov::as_type<op::v0::Constant>(op->get_input_node_ptr(AXIS))->cast_vector<int>()[0];
if (axis < 0) {
axis += op->get_input_partial_shape(IN_DATA).rank().get_length();
}
if (axis < 0 || axis >= op->get_input_partial_shape(IN_DATA).rank().get_length()) {
THROW_ERROR << "has invalid axis value: " << ov::as_type<ov::op::v0::Constant>(op->get_input_node_ptr(AXIS))->cast_vector<int>()[0];
THROW_ERROR << "has invalid axis value: " << ov::as_type<op::v0::Constant>(op->get_input_node_ptr(AXIS))->cast_vector<int>()[0];
}
} else {
flattened = true;
Expand Down Expand Up @@ -281,10 +280,9 @@ void Unique::flattenTensorExec() {

template <typename T>
void Unique::slicedTensorExec() {
const T* srcDataPtr = reinterpret_cast<const T*>(getParentEdgeAt(IN_DATA)->getMemoryPtr()->GetPtr());
const size_t inputLen = getParentEdgeAt(IN_DATA)->getMemoryPtr()->GetSize() / sizeof(T);
std::vector<T> uniDataTmp(inputLen);
auto uniDataTmpPtr = uniDataTmp.data();
auto inDataMemPtr = getParentEdgeAt(IN_DATA)->getMemoryPtr();
auto srcDataPtr = reinterpret_cast<const T*>(inDataMemPtr->GetPtr());
const size_t inputLen = inDataMemPtr->GetSize() / sizeof(T);
int *firstTmpPtr = nullptr, *inToOutTmpPtr = nullptr, *occurTmpPtr = nullptr;
if (definedOutputs[FIRST_UNIQUE_IDX]) {
firstTmpPtr = firstUniTmp.data();
Expand All @@ -296,19 +294,19 @@ void Unique::slicedTensorExec() {
occurTmpPtr = occurTmp.data();
}

const auto& srcDataShape = getParentEdgeAt(IN_DATA)->getMemoryPtr()->getStaticDims();
const auto& srcDataShape = inDataMemPtr->getStaticDims();

const auto cmpBlNum = srcDataShape[axis]; // Blocks to compare.
int64_t partsInBl = 1; // Parts in block
const auto axisDim = srcDataShape[axis];
int64_t outerLen = 1lu;
if (axis > 0) {
partsInBl = std::accumulate(srcDataShape.begin(), srcDataShape.begin() + axis, 1, std::multiplies<Dim>());
outerLen = std::accumulate(srcDataShape.begin(), srcDataShape.begin() + axis, 1, std::multiplies<Dim>());
}
int64_t elPerPart = 1; // Elements number in part.
int64_t innerLen = 1;
if (static_cast<size_t>(axis) < srcDataShape.size() - 1) {
elPerPart = std::accumulate(srcDataShape.begin() + axis + 1, srcDataShape.end(), 1, std::multiplies<Dim>());
innerLen = std::accumulate(srcDataShape.begin() + axis + 1, srcDataShape.end(), 1, std::multiplies<Dim>());
}
const auto partLenB = elPerPart * dataPrecision.size();
const auto partStep = elPerPart * cmpBlNum;
const auto innerSizeB = innerLen * sizeof(T);
const auto srcOuterStep = innerLen * axisDim;

if (definedOutputs[FIRST_UNIQUE_IDX]) {
firstTmpPtr[0] = 0;
Expand All @@ -318,178 +316,172 @@ void Unique::slicedTensorExec() {
}
if (definedOutputs[OCCURRENCES_NUM]) {
occurTmpPtr[0] = 1;
std::fill(occurTmpPtr, occurTmpPtr + cmpBlNum, 1);
std::fill(occurTmpPtr, occurTmpPtr + axisDim, 1);
}

uniqueLen = 1;
std::vector<int64_t> uniqIdx(cmpBlNum, 0);
for (size_t b1 = 1; b1 < cmpBlNum; b1++) {
auto first1 = srcDataPtr + b1 * elPerPart;
auto last1 = srcDataPtr + (b1 + 1) * elPerPart;
uniqueLen = 1lu;
std::vector<size_t> uniqIdx(axisDim, 0lu);
// Search for unique slices.
for (size_t a = 1lu; a < axisDim; a++) {
auto first1 = srcDataPtr + a * innerLen;
auto last1 = srcDataPtr + (a + 1lu) * innerLen;
bool equal = true;
size_t b2 = 0;
size_t uIdx = 0lu;
// Compare with unique blocks.
for (; b2 < uniqueLen; b2++) {
auto first2 = srcDataPtr + uniqIdx[b2] * elPerPart;
for (; uIdx < uniqueLen; uIdx++) {
auto first2 = srcDataPtr + uniqIdx[uIdx] * innerLen;
equal = true;
for (int p = 0; p < partsInBl; p++) {
for (size_t o = 0lu; o < outerLen; o++) {
equal = std::equal(first1, last1, first2);
if (!equal) {
break;
}
first1 += partStep;
last1 += partStep;
first2 += partStep;
first1 += srcOuterStep;
last1 += srcOuterStep;
first2 += srcOuterStep;
}
if (equal) {
break;
}
}
if (!equal) {
if (definedOutputs[FIRST_UNIQUE_IDX]) {
firstTmpPtr[uniqueLen] = b1;
firstTmpPtr[uniqueLen] = a;
}

uniqIdx[uniqueLen++] = b1;
uniqIdx[uniqueLen++] = a;
} else {
if (definedOutputs[OCCURRENCES_NUM]) {
occurTmpPtr[b2]++;
occurTmpPtr[uIdx]++;
}
}
if (definedOutputs[INPUT_TO_UNIQ_IDX]) {
inToOutTmpPtr[b1] = b2;
inToOutTmpPtr[a] = uIdx;
}
}

const auto dstPrtStep = elPerPart * uniqueLen;
for (size_t b1 = 0; b1 < uniqueLen; b1++) {
auto first1 = srcDataPtr + uniqIdx[b1] * elPerPart;
auto first2 = uniDataTmpPtr + b1 * elPerPart;
for (int p = 0; p < partsInBl; p++) {
memcpy(first2, first1, partLenB);
first1 += partStep;
first2 += dstPrtStep;
}
// Redefinition of output shapes.
auto dstDataShape = srcDataShape;
dstDataShape[axis] = uniqueLen;
redefineOutputMemory({ dstDataShape, {uniqueLen}, {axisDim}, {uniqueLen}});

int *firstPtr = nullptr, *inToOutPtr = nullptr, *occurNPtr = nullptr;
if (definedOutputs[FIRST_UNIQUE_IDX]) {
firstPtr = reinterpret_cast<int*>(getChildEdgesAtPort(FIRST_UNIQUE_IDX)[0]->getMemoryPtr()->GetPtr());
}
if (definedOutputs[INPUT_TO_UNIQ_IDX]) {
inToOutPtr = reinterpret_cast<int*>(getChildEdgesAtPort(INPUT_TO_UNIQ_IDX)[0]->getMemoryPtr()->GetPtr());
}
if (definedOutputs[OCCURRENCES_NUM]) {
occurNPtr = reinterpret_cast<int*>(getChildEdgesAtPort(OCCURRENCES_NUM)[0]->getMemoryPtr()->GetPtr());
}

T* dstDataPtr = reinterpret_cast<T*>(getChildEdgesAtPort(UNIQUE_DATA)[0]->getMemoryPtr()->GetPtr());
const auto dstOuterStep = innerLen * uniqueLen;
// Filling of the first output if needed.
if (sorted || definedOutputs[UNIQUE_DATA]) {
parallel_for(uniqueLen, [&](size_t u) {
auto first1 = srcDataPtr + uniqIdx[u] * innerLen;
auto first2 = dstDataPtr + u * innerLen;
for (size_t p = 0lu; p < outerLen; p++) {
memcpy(first2, first1, innerSizeB);
first1 += srcOuterStep;
first2 += dstOuterStep;
}
});
}

const auto uniqueLenIB = uniqueLen * sizeof(T);

if (sorted) {
const auto elInBl = elPerPart * partsInBl;
const auto dstUniDataLen = dstOuterStep * outerLen;
std::vector<T> vDstBuff(dstUniDataLen);
auto dstBuff = vDstBuff.data();

const auto elInBl = innerLen * outerLen;
struct OrdEl {
T val;
int64_t idx;
};

std::vector<OrdEl> colToSort(uniqueLen);
std::vector<int64_t> moveTo(uniqueLen);
for (size_t k = 0; k < uniqueLen; k++) {
moveTo[k] = k;
}
std::vector<T> buff1(elPerPart);
std::vector<T> buff2(elPerPart);
for (int64_t p = partsInBl - 1; p >= 0; p--) {
for (int64_t e = elPerPart - 1; e >= 0 ; e--) {
int64_t pos1 = p * dstPrtStep + e;
for (int64_t i = 0; i < static_cast<int64_t>(uniqueLen); i++) {
int64_t pos2 = i * elInBl + pos1;
colToSort[i] = {uniDataTmpPtr[pos2], i};
T *dst1 = dstDataPtr, *dst2 = dstBuff;
int *first1 = firstPtr, *first2 = firstTmpPtr;
int *occurN1 = occurNPtr, *occurN2 = occurTmpPtr;
int *inToOut1 = inToOutPtr, *inToOut2 = inToOutTmpPtr;

const bool defined3outputs = definedOutputs[FIRST_UNIQUE_IDX] || definedOutputs[OCCURRENCES_NUM] || definedOutputs[INPUT_TO_UNIQ_IDX];

for (int64_t o = outerLen - 1; o >= 0; o--) { // Backward loop through the outer block.
const int64_t pos1Lim = o * dstOuterStep;
int64_t pos1 = pos1Lim + innerLen - 1;
for (; pos1 >= pos1Lim ; pos1--) { // Backward loop through the inner block.
int64_t pos2 = pos1;
for (int64_t k = 0; k < static_cast<int64_t>(uniqueLen); k++, pos2 += innerLen) {
colToSort[k] = { dst1[pos2], k };
}
std::stable_sort(colToSort.begin(), colToSort.end(), [](const OrdEl &el1, const OrdEl &el2) { return el1.val < el2.val; });
for (size_t k = 0; k < uniqueLen; k++) {
moveTo[colToSort[k].idx] = k;
}

// perm
for (int64_t pb = 0; pb < partsInBl; pb++) {
auto currDst = uniDataTmpPtr + pb * dstPrtStep;
memcpy(buff1.data(), currDst, partLenB);
auto dstIdx = moveTo[0];
for (size_t b = 0; b < uniqueLen; b++) {
if (dstIdx == moveTo[dstIdx]) {
dstIdx = moveTo[dstIdx + 1];
continue;
}
T* dst = currDst + dstIdx * elPerPart;
// Permutation
parallel_for2d(outerLen, uniqueLen, [&](int64_t ot, size_t u) {
auto src = dst1 + ot * dstOuterStep + colToSort[u].idx * innerLen;
auto dst = dst2 + ot * dstOuterStep + u * innerLen;

auto& bSrc = b % 2 == 0 ? buff1 : buff2;
auto& bDst = b % 2 == 0 ? buff2 : buff1;
memcpy(bDst.data(), dst, partLenB);
memcpy(dst, bSrc.data(), partLenB);
memcpy(dst, src, innerSizeB);
});

dstIdx = moveTo[dstIdx];
}
if (defined3outputs) {
parallel_for(uniqueLen, [&](size_t u) {
if (definedOutputs[FIRST_UNIQUE_IDX]) {
first1[u] = first2[colToSort[u].idx];
}
if (definedOutputs[OCCURRENCES_NUM]) {
occurN1[u] = occurN2[colToSort[u].idx];
}
if (definedOutputs[INPUT_TO_UNIQ_IDX]) {
for (size_t ax = 0; ax < axisDim; ax++) {
if (inToOut2[ax] == colToSort[u].idx) {
inToOut1[ax] = u;
}
}
}
});
}

auto mPos = moveTo[0];
int32_t firstSrc = 0, firstDst = 0, ocSrc = 0, ocDst = 0;
std::swap(dst1, dst2);
if (definedOutputs[FIRST_UNIQUE_IDX]) {
firstSrc = firstTmpPtr[0];
std::swap(first1, first2);
}
if (definedOutputs[OCCURRENCES_NUM]) {
ocSrc = occurTmpPtr[0];
std::swap(occurN1, occurN2);
}
for (size_t k = 0; k < uniqueLen; k++) {
if (mPos == moveTo[mPos]) {
mPos = moveTo[mPos + 1];
continue;
}

if (definedOutputs[FIRST_UNIQUE_IDX]) {
auto& fSrc = k % 2 == 0 ? firstSrc : firstDst;
auto& fDst = k % 2 == 0 ? firstDst : firstSrc;
fDst = firstTmpPtr[mPos];
firstTmpPtr[mPos] = fSrc;
}
if (definedOutputs[OCCURRENCES_NUM]) {
auto& oSrc = k % 2 == 0 ? ocSrc : ocDst;
auto& oDst = k % 2 == 0 ? ocDst : ocSrc;
oDst = occurTmpPtr[mPos];
occurTmpPtr[mPos] = oSrc;
}

mPos = moveTo[mPos];
if (definedOutputs[INPUT_TO_UNIQ_IDX]) {
std::swap(inToOut1, inToOut2);
}
}
}

if (definedOutputs[UNIQUE_DATA] && dst1 != dstDataPtr) {
memcpy(dstDataPtr, dst1, dstUniDataLen * sizeof(T));
}
if (definedOutputs[FIRST_UNIQUE_IDX] && first2 != firstPtr) {
memcpy(firstPtr, first2, uniqueLenIB);
}
if (definedOutputs[INPUT_TO_UNIQ_IDX] && inToOut2 != inToOutPtr) {
memcpy(inToOutPtr, inToOut2, axisDim * sizeof(int));
}
if (definedOutputs[OCCURRENCES_NUM] && occurN2 != occurNPtr) {
memcpy(occurNPtr, occurN2, uniqueLenIB);
}
} else {
if (definedOutputs[FIRST_UNIQUE_IDX]) {
memcpy(firstPtr, firstUniTmp.data(), uniqueLenIB);
}
if (definedOutputs[INPUT_TO_UNIQ_IDX]) {
for (size_t b1 = 0; b1 < cmpBlNum; b1++) {
auto first1 = srcDataPtr + b1 * elPerPart;
auto last1 = srcDataPtr + (b1 + 1) * elPerPart;
bool equal = true;
for (size_t b2 = 0; b2 < uniqueLen; b2++) {
auto first2 = uniDataTmpPtr + b2 * elPerPart;
equal = true;
for (int p = 0; p < partsInBl; p++) {
equal = std::equal(first1, last1, first2);
if (!equal) {
break;
}
first2 += dstPrtStep;
}
if (equal) {
inToOutTmpPtr[b1] = b2;
}
}
}
memcpy(inToOutPtr, inToOutTmp.data(), axisDim * sizeof(int));
}
if (definedOutputs[OCCURRENCES_NUM]) {
memcpy(occurNPtr, occurTmp.data(), uniqueLenIB);
}
}

auto dstDataShape = srcDataShape;
dstDataShape[axis] = uniqueLen;
redefineOutputMemory({ dstDataShape, {uniqueLen}, {cmpBlNum}, {uniqueLen}});

T* uniDataPtr = reinterpret_cast<T*>(getChildEdgesAtPort(UNIQUE_DATA)[0]->getMemoryPtr()->GetPtr());
memcpy(uniDataPtr, uniDataTmpPtr, getChildEdgesAtPort(UNIQUE_DATA)[0]->getMemoryPtr()->GetSize());
if (definedOutputs[FIRST_UNIQUE_IDX]) {
int *firstPtr = reinterpret_cast<int*>(getChildEdgesAtPort(FIRST_UNIQUE_IDX)[0]->getMemoryPtr()->GetPtr());
memcpy(firstPtr, firstUniTmp.data(), uniqueLen * sizeof(int));
}
if (definedOutputs[INPUT_TO_UNIQ_IDX]) {
auto inToOutPtr = reinterpret_cast<int*>(getChildEdgesAtPort(INPUT_TO_UNIQ_IDX)[0]->getMemoryPtr()->GetPtr());
memcpy(inToOutPtr, inToOutTmp.data(), cmpBlNum * sizeof(int));
}
if (definedOutputs[OCCURRENCES_NUM]) {
auto occurPtr = reinterpret_cast<int*>(getChildEdgesAtPort(OCCURRENCES_NUM)[0]->getMemoryPtr()->GetPtr());
memcpy(occurPtr, occurTmp.data(), uniqueLen * sizeof(int));
}
}
Loading

0 comments on commit 77d14ef

Please sign in to comment.