From c8fa56940ac930731a360d199ff297a5112a240e Mon Sep 17 00:00:00 2001 From: mandrono Date: Mon, 26 Oct 2020 16:35:26 +0300 Subject: [PATCH] [CPU] Added support NMS-5 --- .../nodes/non_max_suppression.cpp | 502 +++++++++++++----- 1 file changed, 365 insertions(+), 137 deletions(-) diff --git a/inference-engine/src/mkldnn_plugin/nodes/non_max_suppression.cpp b/inference-engine/src/mkldnn_plugin/nodes/non_max_suppression.cpp index 407eb51228ffbd..9c74812d38abd1 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/non_max_suppression.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/non_max_suppression.cpp @@ -10,7 +10,9 @@ #include #include #include +#include #include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" namespace InferenceEngine { namespace Extensions { @@ -20,82 +22,131 @@ class NonMaxSuppressionImpl: public ExtLayerBase { public: explicit NonMaxSuppressionImpl(const CNNLayer* layer) { try { - if (layer->insData.size() < 2 || layer->insData.size() > 5) - THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!"; + logPrefix = "NMS layer with name '" + layer->name + "' "; + if (layer->insData.size() < 2 || layer->insData.size() > 6) + THROW_IE_EXCEPTION << logPrefix << "has incorrect number of input edges: " << layer->insData.size(); + + if (layer->outData.size() < 1 || layer->outData.size() > 3) + THROW_IE_EXCEPTION << logPrefix << "has incorrect number of output edges: " << layer->outData.size(); + + // TODO: remove legacy attribute presentation after migration on opset1 + if (layer->CheckParamPresence("center_point_box")) { + bool center_point_box = layer->GetParamAsBool("center_point_box", false); + boxEncodingType = center_point_box ? 
boxEncoding::CENTER : boxEncoding::CORNER; + } else if (layer->CheckParamPresence("box_encoding")) { + std::string boxEncAttr = layer->GetParamAsString("box_encoding", "corner"); + if (boxEncAttr == "corner") { + boxEncodingType = boxEncoding::CORNER; + } else if (boxEncAttr == "center") { + boxEncodingType = boxEncoding::CENTER; + } else { + THROW_IE_EXCEPTION << logPrefix << "has unsupported 'box_encoding' attribute: " << boxEncAttr; + } + } - if (layer->outData.size() != 1) - THROW_IE_EXCEPTION << layer->name << " Incorrect number of output edges!"; + sort_result_descending = layer->GetParamAsBool("sort_result_descending", true); - if (layer->insData[NMS_BOXES].lock()->getTensorDesc().getPrecision() != Precision::FP32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'boxes' input precision. Only FP32 is supported!"; - SizeVector boxes_dims = layer->insData[NMS_BOXES].lock()->getTensorDesc().getDims(); - if (boxes_dims.size() != 3 || boxes_dims[2] != 4) - THROW_IE_EXCEPTION << layer->name << " 'boxes' should be with shape [num_batches, spatial_dimension, 4]"; + const std::vector supportedFloatPrecision = {Precision::FP32, Precision::BF16}; + const std::vector supportedIntOutputPrecision = {Precision::I32, Precision::I64}; - if (layer->insData[NMS_SCORES].lock()->getTensorDesc().getPrecision() != Precision::FP32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'scores' input precision. 
Only FP32 is supported!"; - SizeVector scores_dims = layer->insData[NMS_SCORES].lock()->getTensorDesc().getDims(); + auto boxesDataPtr = layer->insData[NMS_BOXES].lock(); + if (boxesDataPtr == nullptr) { + THROW_IE_EXCEPTION << logPrefix << "has nullable 'boxes' input"; + } + checkPrecision(boxesDataPtr, supportedFloatPrecision, "boxes", inType); + const SizeVector &boxes_dims = boxesDataPtr->getTensorDesc().getDims(); + num_batches = boxes_dims[0]; + num_boxes = boxes_dims[1]; + if (boxes_dims.size() != 3) + THROW_IE_EXCEPTION << logPrefix << "has unsupported 'boxes' input rank: " << boxes_dims.size(); + if (boxes_dims[2] != 4) + THROW_IE_EXCEPTION << logPrefix << "has unsupported 'boxes' input 3rd dimension size: " << boxes_dims[2]; + + + auto scoresDataPtr = layer->insData[NMS_SCORES].lock(); + if (scoresDataPtr == nullptr) { + THROW_IE_EXCEPTION << logPrefix << "has nullable 'scores' input"; + } + checkPrecision(scoresDataPtr, supportedFloatPrecision, "scores", inType); + const SizeVector &scores_dims = scoresDataPtr->getTensorDesc().getDims(); + num_classes = scores_dims[1]; if (scores_dims.size() != 3) - THROW_IE_EXCEPTION << layer->name << " 'scores' should be with shape [num_batches, num_classes, spatial_dimension]"; + THROW_IE_EXCEPTION << logPrefix << "has unsupported 'scores' input rank: " << scores_dims.size(); - if (boxes_dims[0] != scores_dims[0]) - THROW_IE_EXCEPTION << layer->name << " num_batches is different in 'boxes' and 'scores' tensors"; - if (boxes_dims[1] != scores_dims[2]) - THROW_IE_EXCEPTION << layer->name << " spatial_dimension is different in 'boxes' and 'scores' tensors"; + if (num_batches != scores_dims[0]) + THROW_IE_EXCEPTION << logPrefix << " num_batches is different in 'boxes' and 'scores' inputs"; + if (num_boxes != scores_dims[2]) + THROW_IE_EXCEPTION << logPrefix << " num_boxes is different in 'boxes' and 'scores' inputs"; - if (layer->insData.size() > 2) { - if 
(layer->insData[NMS_MAXOUTPUTBOXESPERCLASS].lock()->getTensorDesc().getPrecision() != Precision::I32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'max_output_boxes_per_class' input precision. Only I32 is supported!"; - SizeVector max_output_boxes_per_class_dims = layer->insData[NMS_MAXOUTPUTBOXESPERCLASS].lock()->getTensorDesc().getDims(); - if (max_output_boxes_per_class_dims.size() && max_output_boxes_per_class_dims[0] != 1) - THROW_IE_EXCEPTION << layer->name << " 'max_output_boxes_per_class' should be scalar"; + numFiltBox.resize(num_batches); + for (size_t i = 0; i < numFiltBox.size(); i++) + numFiltBox[i].resize(num_classes); + + if (layer->insData.size() > NMS_MAXOUTPUTBOXESPERCLASS) { + const std::vector supportedPrecision = {Precision::I16, Precision::U8, Precision::I8, Precision::U16, Precision::I32, + Precision::U32, Precision::I64, Precision::U64}; + check1DInput(layer->insData[NMS_MAXOUTPUTBOXESPERCLASS], supportedPrecision, "max_output_boxes_per_class"); + } + + if (layer->insData.size() > NMS_IOUTHRESHOLD) { + check1DInput(layer->insData[NMS_IOUTHRESHOLD], supportedFloatPrecision, "iou_threshold"); } - if (layer->insData.size() > 3) { - if (layer->insData[NMS_IOUTHRESHOLD].lock()->getTensorDesc().getPrecision() != Precision::FP32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'iou_threshold' input precision. Only FP32 is supported!"; - SizeVector iou_threshold_dims = layer->insData[NMS_IOUTHRESHOLD].lock()->getTensorDesc().getDims(); - if (iou_threshold_dims.size() && iou_threshold_dims[0] != 1) - THROW_IE_EXCEPTION << layer->name << " 'iou_threshold' should be scalar"; + if (layer->insData.size() > NMS_SCORETHRESHOLD) { + check1DInput(layer->insData[NMS_SCORETHRESHOLD], supportedFloatPrecision, "score_threshold"); } - if (layer->insData.size() > 4) { - if (layer->insData[NMS_SCORETHRESHOLD].lock()->getTensorDesc().getPrecision() != Precision::FP32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'score_threshold' input precision. 
Only FP32 is supported!"; - SizeVector score_threshold_dims = layer->insData[NMS_SCORETHRESHOLD].lock()->getTensorDesc().getDims(); - if (score_threshold_dims.size() && score_threshold_dims[0] != 1) - THROW_IE_EXCEPTION << layer->name << " 'score_threshold' should be scalar"; + if (layer->insData.size() > NMS_SOFTNMSSIGMA) { + check1DInput(layer->insData[NMS_SOFTNMSSIGMA], supportedFloatPrecision, "soft_nms_sigma"); } - if (layer->outData[0]->getTensorDesc().getPrecision() != Precision::I32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'selected_indices' input precision. Only I32 is supported!"; - SizeVector selected_indices_dims = layer->outData[0]->getTensorDesc().getDims(); - if (selected_indices_dims.size() != 2 || selected_indices_dims[1] != 3) - THROW_IE_EXCEPTION << layer->name << " 'selected_indices' should be with shape [num_selected_indices, 3]"; + checkOutput(layer->outData[NMS_SELECTEDINDICES], supportedIntOutputPrecision, "selected_indices"); - center_point_box = layer->GetParamAsBool("center_point_box", false); - sort_result_descending = layer->GetParamAsBool("sort_result_descending", true); + if (layer->outData.size() > NMS_SELECTEDSCORES) { + checkOutput(layer->outData[NMS_SELECTEDSCORES], supportedFloatPrecision, "selected_scores"); + } + + if (layer->outData.size() > NMS_VALIDOUTPUTS) { + checkPrecision(layer->outData[NMS_VALIDOUTPUTS], supportedIntOutputPrecision, "valid_outputs", outType); + const SizeVector &valid_outputs_dims = layer->outData[NMS_VALIDOUTPUTS]->getTensorDesc().getDims(); + if (valid_outputs_dims.size() != 1) + THROW_IE_EXCEPTION << logPrefix << "has unsupported 'valid_outputs' output rank: " << valid_outputs_dims.size(); + if (valid_outputs_dims[0] != 1) + THROW_IE_EXCEPTION << logPrefix << "has unsupported 'valid_outputs' output 1st dimension size: " << valid_outputs_dims[1]; + } - if (layer->insData.size() == 2) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, { 
DataConfigurator(ConfLayout::PLN) }); - } else if (layer->insData.size() == 3) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); - } else if (layer->insData.size() == 4) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), - DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); - } else { - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), - DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); + LayerConfig config; + for (size_t i = 0; i < layer->insData.size(); i++) { + DataConfig inConfig; + + Precision inPrecision = i == NMS_MAXOUTPUTBOXESPERCLASS ? Precision::I32 : Precision::FP32; + auto validDataPtr = layer->insData[i].lock(); + if (validDataPtr == nullptr) { + THROW_IE_EXCEPTION << logPrefix << "has nullable " << i << "th input"; + } + const SizeVector& inDims = validDataPtr->getTensorDesc().getDims(); + inConfig.desc = TensorDesc(inPrecision, inDims, InferenceEngine::TensorDesc::getLayoutByDims(inDims)); + config.inConfs.push_back(inConfig); } + for (size_t i = 0; i < layer->outData.size(); i++) { + DataConfig outConfig; + + Precision outPrecision = i == NMS_SELECTEDSCORES ? 
Precision::FP32 : Precision::I32; + const SizeVector& outDims = layer->outData[i]->getTensorDesc().getDims(); + outConfig.desc = TensorDesc(outPrecision, outDims, InferenceEngine::TensorDesc::getLayoutByDims(outDims)); + config.outConfs.push_back(outConfig); + } + + config.dynBatchSupport = false; + confs.push_back(config); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } } - static float intersectionOverUnion(float* boxesI, float* boxesJ, bool center_point_box) { + float intersectionOverUnion(const float *boxesI, const float *boxesJ) { float yminI, xminI, ymaxI, xmaxI, yminJ, xminJ, ymaxJ, xmaxJ; - if (center_point_box) { + if (boxEncodingType == boxEncoding::CENTER) { // box format: x_center, y_center, width, height yminI = boxesI[1] - boxesI[3] / 2.f; xminI = boxesI[0] - boxesI[2] / 2.f; @@ -128,117 +179,294 @@ class NonMaxSuppressionImpl: public ExtLayerBase { return intersection_area / (areaI + areaJ - intersection_area); } - typedef struct { + struct filteredBoxes { float score; int batch_index; int class_index; int box_index; - } filteredBoxes; + filteredBoxes() {} + filteredBoxes(float _score, int _batch_index, int _class_index, int _box_index) : + score(_score), batch_index(_batch_index), class_index(_class_index), box_index(_box_index) {} + }; - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - float *boxes = inputs[NMS_BOXES]->cbuffer().as() + - inputs[NMS_BOXES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float *scores = inputs[NMS_SCORES]->cbuffer().as() + - inputs[NMS_SCORES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - SizeVector scores_dims = inputs[NMS_SCORES]->getTensorDesc().getDims(); - int num_boxes = static_cast(scores_dims[2]); - int max_output_boxes_per_class = num_boxes; - if (inputs.size() > 2) - max_output_boxes_per_class = (std::min)(max_output_boxes_per_class, - (inputs[NMS_MAXOUTPUTBOXESPERCLASS]->cbuffer().as() + 
- inputs[NMS_MAXOUTPUTBOXESPERCLASS]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]); - - float iou_threshold = 1.f; // Value range [0, 1] - if (inputs.size() > 3) - iou_threshold = (std::min)(iou_threshold, (inputs[NMS_IOUTHRESHOLD]->cbuffer().as() + - inputs[NMS_IOUTHRESHOLD]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]); - - float score_threshold = 0.f; - if (inputs.size() > 4) - score_threshold = (inputs[NMS_SCORETHRESHOLD]->cbuffer().as() + - inputs[NMS_SCORETHRESHOLD]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; - int* selected_indices = outputs[0]->cbuffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - SizeVector selected_indices_dims = outputs[0]->getTensorDesc().getDims(); - - SizeVector boxesStrides = inputs[NMS_BOXES]->getTensorDesc().getBlockingDesc().getStrides(); - SizeVector scoresStrides = inputs[NMS_SCORES]->getTensorDesc().getBlockingDesc().getStrides(); - - // boxes shape: {num_batches, num_boxes, 4} - // scores shape: {num_batches, num_classes, num_boxes} - int num_batches = static_cast(scores_dims[0]); - int num_classes = static_cast(scores_dims[1]); - std::vector fb; - - for (int batch = 0; batch < num_batches; batch++) { - float *boxesPtr = boxes + batch * boxesStrides[0]; - for (int class_idx = 0; class_idx < num_classes; class_idx++) { - float *scoresPtr = scores + batch * scoresStrides[0] + class_idx * scoresStrides[1]; - std::vector > scores_vector; - for (int box_idx = 0; box_idx < num_boxes; box_idx++) { - if (scoresPtr[box_idx] > score_threshold) - scores_vector.push_back(std::make_pair(scoresPtr[box_idx], box_idx)); - } + struct boxInfo { + float score; + int idx; + int suppress_begin_index; + }; + + void nmsWithSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, const SizeVector &scoresStrides, + std::vector &filtBoxes) { + auto less = [](const boxInfo& l, const boxInfo& r) { + return l.score < r.score || ((l.score == r.score) && 
(l.idx > r.idx)); + }; + + auto coeff = [&](float iou) { + const float weight = std::exp(scale * iou * iou); + return iou <= iou_threshold ? weight : 0.0f; + }; + + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + std::vector fb; + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::priority_queue, decltype(less)> sorted_boxes(less); + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] > score_threshold) + sorted_boxes.emplace(boxInfo({scoresPtr[box_idx], box_idx, 0})); + } - if (scores_vector.size()) { - parallel_sort(scores_vector.begin(), scores_vector.end(), - [](const std::pair& l, const std::pair& r) { return l.first > r.first; }); - - int io_selection_size = 1; - fb.push_back({ scores_vector[0].first, batch, class_idx, scores_vector[0].second }); - for (int box_idx = 1; (box_idx < static_cast(scores_vector.size()) && io_selection_size < max_output_boxes_per_class); box_idx++) { - bool box_is_selected = true; - for (int idx = io_selection_size - 1; idx >= 0; idx--) { - float iou = intersectionOverUnion(&boxesPtr[scores_vector[box_idx].second * 4], - &boxesPtr[scores_vector[idx].second * 4], center_point_box); - if (iou > iou_threshold) { - box_is_selected = false; - break; - } + fb.reserve(sorted_boxes.size()); + if (sorted_boxes.size() > 0) { + while (fb.size() < max_output_boxes_per_class && !sorted_boxes.empty()) { + boxInfo currBox = sorted_boxes.top(); + float origScore = currBox.score; + sorted_boxes.pop(); + + bool box_is_selected = true; + for (int idx = static_cast(fb.size()) - 1; idx >= currBox.suppress_begin_index; idx--) { + float iou = intersectionOverUnion(&boxesPtr[currBox.idx * 4], &boxesPtr[fb[idx].box_index * 4]); + currBox.score *= coeff(iou); + if (iou >= iou_threshold) { + box_is_selected = false; + break; } + if (currBox.score <= score_threshold) + break; + } - if 
(box_is_selected) { - scores_vector[io_selection_size] = scores_vector[box_idx]; - io_selection_size++; - fb.push_back({ scores_vector[box_idx].first, batch, class_idx, scores_vector[box_idx].second }); + currBox.suppress_begin_index = fb.size(); + if (box_is_selected) { + if (currBox.score == origScore) { + fb.push_back({ currBox.score, batch_idx, class_idx, currBox.idx }); + continue; + } + if (currBox.score > score_threshold) { + sorted_boxes.push(currBox); } } } } + numFiltBox[batch_idx][class_idx] = fb.size(); + size_t offset = batch_idx*num_classes*max_output_boxes_per_class + class_idx*max_output_boxes_per_class; + cpu_memcpy(filtBoxes.data() + offset, fb.data(), fb.size() * sizeof(filteredBoxes)); + }); + } + + void nmsWithoutSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, const SizeVector &scoresStrides, + std::vector &filtBoxes) { + int max_out_box = static_cast(max_output_boxes_per_class); + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::vector> sorted_boxes; + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] > score_threshold) + sorted_boxes.emplace_back(std::make_pair(scoresPtr[box_idx], box_idx)); + } + + int io_selection_size = 0; + if (sorted_boxes.size() > 0) { + parallel_sort(sorted_boxes.begin(), sorted_boxes.end(), + [](const std::pair& l, const std::pair& r) { + return (l.first > r.first || ((l.first == r.first) && (l.second < r.second))); + }); + size_t offset = batch_idx*num_classes*max_output_boxes_per_class + class_idx*max_output_boxes_per_class; + filteredBoxes *fb = filtBoxes.data() + offset; + fb[0] = filteredBoxes(sorted_boxes[0].first, batch_idx, class_idx, sorted_boxes[0].second); + io_selection_size++; + for (size_t box_idx = 1; (box_idx < sorted_boxes.size()) && 
(io_selection_size < max_out_box); box_idx++) { + bool box_is_selected = true; + for (int idx = io_selection_size - 1; idx >= 0; idx--) { + float iou = intersectionOverUnion(&boxesPtr[sorted_boxes[box_idx].second * 4], &boxesPtr[fb[idx].box_index * 4]); + if (iou >= iou_threshold) { + box_is_selected = false; + break; + } + } + + if (box_is_selected) { + fb[io_selection_size] = filteredBoxes(sorted_boxes[box_idx].first, batch_idx, class_idx, sorted_boxes[box_idx].second); + io_selection_size++; + } + } + } + numFiltBox[batch_idx][class_idx] = io_selection_size; + }); + } + + StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { + const float *boxes = inputs[NMS_BOXES]->cbuffer().as() + inputs[NMS_BOXES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + const float *scores = inputs[NMS_SCORES]->cbuffer().as() + inputs[NMS_SCORES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + max_output_boxes_per_class = outputs.size() > NMS_SELECTEDSCORES ? 0 : num_boxes; + if (inputs.size() > NMS_MAXOUTPUTBOXESPERCLASS) { + max_output_boxes_per_class = (inputs[NMS_MAXOUTPUTBOXESPERCLASS]->cbuffer().as() + + inputs[NMS_MAXOUTPUTBOXESPERCLASS]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; } - if (sort_result_descending) { - parallel_sort(fb.begin(), fb.end(), [](const filteredBoxes& l, const filteredBoxes& r) { return l.score > r.score; }); + if (max_output_boxes_per_class == 0) + return OK; + + iou_threshold = outputs.size() > NMS_SELECTEDSCORES ? 
0.0f : 1.0f; + if (inputs.size() > NMS_IOUTHRESHOLD) + iou_threshold = (inputs[NMS_IOUTHRESHOLD]->cbuffer().as() + + inputs[NMS_IOUTHRESHOLD]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; + + score_threshold = 0.0f; + if (inputs.size() > NMS_SCORETHRESHOLD) + score_threshold = (inputs[NMS_SCORETHRESHOLD]->cbuffer().as() + + inputs[NMS_SCORETHRESHOLD]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; + + soft_nms_sigma = 0.0f; + if (inputs.size() > NMS_SOFTNMSSIGMA) + soft_nms_sigma = (inputs[NMS_SOFTNMSSIGMA]->cbuffer().as() + + inputs[NMS_SOFTNMSSIGMA]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; + scale = 0.0f; + if (soft_nms_sigma > 0.0) { + scale = -0.5 / soft_nms_sigma; + } + + int *selected_indices = outputs[NMS_SELECTEDINDICES]->buffer().as() + + outputs[NMS_SELECTEDINDICES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + float *selected_scores = nullptr; + if (outputs.size() > NMS_SELECTEDSCORES) + selected_scores = outputs[NMS_SELECTEDSCORES]->buffer().as() + + outputs[NMS_SELECTEDSCORES]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + int *valid_outputs = nullptr; + if (outputs.size() > NMS_VALIDOUTPUTS) + valid_outputs = outputs[NMS_VALIDOUTPUTS]->buffer().as() + + outputs[NMS_VALIDOUTPUTS]->getTensorDesc().getBlockingDesc().getOffsetPadding(); + + const SizeVector &boxesStrides = inputs[NMS_BOXES]->getTensorDesc().getBlockingDesc().getStrides(); + const SizeVector &scoresStrides = inputs[NMS_SCORES]->getTensorDesc().getBlockingDesc().getStrides(); + + std::vector filtBoxes(max_output_boxes_per_class * num_batches * num_classes); + + if (soft_nms_sigma == 0.0f) { + nmsWithoutSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + } else { + nmsWithSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + } + + size_t startOffset = numFiltBox[0][0]; + for (size_t b = 0; b < numFiltBox.size(); b++) { + size_t batchOffset = b*num_classes*max_output_boxes_per_class; + for (size_t 
c = (b == 0 ? 1 : 0); c < numFiltBox[b].size(); c++) { + size_t offset = batchOffset + c*max_output_boxes_per_class; + cpu_memcpy(filtBoxes.data() + startOffset, filtBoxes.data() + offset, + numFiltBox[b][c] * sizeof(filteredBoxes)); + startOffset += numFiltBox[b][c]; + } } + filtBoxes.resize(startOffset); - int selected_indicesStride = outputs[0]->getTensorDesc().getBlockingDesc().getStrides()[0]; - int* selected_indicesPtr = selected_indices; - size_t idx; - for (idx = 0; idx < (std::min)(selected_indices_dims[0], fb.size()); idx++) { - selected_indicesPtr[0] = fb[idx].batch_index; - selected_indicesPtr[1] = fb[idx].class_index; - selected_indicesPtr[2] = fb[idx].box_index; - selected_indicesPtr += selected_indicesStride; + // need a more particular comparator to get deterministic behaviour + // avoid the situation where filtered boxes with the same score have different positions from launch to launch + if (sort_result_descending) { + parallel_sort(filtBoxes.begin(), filtBoxes.end(), + [](const filteredBoxes& l, const filteredBoxes& r) { + return (l.score > r.score) || + (l.score == r.score && l.batch_index < r.batch_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index); + }); } - for (; idx < selected_indices_dims[0]; idx++) { - selected_indicesPtr[0] = -1; - selected_indicesPtr[1] = -1; - selected_indicesPtr[2] = -1; - selected_indicesPtr += selected_indicesStride; + + size_t validOutputs = std::min(filtBoxes.size(), static_cast(outputs[NMS_SELECTEDINDICES]->getTensorDesc().getDims()[0])); + + int selectedIndicesStride = outputs[NMS_SELECTEDINDICES]->getTensorDesc().getBlockingDesc().getStrides()[0]; + int *selectedIndicesPtr = selected_indices; + float *selectedScoresPtr = selected_scores; + + for (size_t idx = 0; idx < validOutputs; idx++) { + selectedIndicesPtr[0] = filtBoxes[idx].batch_index; + 
selectedIndicesPtr[1] = filtBoxes[idx].class_index; + selectedIndicesPtr[2] = filtBoxes[idx].box_index; + selectedIndicesPtr += selectedIndicesStride; + if (outputs.size() > NMS_SELECTEDSCORES) { + selectedScoresPtr[0] = static_cast(filtBoxes[idx].batch_index); + selectedScoresPtr[1] = static_cast(filtBoxes[idx].class_index); + selectedScoresPtr[2] = static_cast(filtBoxes[idx].score); + selectedScoresPtr += selectedIndicesStride; + } } + if (outputs.size() > NMS_VALIDOUTPUTS) + *valid_outputs = static_cast(validOutputs); return OK; } private: + // input const size_t NMS_BOXES = 0; const size_t NMS_SCORES = 1; const size_t NMS_MAXOUTPUTBOXESPERCLASS = 2; const size_t NMS_IOUTHRESHOLD = 3; const size_t NMS_SCORETHRESHOLD = 4; - bool center_point_box = false; + const size_t NMS_SOFTNMSSIGMA = 5; + + // output + const size_t NMS_SELECTEDINDICES = 0; + const size_t NMS_SELECTEDSCORES = 1; + const size_t NMS_VALIDOUTPUTS = 2; + + enum class boxEncoding { + CORNER, + CENTER + }; + boxEncoding boxEncodingType = boxEncoding::CORNER; bool sort_result_descending = true; + + size_t num_batches; + size_t num_boxes; + size_t num_classes; + + size_t max_output_boxes_per_class; + float iou_threshold; + float score_threshold; + float soft_nms_sigma; + float scale; + + std::vector> numFiltBox; + const std::string inType = "input", outType = "output"; + std::string logPrefix; + + void checkPrecision(const DataPtr &dataPtr, const std::vector precList, const std::string name, const std::string type) { + const TensorDesc &tensorDesc = dataPtr->getTensorDesc(); + if (std::find(precList.begin(), precList.end(), tensorDesc.getPrecision()) == precList.end()) + THROW_IE_EXCEPTION << logPrefix << " has unsupported '" << name << "' " << type << " precision: " << tensorDesc.getPrecision(); + } + + void check1DInput(const DataWeakPtr &dataPtr, const std::vector precList, const std::string name) { + auto lockDataPtr = dataPtr.lock(); + if (lockDataPtr == nullptr) { + THROW_IE_EXCEPTION << 
logPrefix << "has nullable '" << name << "' input"; + } + + checkPrecision(lockDataPtr, precList, name, inType); + + const SizeVector &dims = lockDataPtr->getTensorDesc().getDims(); + if (dims.size() != 0 && dims.size() != 1) + THROW_IE_EXCEPTION << logPrefix << "has unsupported '" << name << "' input rank: " << dims.size(); + if (dims.size() == 1) + if (dims[0] != 1) + THROW_IE_EXCEPTION << logPrefix << "has unsupported '" << name << "' input 1st dimension size: " << dims[0]; + } + + void checkOutput(const DataPtr &dataPtr, const std::vector precList, const std::string name) { + checkPrecision(dataPtr, precList, name, outType); + + const SizeVector &dims = dataPtr->getTensorDesc().getDims(); + if (dims.size() != 2) + THROW_IE_EXCEPTION << logPrefix << "has unsupported '" << name << "' output rank: " << dims.size(); + if (dims[1] != 3) + THROW_IE_EXCEPTION << logPrefix << "has unsupported '" << name << "' output 2nd dimension size: " << dims[1]; + } }; REG_FACTORY_FOR(NonMaxSuppressionImpl, NonMaxSuppression);