From fcc07f80cd51b68ef760c002a6aef604adae619f Mon Sep 17 00:00:00 2001 From: Egor Shulman Date: Wed, 23 Jun 2021 14:22:10 +0300 Subject: [PATCH] [IE CPU] Reimplement extension nodes via MKLDNNNode API (#5784) --- .../src/mkldnn_plugin/cpu_types.h | 23 +- .../src/mkldnn_plugin/mkldnn_node.cpp | 21 + .../src/mkldnn_plugin/mkldnn_node.h | 44 +- .../src/mkldnn_plugin/nodes/bucketize.cpp | 242 ------- .../nodes/ctc_greedy_decoder.cpp | 183 ----- .../nodes/ctc_greedy_decoder_seq_len.cpp | 203 ------ .../src/mkldnn_plugin/nodes/ctc_loss.cpp | 302 -------- .../src/mkldnn_plugin/nodes/cum_sum.cpp | 271 ------- .../mkldnn_plugin/nodes/detectionoutput.cpp | 663 ------------------ .../nodes/detectionoutput_onnx.cpp | 402 ----------- .../src/mkldnn_plugin/nodes/gather_tree.cpp | 184 ----- .../src/mkldnn_plugin/nodes/grn.cpp | 91 --- .../src/mkldnn_plugin/nodes/list_tbl.hpp | 21 - .../src/mkldnn_plugin/nodes/log_softmax.cpp | 136 ---- .../nodes/mkldnn_bucketize_node.cpp | 218 ++++++ .../nodes/mkldnn_bucketize_node.h | 43 ++ .../nodes/mkldnn_concat_node.cpp | 2 +- .../nodes/mkldnn_ctc_greedy_decoder_node.cpp | 167 +++++ .../nodes/mkldnn_ctc_greedy_decoder_node.h | 32 + ...mkldnn_ctc_greedy_decoder_seq_len_node.cpp | 170 +++++ .../mkldnn_ctc_greedy_decoder_seq_len_node.h | 35 + .../nodes/mkldnn_ctc_loss_node.cpp | 279 ++++++++ .../nodes/mkldnn_ctc_loss_node.h | 32 + .../nodes/mkldnn_cum_sum_node.cpp | 279 ++++++++ .../mkldnn_plugin/nodes/mkldnn_cum_sum_node.h | 50 ++ .../nodes/mkldnn_def_conv_node.cpp | 2 +- .../nodes/mkldnn_detection_output_node.cpp | 601 ++++++++++++++++ .../nodes/mkldnn_detection_output_node.h | 86 +++ .../mkldnn_plugin/nodes/mkldnn_eltwise_node.h | 1 - .../mkldnn_embedding_bag_offset_sum_node.cpp | 2 +- .../mkldnn_embedding_bag_packed_sum_node.cpp | 2 +- .../mkldnn_embedding_segments_sum_node.cpp | 2 +- ...mental_detectron_detection_output_node.cpp | 369 ++++++++++ ...rimental_detectron_detection_output_node.h | 46 ++ 
...n_generate_proposals_single_image_node.cpp | 429 ++++++++++++ ...ron_generate_proposals_single_image_node.h | 50 ++ ...ntal_detectron_priorgridgenerator_node.cpp | 95 +++ ...mental_detectron_priorgridgenerator_node.h | 46 ++ ...tal_detectron_roifeatureextractor_node.cpp | 413 +++++++++++ ...ental_detectron_roifeatureextractor_node.h | 41 ++ ...n_experimental_detectron_topkrois_node.cpp | 82 +++ ...dnn_experimental_detectron_topkrois_node.h | 40 ++ ... => mkldnn_extract_image_patches_node.cpp} | 278 ++++---- ...pp => mkldnn_extract_image_patches_node.h} | 36 +- .../nodes/mkldnn_gather_elements_node.cpp | 2 +- .../nodes/mkldnn_gather_nd_node.cpp | 2 +- .../nodes/mkldnn_gather_node.cpp | 4 +- .../nodes/mkldnn_gather_tree_node.cpp | 148 ++++ .../nodes/mkldnn_gather_tree_node.h | 38 + .../mkldnn_plugin/nodes/mkldnn_grn_node.cpp | 81 +++ .../src/mkldnn_plugin/nodes/mkldnn_grn_node.h | 30 + .../nodes/mkldnn_log_softmax_node.cpp | 116 +++ .../nodes/mkldnn_log_softmax_node.h | 34 + .../mkldnn_plugin/nodes/mkldnn_math_node.cpp | 10 +- .../mkldnn_plugin/nodes/mkldnn_math_node.h | 1 - .../nodes/mkldnn_non_max_suppression_node.cpp | 406 +++++++++++ .../nodes/mkldnn_non_max_suppression_node.h | 102 +++ .../nodes/mkldnn_proposal_node.cpp | 198 ++++++ .../nodes/mkldnn_proposal_node.h | 42 ++ .../mkldnn_plugin/nodes/mkldnn_range_node.cpp | 140 ++++ .../mkldnn_plugin/nodes/mkldnn_range_node.h | 34 + .../nodes/mkldnn_reorg_yolo_node.cpp | 93 +++ .../nodes/mkldnn_reorg_yolo_node.h | 30 + .../nodes/mkldnn_reverse_sequence_node.cpp | 182 +++++ .../nodes/mkldnn_reverse_sequence_node.h | 38 + .../nodes/mkldnn_softmax_node.cpp | 2 +- .../mkldnn_plugin/nodes/mkldnn_topk_node.cpp | 478 +++++++++++++ .../mkldnn_plugin/nodes/mkldnn_topk_node.h | 114 +++ .../nodes/mkldnn_transpose_node.cpp | 2 +- .../nodes/non_max_suppression.cpp | 464 ------------ .../nodes/priorgridgenerator_onnx.cpp | 121 ---- .../src/mkldnn_plugin/nodes/proposal.cpp | 227 ------ 
.../src/mkldnn_plugin/nodes/proposal_onnx.cpp | 450 ------------ .../src/mkldnn_plugin/nodes/range.cpp | 164 ----- .../src/mkldnn_plugin/nodes/reorg_yolo.cpp | 99 --- .../mkldnn_plugin/nodes/reverse_sequence.cpp | 209 ------ .../nodes/roifeatureextractor_onnx.cpp | 433 ------------ .../src/mkldnn_plugin/nodes/topk.cpp | 572 --------------- .../src/mkldnn_plugin/nodes/topkrois_onnx.cpp | 101 --- .../extract_image_patches.cpp | 2 +- 80 files changed, 6169 insertions(+), 5714 deletions(-) delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/grn.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h 
create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h rename inference-engine/src/mkldnn_plugin/nodes/{extract_image_patches.cpp => mkldnn_extract_image_patches_node.cpp} (66%) rename inference-engine/src/mkldnn_plugin/nodes/{extract_image_patches.hpp => mkldnn_extract_image_patches_node.h} (64%) create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h create mode 100644 
inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.h delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/non_max_suppression.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/priorgridgenerator_onnx.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/proposal.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/proposal_onnx.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/range.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/reorg_yolo.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/reverse_sequence.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp delete mode 
100644 inference-engine/src/mkldnn_plugin/nodes/topk.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/topkrois_onnx.cpp diff --git a/inference-engine/src/mkldnn_plugin/cpu_types.h b/inference-engine/src/mkldnn_plugin/cpu_types.h index d7f55446024c8c..e5bc8af0b5c745 100644 --- a/inference-engine/src/mkldnn_plugin/cpu_types.h +++ b/inference-engine/src/mkldnn_plugin/cpu_types.h @@ -64,7 +64,28 @@ enum Type { Reference, ShuffleChannels, DFT, - Math + Math, + CTCLoss, + Bucketize, + CTCGreedyDecoder, + CTCGreedyDecoderSeqLen, + CumSum, + DetectionOutput, + ExperimentalDetectronDetectionOutput, + LogSoftmax, + TopK, + GatherTree, + GRN, + Range, + Proposal, + ReorgYolo, + ReverseSequence, + ExperimentalDetectronTopKROIs, + ExperimentalDetectronROIFeatureExtractor, + ExperimentalDetectronPriorGridGenerator, + ExperimentalDetectronGenerateProposalsSingleImage, + ExtractImagePatches, + NonMaxSuppression }; enum Algorithm { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index e2e2a3276b8c78..e46c7a7b0bdf9e 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -203,6 +203,27 @@ static const InferenceEngine::details::caseless_unordered_map { "SoftPlus", Math}, { "Softsign", Math}, { "Tan", Math}, + { "CTCLoss", CTCLoss}, + { "Bucketize", Bucketize}, + { "CTCGreedyDecoder", CTCGreedyDecoder}, + { "CTCGreedyDecoderSeqLen", CTCGreedyDecoderSeqLen}, + { "CumSum", CumSum}, + { "DetectionOutput", DetectionOutput}, + { "ExperimentalDetectronDetectionOutput", ExperimentalDetectronDetectionOutput}, + { "LogSoftmax", LogSoftmax}, + { "TopK", TopK}, + { "GatherTree", GatherTree}, + { "GRN", GRN}, + { "Range", Range}, + { "Proposal", Proposal}, + { "ReorgYolo", ReorgYolo}, + { "ReverseSequence", ReverseSequence}, + { "ExperimentalDetectronTopKROIs", ExperimentalDetectronTopKROIs}, + { "ExperimentalDetectronROIFeatureExtractor", 
ExperimentalDetectronROIFeatureExtractor}, + { "ExperimentalDetectronPriorGridGenerator", ExperimentalDetectronPriorGridGenerator}, + { "ExperimentalDetectronGenerateProposalsSingleImage", ExperimentalDetectronGenerateProposalsSingleImage}, + { "ExtractImagePatches", ExtractImagePatches}, + { "NonMaxSuppressionIEInternal", NonMaxSuppression} }; Type TypeFromName(const std::string type) { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index e5f86f03ea0c4a..29618d51fdbaf5 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -129,7 +129,7 @@ static std::string NameFromType(Type type) { case EmbeddingBagPackedSum: return "EmbeddingBagPackedSum"; case EmbeddingBagOffsetsSum: - return "EmbeddingBagPackedSum"; + return "EmbeddingBagOffsetsSum"; case Gather: return "Gather"; case GatherElements: @@ -150,6 +150,48 @@ static std::string NameFromType(Type type) { return "DFT"; case Math: return "Math"; + case CTCLoss: + return "CTCLoss"; + case Bucketize: + return "Bucketize"; + case CTCGreedyDecoder: + return "CTCGreedyDecoder"; + case CTCGreedyDecoderSeqLen: + return "CTCGreedyDecoderSeqLen"; + case CumSum: + return "CumSum"; + case DetectionOutput: + return "DetectionOutput"; + case ExperimentalDetectronDetectionOutput: + return "ExperimentalDetectronDetectionOutput"; + case LogSoftmax: + return "LogSoftmax"; + case TopK: + return "TopK"; + case GatherTree: + return "GatherTree"; + case GRN: + return "GRN"; + case Range: + return "Range"; + case Proposal: + return "Proposal"; + case ReorgYolo: + return "ReorgYolo"; + case ReverseSequence: + return "ReverseSequence"; + case ExperimentalDetectronTopKROIs: + return "ExperimentalDetectronTopKROIs"; + case ExperimentalDetectronROIFeatureExtractor: + return "ExperimentalDetectronROIFeatureExtractor"; + case ExperimentalDetectronPriorGridGenerator: + return "ExperimentalDetectronPriorGridGenerator"; 
+ case ExperimentalDetectronGenerateProposalsSingleImage: + return "ExperimentalDetectronGenerateProposalsSingleImage"; + case ExtractImagePatches: + return "ExtractImagePatches"; + case NonMaxSuppression: + return "NonMaxSuppression"; default: return "Unknown"; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp b/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp deleted file mode 100644 index febdf1a8dfd0f2..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include - -using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -class BucketizeImpl : public ExtLayerBase { - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto bucketsize = std::dynamic_pointer_cast(op); - if (!bucketsize) { - errorMessage = "Only opset3 Bucketize operation is supported"; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - - std::string errorPrefix; - -public: - explicit BucketizeImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - errorPrefix = "Bucketize layer with name '" + op->get_friendly_name() + "' "; - const auto bucketsize = std::dynamic_pointer_cast(op); - - if (op->get_input_size() != 2 || op->get_output_size() != 1) { - IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; - } - - // check one attribute - with_right = bucketsize->get_with_right_bound(); - - // check precisions for input and output tensors - input_precision = details::convertPrecision(op->get_input_element_type(INPUT_TENSOR_PORT)); - if (input_precision != Precision::FP32 && input_precision != Precision::I32 && - input_precision != Precision::I64) { - input_precision = Precision::FP32; - } - boundaries_precision = details::convertPrecision(op->get_input_element_type(INPUT_BINS_PORT)); - if (boundaries_precision != Precision::FP32 && boundaries_precision != Precision::I32 && - boundaries_precision != Precision::I64) { - boundaries_precision = Precision::FP32; - } - output_precision = details::convertPrecision(op->get_output_element_type(OUTPUT_TENSOR_PORT)); - if (output_precision != Precision::I32 && output_precision != Precision::I64) { - output_precision = Precision::I32; - } - - // check dimensions of input tensors - SizeVector input_tensor_dims = op->get_input_shape(INPUT_TENSOR_PORT); - if (input_tensor_dims.size() < 1) { - IE_THROW() << errorPrefix << " has incorrect dimensions of the input."; - } - SizeVector input_bin_dims = op->get_input_shape(INPUT_BINS_PORT); - if (input_bin_dims.size() != 1) { - IE_THROW() << errorPrefix << " has incorrect dimensions of the boundaries tensor."; - } - if (input_bin_dims[0] != 0) { - with_bins = true; - } - num_bin_values = input_bin_dims[0]; - - num_values = 
std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), size_t(1), std::multiplies()); - - addConfig(op, {{TensorDescCreatorTypes::ncsp, input_precision}, - {TensorDescCreatorTypes::ncsp, boundaries_precision}}, - {{TensorDescCreatorTypes::ncsp, output_precision}}); - } - catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - auto precision_mask = getPrecisionMask(input_precision, boundaries_precision, output_precision); - - switch (precision_mask) { - case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], 
outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I64, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I64, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I64, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - 
PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I64, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - default: - return GENERAL_ERROR; - } - - return OK; - } - -private: - template - void bucketize(Blob::Ptr input, Blob::Ptr boundaries, Blob::Ptr output) { - const auto *input_data = input->cbuffer().as(); - const auto *boundaries_data = boundaries->cbuffer().as(); - auto *output_data = output->buffer().as(); - - if (with_bins == false) { - memset(output_data, 0, num_values * sizeof(T_IND)); - return; - } - - // boundaries are assumed to be sorted and to have unique elements - parallel_for(num_values, [&](size_t ind) { - T value = input_data[ind]; - if (with_right) { - auto low = std::lower_bound(boundaries_data, boundaries_data + num_bin_values, value); - output_data[ind] = static_cast(low - boundaries_data); - } else { - auto up = std::upper_bound(boundaries_data, boundaries_data + num_bin_values, value); - output_data[ind] = static_cast(up - boundaries_data); - } - }); - } - - const size_t INPUT_TENSOR_PORT = 0; - const size_t INPUT_BINS_PORT = 1; - const size_t OUTPUT_TENSOR_PORT = 0; - - size_t num_values = 0; - size_t num_bin_values = 0; - bool with_right = false; - bool with_bins = false; - - Precision input_precision; - Precision boundaries_precision; - Precision output_precision; -}; - -REG_FACTORY_FOR(BucketizeImpl, Bucketize); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp deleted file mode 100644 index 0ba6ca7e960230..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: 
Apache-2.0 -// - -#include "base.hpp" -#include "ie_parallel.hpp" -#include -#include - -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CTCGreedyDecoderImpl: public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto greedyDecOp = ngraph::as_type_ptr(op); - if (!greedyDecOp) { - errorMessage = "Node is not an instance of the CTCGreedyDecoder operation from operation set v0."; - return false; - } - } catch (...) { - return false; - } - - return true; - } - - explicit CTCGreedyDecoderImpl(const std::shared_ptr& op) : mergeRepeated_(true) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - std::string errPrefix = "CTCGreedyDecoder layer with name '" + op->get_friendly_name() + "' "; - if (op->get_input_size() != 2) - IE_THROW() << errPrefix << "has invalid number of input edges: " << op->get_input_size(); - if (op->get_output_size() != 1) - IE_THROW() << errPrefix << "has invalid number of outputs edges: " << op->get_output_size(); - - if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0] && - op->get_input_shape(DATA_INDEX)[1] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[1]) - IE_THROW() << errPrefix << "has invalid input shapes."; - - Precision inDataPrecision = details::convertPrecision(op->get_input_element_type(DATA_INDEX)); - if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) - IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inDataPrecision; - - Precision seqLenPrecision = details::convertPrecision(op->get_input_element_type(SEQUENCE_LENGTH_INDEX)); - if (seqLenPrecision != Precision::FP32 && seqLenPrecision != Precision::BF16) - IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: 
" << seqLenPrecision; - - auto greedyDecOp = ngraph::as_type_ptr(op); - mergeRepeated_ = greedyDecOp->get_ctc_merge_repeated(); - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::FP32}}, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - const float* probabilities = inputs[DATA_INDEX]->cbuffer().as() + - inputs[DATA_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const float* sequenceMask = inputs[SEQUENCE_LENGTH_INDEX]->cbuffer().as() + - inputs[SEQUENCE_LENGTH_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* outputSequences = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - const size_t T = inputs[DATA_INDEX]->getTensorDesc().getDims()[0]; - const size_t B = inputs[DATA_INDEX]->getTensorDesc().getDims()[1]; - const int C = inputs[DATA_INDEX]->getTensorDesc().getDims()[2]; - const size_t BC = B * C; - const size_t CB1 = C * (B - 1); - - const int blankIndex = C - 1; - - std::vector sequenceLengths(B, 0); - parallel_for(B, [&](size_t b) { - size_t t = 0; - for (; t < T; t++) { - if (sequenceMask[B * t + b] == 0.f) - break; - } - sequenceLengths[b] = t; - }); - - size_t workAmount = 0; - for (size_t b = 0; b < B; b++) { - workAmount += sequenceLengths[b]; - } - - // Parallelization could not be made directly by T due to output index depends on merged classes and - // blank index, thus could not be shared between threads. Better to divide operation on two steps. - // At the first stage find the maximum index. At second stage merge if needed. - // Such approach makes parallelization more efficient. 
- auto threadBody = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(workAmount, nthr, ithr, start, end); - if (start >= end) - return; - size_t tStart = 0lu, bStart = 0lu; - for (; bStart < B; bStart++) { - tStart += sequenceLengths[bStart]; - if (tStart >= start) { - tStart = start - (tStart - sequenceLengths[bStart]); - break; - } - } - - size_t workCounter = start; - - for (size_t b = bStart; b < B; ++b) { - size_t outputIndex = b * T + tStart; - const float* probs = probabilities + b * C + BC * tStart; - size_t sequenceLength = sequenceLengths[b]; - - for (size_t t = tStart; t < sequenceLength; ++t) { - int maxClassIdx = 0; - - float maxProb = probs[0]; - ++probs; - - for (int c = 1; c < C; ++c, ++probs) { - if (*probs > maxProb) { - maxClassIdx = c; - maxProb = *probs; - } - } - probs += CB1; - outputSequences[outputIndex++] = static_cast(maxClassIdx); - - if (++workCounter >= end) { - return; - } - } - tStart = 0lu; - } - }; // thread body - - parallel_nt(0, threadBody); - - parallel_for(B, [&](size_t b) { - int prevClassIdx = -1; - size_t outputIndex = b * T; - const size_t sequenceLength = sequenceLengths[b]; - float* shiftedOut = outputSequences + b * T; - for (size_t t = 0; t < sequenceLength; ++t) { - if (*shiftedOut < blankIndex && - !(mergeRepeated_ && *shiftedOut == prevClassIdx)) { - outputSequences[outputIndex++] = *shiftedOut; - } - prevClassIdx = *shiftedOut; - shiftedOut++; - } - std::fill(outputSequences + outputIndex, outputSequences + (b + 1) * T, -1.f); - }); - - return OK; - } - -private: - const size_t DATA_INDEX = 0lu; - const size_t SEQUENCE_LENGTH_INDEX = 1lu; - bool mergeRepeated_; -}; - -REG_FACTORY_FOR(CTCGreedyDecoderImpl, CTCGreedyDecoder); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp deleted file mode 100644 
index c60684ee0af3f8..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include "ie_parallel.hpp" -#include -#include - -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CTCGreedyDecoderSeqLenImpl: public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto greedyDecOp = ngraph::as_type_ptr(op); - if (!greedyDecOp) { - errorMessage = "Node is not an instance of the CTCGreedyDecoderSeqLen operation from operation set v6."; - return false; - } - } catch (...) { - return false; - } - - return true; - } - - explicit CTCGreedyDecoderSeqLenImpl(const std::shared_ptr& op) : mergeRepeated_(true) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - std::string errPrefix = "CTCGreedyDecoderSeqLen layer with name '" + op->get_friendly_name() + "' "; - if (op->get_input_size() < 2 || op->get_input_size() > 3) - IE_THROW() << errPrefix << "has invalid number of input edges: " << op->get_input_size(); - if (op->get_output_size() != 2) - IE_THROW() << errPrefix << "has invalid number of outputs edges: " << op->get_output_size(); - - if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0]) - IE_THROW() << errPrefix << "has invalid input shapes."; - - Precision inDataPrecision = details::convertPrecision(op->get_input_element_type(DATA_INDEX)); - if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) - IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inDataPrecision; - - Precision seqLenPrecision = 
details::convertPrecision(op->get_input_element_type(SEQUENCE_LENGTH_INDEX)); - if (seqLenPrecision != Precision::I32 && seqLenPrecision != Precision::I64) - IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; - - auto greedyDecOp = ngraph::as_type_ptr(op); - mergeRepeated_ = greedyDecOp->get_merge_repeated(); - - if (op->get_input_size() == BLANK_INDEX) { - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}, - {{TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}); - } else { - Precision blIdxPrecision = details::convertPrecision(op->get_input_element_type(BLANK_INDEX)); - if (blIdxPrecision != Precision::I32 && blIdxPrecision != Precision::I64) - IE_THROW() << errPrefix << "has unsupported 'blank_index' input precision: " << blIdxPrecision; - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}, - {{TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}); - } - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - const float* probabilities = inputs[DATA_INDEX]->cbuffer().as() + - inputs[DATA_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* sequenceLengths = inputs[SEQUENCE_LENGTH_INDEX]->cbuffer().as() + - inputs[SEQUENCE_LENGTH_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - int* decodedClasses = outputs[DECODED_CLASSES_INDEX]->buffer().as() + - outputs[DECODED_CLASSES_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - int* decodedClassesLength = outputs[DECODED_CLASSES_LENGTH_INDEX]->buffer().as() + - 
outputs[DECODED_CLASSES_LENGTH_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - const auto& inDims = inputs[DATA_INDEX]->getTensorDesc().getDims(); - const size_t B = inDims[0]; - const size_t T = inDims[1]; - const int C = inDims[2]; - const size_t TC = T * C; - - int blankIndex = C - 1; - if (inputs.size() > BLANK_INDEX) - blankIndex = (inputs[BLANK_INDEX]->cbuffer().as() + - inputs[BLANK_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; - - size_t workAmount = 0; - for (size_t b = 0; b < B; b++) { - if (sequenceLengths[b] > T) { - if (resp) { - std::string errorMsg = errPrefix - + ". Sequence length " + std::to_string(sequenceLengths[b]) - + " cannot be greater than according decoded classes dimension size " - + std::to_string(outputs[DECODED_CLASSES_INDEX]->getTensorDesc().getDims()[1]); - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return PARAMETER_MISMATCH; - } - workAmount += sequenceLengths[b]; - } - // Parallelization could not be made directly by T due to output index depends on merged classes and - // blank index, thus could not be shared between threads. Better to divide operation on two steps. - // At the first stage find the maximum index. At second stage merge if needed. - // Such approach makes parallelization more efficient. 
- auto threadBody = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(workAmount, nthr, ithr, start, end); - if (start >= end) - return; - size_t tStart = 0lu, bStart = 0lu; - for (; bStart < B; bStart++) { - tStart += sequenceLengths[bStart]; - if (tStart >= start) { - tStart = start - (tStart - sequenceLengths[bStart]); - break; - } - } - - size_t workCounter = start; - - for (size_t b = bStart; b < B; ++b) { - size_t outputIndex = b * T + tStart; - const float* probs = probabilities + b * TC + C * tStart; - const size_t actualSeqLen = sequenceLengths[b]; - - for (size_t t = tStart; t < actualSeqLen; ++t) { - int maxClassIdx = 0; - float maxProb = probs[0]; - probs++; - - for (int c = 1; c < C; c++, probs++) { - if (*probs > maxProb) { - maxClassIdx = c; - maxProb = *probs; - } - } - decodedClasses[outputIndex++] = maxClassIdx; - - if (++workCounter >= end) { - return; - } - } - tStart = 0lu; - } - }; // thread body - - parallel_nt(0, threadBody); - - parallel_for(B, [&](size_t b) { - int prevClassIdx = -1; - size_t outputIndex = b * T; - const size_t actualSeqLen = sequenceLengths[b]; - int* shiftedOut = decodedClasses + b * T; - - for (size_t t = 0; t < actualSeqLen; ++t) { - if (*shiftedOut != blankIndex && - !(mergeRepeated_ && *shiftedOut == prevClassIdx)) { - decodedClasses[outputIndex++] = *shiftedOut; - } - prevClassIdx = *shiftedOut; - shiftedOut++; - } - std::fill(decodedClasses + outputIndex, decodedClasses + (b + 1) * T, -1); - decodedClassesLength[b] = outputIndex - b * T; - }); - - return OK; - } - -private: - const size_t DATA_INDEX = 0lu; - const size_t SEQUENCE_LENGTH_INDEX = 1lu; - const size_t BLANK_INDEX = 2lu; - const size_t DECODED_CLASSES_INDEX = 0lu; - const size_t DECODED_CLASSES_LENGTH_INDEX = 1lu; - bool mergeRepeated_; - std::string errPrefix; -}; - -REG_FACTORY_FOR(CTCGreedyDecoderSeqLenImpl, CTCGreedyDecoderSeqLen); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff 
--git a/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp deleted file mode 100644 index 84d6b55a1a47e9..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp +++ /dev/null @@ -1,302 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include "ie_parallel.hpp" -#include -#include - -#include - - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CTCLossImpl : public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto ctcLossOp = ngraph::as_type_ptr(op); - if (!ctcLossOp) { - errorMessage = "Node is not an instance of the CTCLoss operation from operation set v4."; - return false; - } - } catch (...) { - return false; - } - - return true; - } - - explicit CTCLossImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - _logPrefix = std::string("CTCLoss layer with name '") + op->get_friendly_name() + "'"; - - if (op->get_input_size() != 4 && op->get_input_size() != 5) - IE_THROW() << _logPrefix << " has invalid inputs number."; - - auto ctcLossOp = ngraph::as_type_ptr(op); - _ctcMergeRepeated = ctcLossOp->get_ctc_merge_repeated(); - _preprocessCollapseRepeated = ctcLossOp->get_preprocess_collapse_repeated(); - _unique = ctcLossOp->get_unique(); - - std::vector inDataConfigurators; - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::FP32}); - for (int i = 1; i < op->get_input_size(); i++) { - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::I32}); - } - addConfig(op, inDataConfigurators, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = 
ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, - std::vector& outputs, - ResponseDesc *resp) noexcept override { - StatusCode returnCode = OK; - - const float* logits = inputs[0]->cbuffer().as() + - inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* logitsLength = inputs[1]->cbuffer().as() + - inputs[1]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* labels = inputs[2]->cbuffer().as() + - inputs[2]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* labelsLength = inputs[3]->cbuffer().as() + - inputs[3]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* dstData = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - const auto& logitsShape = inputs[0]->getTensorDesc().getDims(); - const size_t batchNum = logitsShape[0]; - const size_t maxTime = logitsShape[1]; - const size_t classesNum = logitsShape[2]; - - int blankIndex = classesNum - 1; - if (inputs.size() > 4) { - blankIndex = inputs[4]->cbuffer().as()[0]; - } - - std::vector decodedTargetLenB(batchNum, 0); - std::vector> targetDB(batchNum); - std::vector>> logProbabilitiesB(batchNum); - std::vector errorMsgB(parallel_get_max_threads()); - - auto threadBody_1 = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(batchNum, nthr, ithr, start, end); - if (start >= end) - return; - - for (size_t b = start; b < end; b++) { - if (logitsLength[b] < 0 || labelsLength[b] < 0 || logitsLength[b] > maxTime || labelsLength[b] > logitsLength[b]) { - errorMsgB[ithr] = _logPrefix + ". Logit length cannot be greater than max sequence length. 
" - + "Label length cannot be greater than a logit length" - + " and both cannot be negative.\nMaxSeqLen: " - + std::to_string(maxTime) + "; Logit len: " + std::to_string(logitsLength[b]) - + "; Label len: " + std::to_string(labelsLength[b]); - returnCode = GENERAL_ERROR; - return; - } - const size_t actualLogitLen = logitsLength[b]; - const size_t actualTargetLen = labelsLength[b]; - size_t decodedTargetLen = 0lu; - - // Decoding target: merge repeated characters if preprocess_collapse_repeated == True, - // find unique elemnts if unique == True. - // Inserts blanks before each index and a blank at the end. - const int* target = &labels[b * maxTime]; - targetDB[b].resize(actualTargetLen * 2 + 1); - auto& targetD = targetDB[b]; - if (_unique) { - std::unordered_set uniqVals; - for (size_t t = 0lu; t < actualTargetLen; t++) { - if (uniqVals.find(target[t]) != uniqVals.end()) { - continue; - } - uniqVals.insert(target[t]); - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = target[t]; - } - targetD[decodedTargetLen++] = blankIndex; - } else if (_preprocessCollapseRepeated) { - auto prevValue = target[0]; - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = target[0]; - for (size_t t = 1lu; t < actualTargetLen; t++) { - if (target[t] == prevValue) { - continue; - } - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = prevValue = target[t]; - } - targetD[decodedTargetLen++] = blankIndex; - } else { - for (size_t t = 0lu; t < actualTargetLen; t++) { - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = target[t]; - } - targetD[decodedTargetLen++] = blankIndex; - } - decodedTargetLenB[b] = decodedTargetLen; - - auto& logProbabilities = logProbabilitiesB[b]; - logProbabilities.resize(actualLogitLen); - for (size_t ll = 0; ll < actualLogitLen; ll++) { - logProbabilities[ll].resize(decodedTargetLen); - } - } // for batch - }; // threadBody_1 - - parallel_nt(0, threadBody_1); - if 
(returnCode != OK) { - std::string resErr(""); - for (auto& err : errorMsgB) { - if (!err.empty()) - resErr += err + "\n"; - resErr.copy(resp->msg, sizeof(resp->msg) - 1); - } - return returnCode; - } - - const size_t TC = maxTime * classesNum; - - size_t workAmount2 = 0lu; - for (size_t b = 0; b < batchNum; b++) { - workAmount2 += logitsLength[b]; - } - - auto threadBody_2 = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - size_t sB(0lu), sT(0lu); - splitter(workAmount2, nthr, ithr, start, end); - if (start >= end) - return; - int64_t cw = 0, st = start; - for (; sB < batchNum; sB++) { - cw += logitsLength[sB]; - if (cw >= st) { - sT = logitsLength[sB] + st - cw; - break; - } - } - size_t workCounter = start; - - for (size_t b = sB; b < batchNum; b++) { - const size_t actualLogitLen = logitsLength[b]; - const size_t decodedTargetLen = decodedTargetLenB[b]; - auto& logProbabilities = logProbabilitiesB[b]; - auto& targetD = targetDB[b]; - - double expSum = 0.0; - size_t btcT = b * TC + sT * classesNum; - // logProbabilities = logSoftmax = logits[b][t][c] - ln(sum_c(exp(logits[b][t]))) - for (size_t t = sT; t < actualLogitLen; t++) { - expSum = 0.0; - for (size_t c = 0lu; c < classesNum; c++) { - expSum += std::exp(logits[btcT + c]); - } - for (size_t s = 0lu; s < decodedTargetLen; s++) { - logProbabilities[t][s] = logits[btcT + targetD[s]] - std::log(expSum); - } - btcT += classesNum; - if (++workCounter >= end) { - return; - } - } - sT = 0lu; - } // for batch - }; // threadBody_2 - - parallel_nt(0, threadBody_2); - - const auto float_inf = std::numeric_limits::infinity(); - - auto sumLogs = [&float_inf](float log1, float log2) { - if (log1 == -float_inf) { - return log2; - } else if (log2 == -float_inf) { - return log1; - } else { - if (log1 > log2) - return log1 + std::log1pf(std::exp(log2 - log1)); - else - return log2 + std::log1pf(std::exp(log1 - log2)); - } - }; - - auto threadBody_3 = [&](const int ithr, const int nthr) { - size_t 
start(0lu), end(0lu); - splitter(batchNum, nthr, ithr, start, end); - if (start >= end) - return; - - // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: - // Graves et al., 2016, paragraph 4.1 (10) - for (size_t b = start; b < end; b++) { - auto& targetD = targetDB[b]; - auto& logProbabilities = logProbabilitiesB[b]; - const int actualLogitLen = logitsLength[b]; - const int decodedTargetLen = decodedTargetLenB[b]; - std::vector> logBwd(decodedTargetLen, std::vector(actualLogitLen, -float_inf)); - for (int s = decodedTargetLen - 2; s < decodedTargetLen; s++) - logBwd[s][actualLogitLen - 1] = 0.f; - - for (int t = actualLogitLen - 2; t >= 0; t--) { - const int t_1 = t + 1; - for (int s = std::max(0, decodedTargetLen - (2 * (actualLogitLen - t))); - s < std::min(decodedTargetLen, 2 * (t_1)); s++) { - if (_ctcMergeRepeated || targetD[s] == blankIndex) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s][t_1] + logProbabilities[t_1][s]); - } - - if (s + 1 < decodedTargetLen) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); - } - - if (s + 2 < decodedTargetLen) { - if (targetD[s] != blankIndex && (!_ctcMergeRepeated || (targetD[s] != targetD[s + 2]))) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); - } - } - } - } - - logBwd[0][0] += logProbabilities[0][0]; - logBwd[1][0] += logProbabilities[0][(decodedTargetLen > 1) ? 
1 : 0]; - - dstData[b] = -sumLogs(logBwd[0][0], logBwd[1][0]); - } // for batch - }; // threadBody_3 - - parallel_nt(0, threadBody_3); - - return returnCode; - } // execute - -protected: - bool _ctcMergeRepeated; - bool _preprocessCollapseRepeated; - bool _unique; - - std::string _logPrefix; -}; - -REG_FACTORY_FOR(CTCLossImpl, CTCLoss); -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp b/inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp deleted file mode 100644 index 8940527713cd36..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "list.hpp" -#include "base.hpp" - -#include -#include -#include "ie_parallel.hpp" -#include "ie_precision.hpp" -#include -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CumSumImpl: public ExtLayerBase { - enum { CUM_SUM_DATA, AXIS, numOfInputs }; - bool exclusive; - bool reverse; - size_t numOfDims; - size_t axis = 0; - std::vector shape; - - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto cumsum = std::dynamic_pointer_cast(op); - if (!cumsum) { - errorMessage = "Only opset3 CumSum operation is supported"; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - -public: - explicit CumSumImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - layerName = op->get_friendly_name(); - if ((op->get_input_size() != numOfInputs && op->get_input_size() != (numOfInputs - 1)) || op->get_output_size() != 1) - IE_THROW() << "CumSum layer with name '" << layerName << "' has incorrect number of input/output edges!"; - - const auto &dataShape = op->get_input_shape(CUM_SUM_DATA); - if (dataShape.size() < 1) { - IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'data' input tensor with rank: " << dataShape.size(); - } - numOfDims = dataShape.size(); - - const auto cumsum = std::dynamic_pointer_cast(op); - exclusive = cumsum->is_exclusive(); - reverse = cumsum->is_reverse(); - - auto dataPrecision = details::convertPrecision(cumsum->get_input_element_type(CUM_SUM_DATA)); - if (dataPrecision != Precision::I8 && dataPrecision != Precision::U8 && dataPrecision != Precision::I16 && dataPrecision != Precision::I32 && - dataPrecision != Precision::FP32 && dataPrecision != Precision::I64 && dataPrecision != Precision::U64 && dataPrecision != Precision::BF16) - IE_THROW() << "CumSum layer with name '" << layerName << "' has unsupported 'data' input precision: " << dataPrecision.name(); - - if (cumsum->get_input_size() == numOfInputs) { - const auto& axisTensorPrec = details::convertPrecision(cumsum->get_input_element_type(AXIS)); - if (axisTensorPrec != Precision::I32 && axisTensorPrec != Precision::I64) - IE_THROW() << "CumSum layer with name '" << layerName << "' has unsupported 'axis' input precision: " << axisTensorPrec.name(); - - if (!ngraph::is_scalar(cumsum->get_input_shape(AXIS))) - IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'axis' input tensor with non scalar rank"; - } - - if (dataShape != cumsum->get_output_shape(0)) - 
IE_THROW() << "CumSum layer with name '" << layerName << "' has different 'data' input and output dimensions"; - - shape = dataShape; - - std::vector inDataConfigurators; - if (dataPrecision == Precision::BF16) - dataPrecision = Precision::FP32; - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, dataPrecision}); - if (op->get_input_size() > 1) - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::I32}); - addConfig(op, inDataConfigurators, {{TensorDescCreatorTypes::ncsp, dataPrecision}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - if (inputs.size() == numOfInputs) - axis = getAxis(inputs[AXIS], inputs[CUM_SUM_DATA]); - - const auto &dataPrecision = inputs[CUM_SUM_DATA]->getTensorDesc().getPrecision(); - switch (dataPrecision) { - case Precision::I8 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::U8 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::I16 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::I32 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::FP32 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::I64 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::U64 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - default : { - if (resp) { - std::string errorMsg = "CumSum layer with name '" + layerName + "' has unsupported 'data' input precision: " + dataPrecision.name(); - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return GENERAL_ERROR; - } - } - return OK; - } - -private: - template - void execImpl(const Blob::CPtr& _input, const Blob::Ptr& _output) { - const auto *input = _input->cbuffer().as() + _input->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto *output = _output->buffer().as() + 
_output->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const std::vector strides = _input->getTensorDesc().getBlockingDesc().getStrides(); - - if (reverse) { - if (exclusive) { - cumSum(input, output, strides); - } else { - cumSum(input, output, strides); - } - } else { - if (exclusive) { - cumSum(input, output, strides); - } else { - cumSum(input, output, strides); - } - } - } - - template - void cumSum(const dataType *input, dataType *output, const std::vector &strides) { - SizeVector iterationRange(numOfDims - 1); - size_t j = 0; - for (size_t i = 0; i < shape.size(); i++) { - if (i == axis) - continue; - iterationRange[j++] = shape[i]; - } - size_t work_amount_dst = std::accumulate(iterationRange.begin(), iterationRange.end(), 1, std::multiplies()); - parallel_nt(0, [&](const int ithr, const int nthr) { - size_t start = 0, end = 0; - SizeVector counters(numOfDims - 1, 0); - splitter(work_amount_dst, nthr, ithr, start, end); - - parallelItInit(start, counters, iterationRange); - - for (size_t iwork = start; iwork < end; ++iwork) { - std::vector forStartOffset(numOfDims); - forStartOffset[axis] = 0; - for (size_t offsetIdx = 0, countersIdx = 0; offsetIdx < numOfDims; ++offsetIdx) { - if (offsetIdx == axis) { - continue; - } - forStartOffset[offsetIdx] = counters[countersIdx++]; - } - - size_t startOffset = getStartOffset(forStartOffset, strides); - - const dataType *inputStart = input + startOffset; - dataType *outputStart = output + startOffset; - - size_t offset = strides[axis]; - if (reverse) { - if (exclusive) { - outputStart[offset*(shape[axis] - 1)] = 0; - for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset]; - } - } else { - outputStart[offset*(shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; - for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = inputStart[i*offset] + outputStart[(i+1)*offset]; - } - } - } else { - if (exclusive) { - 
outputStart[0] = 0; - for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset]; - } - } else { - outputStart[0] = inputStart[0]; - for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset]; - } - } - } - - parallelItStep(counters, iterationRange); - } - }); - } - - void parallelItInit(size_t start, std::vector& counters, const std::vector& iterationRange) { - auto itCounter = counters.rbegin(); - auto itWork = iterationRange.rbegin(); - while (itCounter != counters.rend() && itWork != iterationRange.rend()) { - *itCounter = start % *itWork; - start /= *itWork; - ++itCounter; - ++itWork; - } - } - - inline void parallelItStep(std::vector& counters, const std::vector& iterationRange) { - auto itCounter = counters.rbegin(); - auto itWork = iterationRange.rbegin(); - - while (itCounter != counters.rend() && itWork != iterationRange.rend()) { - *itCounter = (*itCounter + 1) % *itWork; - if (*itCounter != 0) { - break; - } - ++itCounter; - ++itWork; - } - } - - inline size_t getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const { - size_t startOffset = 0; - for (size_t idx = 0; idx < forStartOffset.size(); ++idx) { - startOffset += forStartOffset[idx] * strides[idx]; - } - return startOffset; - } - - size_t getAxis(const Blob::CPtr& _axis, const Blob::CPtr& _data) const { - const auto& axisPrecision = _axis->getTensorDesc().getPrecision(); - const int64_t dataShapeSize = static_cast(_data->getTensorDesc().getDims().size()); - int64_t axisValueFromBlob; - switch (axisPrecision) { - case Precision::I32 : { - const auto *axisPtr = _axis->cbuffer().as(); - axisValueFromBlob = static_cast(axisPtr[0]); - break; - } - case Precision::I64 : { - const auto *axisPtr = _axis->cbuffer().as(); - axisValueFromBlob = axisPtr[0]; - break; - } - default : { - IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 
'axis' input with precision: " << axisPrecision.name(); - } - } - if (axisValueFromBlob < -dataShapeSize || axisValueFromBlob > dataShapeSize - 1) - IE_THROW() << "CumSum layer with name '" << layerName << "' has axis with a value out of range: " << axisValueFromBlob; - return axisValueFromBlob >= 0 ? axisValueFromBlob : (axisValueFromBlob + dataShapeSize); - } - -private: - std::string layerName; -}; - -REG_FACTORY_FOR(CumSumImpl, CumSum); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine \ No newline at end of file diff --git a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp b/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp deleted file mode 100644 index bd3b1da8fc878c..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp +++ /dev/null @@ -1,663 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include "caseless.hpp" -#include "ie_parallel.hpp" -#include "common/tensor_desc_creator.h" -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -template -static bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -class DetectionOutputImpl: public ExtLayerBase { -public: - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto doOp = ngraph::as_type_ptr(op); - if (!doOp) { - errorMessage = "Node is not an instance of the DetectionOutput from the operations set v0."; - return false; - } - if (!details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CENTER_SIZE") && - !details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CORNER")) { - errorMessage = "Unsupported code_type attribute."; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - - explicit DetectionOutputImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - if (op->get_input_size() != 3 && op->get_input_size() != 5) - IE_THROW() << "Invalid number of input edges."; - - if (op->get_output_size() != 1) - IE_THROW() << "Invalid number of output edges."; - - auto doOp = ngraph::as_type_ptr(op); - auto attributes = doOp->get_attrs(); - - _num_classes = attributes.num_classes; - _background_label_id = attributes.background_label_id; - _top_k = attributes.top_k; - _variance_encoded_in_target = attributes.variance_encoded_in_target; - _keep_top_k = attributes.keep_top_k[0]; - _nms_threshold = attributes.nms_threshold; - _confidence_threshold = attributes.confidence_threshold; - _share_location = attributes.share_location; - _clip_before_nms = attributes.clip_before_nms; - _clip_after_nms = attributes.clip_after_nms; - _decrease_label_id = attributes.decrease_label_id; - _normalized = attributes.normalized; - _image_height = attributes.input_height; - _image_width = attributes.input_width; - _prior_size = _normalized ? 4 : 5; - _offset = _normalized ? 0 : 1; - _num_loc_classes = _share_location ? 1 : _num_classes; - - with_add_box_pred = op->get_input_size() == 5; - _objectness_score = attributes.objectness_score; - - _code_type = (details::CaselessEq()(attributes.code_type, "caffe.PriorBoxParameter.CENTER_SIZE") ? 
- CodeType::CENTER_SIZE : CodeType::CORNER); - - _num_priors = static_cast(op->get_input_shape(idx_priors).back() / _prior_size); - _priors_batches = op->get_input_shape(idx_priors).front() != 1; - - if (_num_priors * _num_loc_classes * 4 != static_cast(op->get_input_shape(idx_location)[1])) - IE_THROW() << "Number of priors must match number of location predictions (" - << _num_priors * _num_loc_classes * 4 << " vs " - << op->get_input_shape(idx_location)[1] << ")"; - - if (_num_priors * _num_classes != static_cast(op->get_input_shape(idx_confidence).back())) - IE_THROW() << "Number of priors must match number of confidence predictions."; - - if (_decrease_label_id && _background_label_id != 0) - IE_THROW() << "Cannot use decrease_label_id and background_label_id parameter simultaneously."; - - _num = static_cast(op->get_input_shape(idx_confidence)[0]); - - _decoded_bboxes.resize(_num * _num_classes * _num_priors * 4); - _buffer.resize(_num * _num_classes * _num_priors); - _indices.resize(_num * _num_classes * _num_priors); - _detections_count.resize(_num * _num_classes); - _bbox_sizes.resize(_num * _num_classes * _num_priors); - _num_priors_actual.resize(_num); - - const auto &confSize = op->get_input_shape(idx_confidence); - _reordered_conf.resize(std::accumulate(confSize.begin(), confSize.end(), 1, std::multiplies())); - - std::vector inDataConfigurators(op->get_input_size(), {TensorDescCreatorTypes::ncsp, Precision::FP32}); - addConfig(op, inDataConfigurators, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - float *dst_data = outputs[0]->buffer(); - - const float *loc_data = inputs[idx_location]->buffer().as(); - const float *conf_data = inputs[idx_confidence]->buffer().as(); - const float *prior_data = inputs[idx_priors]->buffer().as(); - const float 
*arm_conf_data = inputs.size() > 3 ? inputs[idx_arm_confidence]->buffer().as() : nullptr; - const float *arm_loc_data = inputs.size() > 4 ? inputs[idx_arm_location]->buffer().as() : nullptr; - - const int N = inputs[idx_confidence]->getTensorDesc().getDims()[0]; - - float *decoded_bboxes_data = _decoded_bboxes.data(); - float *reordered_conf_data = _reordered_conf.data(); - float *bbox_sizes_data = _bbox_sizes.data(); - int *detections_data = _detections_count.data(); - int *buffer_data = _buffer.data(); - int *indices_data = _indices.data(); - int *num_priors_actual = _num_priors_actual.data(); - - for (int n = 0; n < N; ++n) { - const float *ppriors = prior_data; - const float *prior_variances = prior_data + _num_priors*_prior_size; - if (_priors_batches) { - ppriors += _variance_encoded_in_target ? n*_num_priors*_prior_size : 2*n*_num_priors*_prior_size; - prior_variances += _variance_encoded_in_target ? 0 : 2*n*_num_priors*_prior_size; - } - - if (_share_location) { - const float *ploc = loc_data + n*4*_num_priors; - float *pboxes = decoded_bboxes_data + n*4*_num_priors; - float *psizes = bbox_sizes_data + n*_num_priors; - - if (with_add_box_pred) { - const float *p_arm_loc = arm_loc_data + n*4*_num_priors; - decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); - } else { - decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - } - } else { - for (int c = 0; c < _num_loc_classes; ++c) { - if (c == _background_label_id) { - continue; - } - const float *ploc = loc_data + n*4*_num_loc_classes*_num_priors + c*4; - float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors + c*4*_num_priors; - float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors + c*_num_priors; - if (with_add_box_pred) { - const float *p_arm_loc = arm_loc_data + 
n*4*_num_loc_classes*_num_priors + c*4; - decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); - } else { - decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - } - } - } - } - - if (with_add_box_pred) { - for (int n = 0; n < N; ++n) { - for (int p = 0; p < _num_priors; ++p) { - if (arm_conf_data[n*_num_priors*2 + p * 2 + 1] < _objectness_score) { - for (int c = 0; c < _num_classes; ++c) { - reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = c == _background_label_id ? 1.0f : 0.0f; - } - } else { - for (int c = 0; c < _num_classes; ++c) { - reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; - } - } - } - } - } else { - for (int n = 0; n < N; ++n) { - for (int c = 0; c < _num_classes; ++c) { - for (int p = 0; p < _num_priors; ++p) { - reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; - } - } - } - } - - memset(detections_data, 0, N*_num_classes*sizeof(int)); - - for (int n = 0; n < N; ++n) { - int detections_total = 0; - - if (!_decrease_label_id) { - // Caffe style - parallel_for(_num_classes, [&](int c) { - if (c != _background_label_id) { // Ignore background class - int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; - int *pbuffer = buffer_data + c*_num_priors; - int *pdetections = detections_data + n*_num_classes + c; - - const float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; - const float *pboxes; - const float *psizes; - if (_share_location) { - pboxes = decoded_bboxes_data + n*4*_num_priors; - psizes = bbox_sizes_data + n*_num_priors; - } else { - pboxes = decoded_bboxes_data + n*4*_num_classes*_num_priors + 
c*4*_num_priors; - psizes = bbox_sizes_data + n*_num_classes*_num_priors + c*_num_priors; - } - - nms_cf(pconf, pboxes, psizes, pbuffer, pindices, *pdetections, num_priors_actual[n]); - } - }); - } else { - // MXNet style - int *pindices = indices_data + n*_num_classes*_num_priors; - int *pbuffer = buffer_data; - int *pdetections = detections_data + n*_num_classes; - - const float *pconf = reordered_conf_data + n*_num_classes*_num_priors; - const float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors; - const float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors; - - nms_mx(pconf, pboxes, psizes, pbuffer, pindices, pdetections, _num_priors); - } - - for (int c = 0; c < _num_classes; ++c) { - detections_total += detections_data[n*_num_classes + c]; - } - - if (_keep_top_k > -1 && detections_total > _keep_top_k) { - std::vector>> conf_index_class_map; - - for (int c = 0; c < _num_classes; ++c) { - int detections = detections_data[n*_num_classes + c]; - int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; - - float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; - - for (int i = 0; i < detections; ++i) { - int idx = pindices[i]; - conf_index_class_map.push_back(std::make_pair(pconf[idx], std::make_pair(c, idx))); - } - } - - std::sort(conf_index_class_map.begin(), conf_index_class_map.end(), - SortScorePairDescend>); - conf_index_class_map.resize(_keep_top_k); - - // Store the new indices. 
- memset(detections_data + n*_num_classes, 0, _num_classes * sizeof(int)); - - for (size_t j = 0; j < conf_index_class_map.size(); ++j) { - int label = conf_index_class_map[j].second.first; - int idx = conf_index_class_map[j].second.second; - int *pindices = indices_data + n * _num_classes * _num_priors + label * _num_priors; - pindices[detections_data[n*_num_classes + label]] = idx; - detections_data[n*_num_classes + label]++; - } - } - } - - const int num_results = outputs[0]->getTensorDesc().getDims()[2]; - const int DETECTION_SIZE = outputs[0]->getTensorDesc().getDims()[3]; - if (DETECTION_SIZE != 7) { - return NOT_IMPLEMENTED; - } - - int dst_data_size = 0; - if (_keep_top_k > 0) - dst_data_size = N * _keep_top_k * DETECTION_SIZE * sizeof(float); - else if (_top_k > 0) - dst_data_size = N * _top_k * _num_classes * DETECTION_SIZE * sizeof(float); - else - dst_data_size = N * _num_classes * _num_priors * DETECTION_SIZE * sizeof(float); - - if (dst_data_size > outputs[0]->byteSize()) { - return OUT_OF_BOUNDS; - } - memset(dst_data, 0, dst_data_size); - - int count = 0; - for (int n = 0; n < N; ++n) { - const float *pconf = reordered_conf_data + n * _num_priors * _num_classes; - const float *pboxes = decoded_bboxes_data + n*_num_priors*4*_num_loc_classes; - const int *pindices = indices_data + n*_num_classes*_num_priors; - - for (int c = 0; c < _num_classes; ++c) { - for (int i = 0; i < detections_data[n*_num_classes + c]; ++i) { - int idx = pindices[c*_num_priors + i]; - - dst_data[count * DETECTION_SIZE + 0] = static_cast(n); - dst_data[count * DETECTION_SIZE + 1] = static_cast(_decrease_label_id ? c-1 : c); - dst_data[count * DETECTION_SIZE + 2] = pconf[c*_num_priors + idx]; - - float xmin = _share_location ? pboxes[idx*4 + 0] : - pboxes[c*4*_num_priors + idx*4 + 0]; - float ymin = _share_location ? pboxes[idx*4 + 1] : - pboxes[c*4*_num_priors + idx*4 + 1]; - float xmax = _share_location ? 
pboxes[idx*4 + 2] : - pboxes[c*4*_num_priors + idx*4 + 2]; - float ymax = _share_location ? pboxes[idx*4 + 3] : - pboxes[c*4*_num_priors + idx*4 + 3]; - - if (_clip_after_nms) { - xmin = (std::max)(0.0f, (std::min)(1.0f, xmin)); - ymin = (std::max)(0.0f, (std::min)(1.0f, ymin)); - xmax = (std::max)(0.0f, (std::min)(1.0f, xmax)); - ymax = (std::max)(0.0f, (std::min)(1.0f, ymax)); - } - - dst_data[count * DETECTION_SIZE + 3] = xmin; - dst_data[count * DETECTION_SIZE + 4] = ymin; - dst_data[count * DETECTION_SIZE + 5] = xmax; - dst_data[count * DETECTION_SIZE + 6] = ymax; - - ++count; - } - } - } - - if (count < num_results) { - // marker at end of boxes list - dst_data[count * DETECTION_SIZE + 0] = -1; - } - - return OK; - } - -private: - const int idx_location = 0; - const int idx_confidence = 1; - const int idx_priors = 2; - const int idx_arm_confidence = 3; - const int idx_arm_location = 4; - - int _num_classes = 0; - int _background_label_id = 0; - int _top_k = 0; - int _variance_encoded_in_target = 0; - int _keep_top_k = 0; - int _code_type = 0; - - bool _share_location = false; - bool _clip_before_nms = false; // clip bounding boxes before nms step - bool _clip_after_nms = false; // clip bounding boxes after nms step - bool _decrease_label_id = false; - - bool with_add_box_pred = false; - - int _image_width = 0; - int _image_height = 0; - int _prior_size = 4; - bool _normalized = true; - int _offset = 0; - - float _nms_threshold = 0.0f; - float _confidence_threshold = 0.0f; - float _objectness_score = 0.0f; - - int _num = 0; - int _num_loc_classes = 0; - int _num_priors = 0; - bool _priors_batches = false; - - enum CodeType { - CORNER = 1, - CENTER_SIZE = 2, - }; - - void decodeBBoxes(const float *prior_data, const float *loc_data, const float *variance_data, - float *decoded_bboxes, float *decoded_bbox_sizes, int* num_priors_actual, int n, const int& offs, const int& pr_size, - bool decodeType = true); // after ARM = false - - void nms_cf(const float 
*conf_data, const float *bboxes, const float *sizes, - int *buffer, int *indices, int &detections, int num_priors_actual); - - void nms_mx(const float *conf_data, const float *bboxes, const float *sizes, - int *buffer, int *indices, int *detections, int num_priors_actual); - - std::vector _decoded_bboxes; - std::vector _buffer; - std::vector _indices; - std::vector _detections_count; - std::vector _reordered_conf; - std::vector _bbox_sizes; - std::vector _num_priors_actual; -}; - -struct ConfidenceComparator { - explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} - - bool operator()(int idx1, int idx2) { - if (_conf_data[idx1] > _conf_data[idx2]) return true; - if (_conf_data[idx1] < _conf_data[idx2]) return false; - return idx1 < idx2; - } - - const float* _conf_data; -}; - -static inline float JaccardOverlap(const float *decoded_bbox, - const float *bbox_sizes, - const int idx1, - const int idx2) { - float xmin1 = decoded_bbox[idx1*4 + 0]; - float ymin1 = decoded_bbox[idx1*4 + 1]; - float xmax1 = decoded_bbox[idx1*4 + 2]; - float ymax1 = decoded_bbox[idx1*4 + 3]; - - float xmin2 = decoded_bbox[idx2*4 + 0]; - float ymin2 = decoded_bbox[idx2*4 + 1]; - float xmax2 = decoded_bbox[idx2*4 + 2]; - float ymax2 = decoded_bbox[idx2*4 + 3]; - - if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { - return 0.0f; - } - - float intersect_xmin = (std::max)(xmin1, xmin2); - float intersect_ymin = (std::max)(ymin1, ymin2); - float intersect_xmax = (std::min)(xmax1, xmax2); - float intersect_ymax = (std::min)(ymax1, ymax2); - - float intersect_width = intersect_xmax - intersect_xmin; - float intersect_height = intersect_ymax - intersect_ymin; - - if (intersect_width <= 0 || intersect_height <= 0) { - return 0.0f; - } - - float intersect_size = intersect_width * intersect_height; - float bbox1_size = bbox_sizes[idx1]; - float bbox2_size = bbox_sizes[idx2]; - - return intersect_size / (bbox1_size + bbox2_size - intersect_size); -} - 
-void DetectionOutputImpl::decodeBBoxes(const float *prior_data, - const float *loc_data, - const float *variance_data, - float *decoded_bboxes, - float *decoded_bbox_sizes, - int* num_priors_actual, - int n, - const int& offs, - const int& pr_size, - bool decodeType) { - num_priors_actual[n] = _num_priors; - if (!_normalized && decodeType) { - int num = 0; - for (; num < _num_priors; ++num) { - float batch_id = prior_data[num * pr_size + 0]; - if (batch_id == -1.f) { - num_priors_actual[n] = num; - break; - } - } - } - parallel_for(num_priors_actual[n], [&](int p) { - float new_xmin = 0.0f; - float new_ymin = 0.0f; - float new_xmax = 0.0f; - float new_ymax = 0.0f; - - float prior_xmin = prior_data[p*pr_size + 0 + offs]; - float prior_ymin = prior_data[p*pr_size + 1 + offs]; - float prior_xmax = prior_data[p*pr_size + 2 + offs]; - float prior_ymax = prior_data[p*pr_size + 3 + offs]; - - float loc_xmin = loc_data[4*p*_num_loc_classes + 0]; - float loc_ymin = loc_data[4*p*_num_loc_classes + 1]; - float loc_xmax = loc_data[4*p*_num_loc_classes + 2]; - float loc_ymax = loc_data[4*p*_num_loc_classes + 3]; - - if (!_normalized) { - prior_xmin /= _image_width; - prior_ymin /= _image_height; - prior_xmax /= _image_width; - prior_ymax /= _image_height; - } - - if (_code_type == CodeType::CORNER) { - if (_variance_encoded_in_target) { - // variance is encoded in target, we simply need to add the offset predictions. 
- new_xmin = prior_xmin + loc_xmin; - new_ymin = prior_ymin + loc_ymin; - new_xmax = prior_xmax + loc_xmax; - new_ymax = prior_ymax + loc_ymax; - } else { - new_xmin = prior_xmin + variance_data[p*4 + 0] * loc_xmin; - new_ymin = prior_ymin + variance_data[p*4 + 1] * loc_ymin; - new_xmax = prior_xmax + variance_data[p*4 + 2] * loc_xmax; - new_ymax = prior_ymax + variance_data[p*4 + 3] * loc_ymax; - } - } else if (_code_type == CodeType::CENTER_SIZE) { - float prior_width = prior_xmax - prior_xmin; - float prior_height = prior_ymax - prior_ymin; - float prior_center_x = (prior_xmin + prior_xmax) / 2.0f; - float prior_center_y = (prior_ymin + prior_ymax) / 2.0f; - - float decode_bbox_center_x, decode_bbox_center_y; - float decode_bbox_width, decode_bbox_height; - - if (_variance_encoded_in_target) { - // variance is encoded in target, we simply need to restore the offset predictions. - decode_bbox_center_x = loc_xmin * prior_width + prior_center_x; - decode_bbox_center_y = loc_ymin * prior_height + prior_center_y; - decode_bbox_width = std::exp(loc_xmax) * prior_width; - decode_bbox_height = std::exp(loc_ymax) * prior_height; - } else { - // variance is encoded in bbox, we need to scale the offset accordingly. 
- decode_bbox_center_x = variance_data[p*4 + 0] * loc_xmin * prior_width + prior_center_x; - decode_bbox_center_y = variance_data[p*4 + 1] * loc_ymin * prior_height + prior_center_y; - decode_bbox_width = std::exp(variance_data[p*4 + 2] * loc_xmax) * prior_width; - decode_bbox_height = std::exp(variance_data[p*4 + 3] * loc_ymax) * prior_height; - } - - new_xmin = decode_bbox_center_x - decode_bbox_width / 2.0f; - new_ymin = decode_bbox_center_y - decode_bbox_height / 2.0f; - new_xmax = decode_bbox_center_x + decode_bbox_width / 2.0f; - new_ymax = decode_bbox_center_y + decode_bbox_height / 2.0f; - } - - if (_clip_before_nms) { - new_xmin = (std::max)(0.0f, (std::min)(1.0f, new_xmin)); - new_ymin = (std::max)(0.0f, (std::min)(1.0f, new_ymin)); - new_xmax = (std::max)(0.0f, (std::min)(1.0f, new_xmax)); - new_ymax = (std::max)(0.0f, (std::min)(1.0f, new_ymax)); - } - - decoded_bboxes[p*4 + 0] = new_xmin; - decoded_bboxes[p*4 + 1] = new_ymin; - decoded_bboxes[p*4 + 2] = new_xmax; - decoded_bboxes[p*4 + 3] = new_ymax; - - decoded_bbox_sizes[p] = (new_xmax - new_xmin) * (new_ymax - new_ymin); - }); -} - -void DetectionOutputImpl::nms_cf(const float* conf_data, - const float* bboxes, - const float* sizes, - int* buffer, - int* indices, - int& detections, - int num_priors_actual) { - int count = 0; - for (int i = 0; i < num_priors_actual; ++i) { - if (conf_data[i] > _confidence_threshold) { - indices[count] = i; - count++; - } - } - - int num_output_scores = (_top_k == -1 ? 
count : (std::min)(_top_k, count)); - - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, - ConfidenceComparator(conf_data)); - - for (int i = 0; i < num_output_scores; ++i) { - const int idx = buffer[i]; - - bool keep = true; - for (int k = 0; k < detections; ++k) { - const int kept_idx = indices[k]; - float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); - if (overlap > _nms_threshold) { - keep = false; - break; - } - } - if (keep) { - indices[detections] = idx; - detections++; - } - } -} - -void DetectionOutputImpl::nms_mx(const float* conf_data, - const float* bboxes, - const float* sizes, - int* buffer, - int* indices, - int* detections, - int num_priors_actual) { - int count = 0; - for (int i = 0; i < num_priors_actual; ++i) { - float conf = -1; - int id = 0; - for (int c = 1; c < _num_classes; ++c) { - float temp = conf_data[c*_num_priors + i]; - if (temp > conf) { - conf = temp; - id = c; - } - } - - if (id > 0 && conf >= _confidence_threshold) { - indices[count++] = id*_num_priors + i; - } - } - - int num_output_scores = (_top_k == -1 ? 
count : (std::min)(_top_k, count)); - - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, - ConfidenceComparator(conf_data)); - - for (int i = 0; i < num_output_scores; ++i) { - const int idx = buffer[i]; - const int cls = idx/_num_priors; - const int prior = idx%_num_priors; - - int &ndetection = detections[cls]; - int *pindices = indices + cls*_num_priors; - - bool keep = true; - for (int k = 0; k < ndetection; ++k) { - const int kept_idx = pindices[k]; - float overlap = 0.0f; - if (_share_location) { - overlap = JaccardOverlap(bboxes, sizes, prior, kept_idx); - } else { - overlap = JaccardOverlap(bboxes, sizes, cls*_num_priors + prior, cls*_num_priors + kept_idx); - } - if (overlap > _nms_threshold) { - keep = false; - break; - } - } - if (keep) { - pindices[ndetection++] = prior; - } - } -} - -REG_FACTORY_FOR(DetectionOutputImpl, DetectionOutput); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp b/inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp deleted file mode 100644 index fefcee872cea4f..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include "common/tensor_desc_creator.h" -#include - - -namespace { -struct Indexer { - const std::vector dims_; - int total_{1}; - - explicit Indexer(const std::vector& dims) : dims_(dims) { - total_ = 1; - for (size_t i = 0; i < dims_.size(); ++i) { - total_ *= dims_[i]; - } - } - - int operator()(const std::vector& idx) const { - int flat_idx = 0; - assert(idx.size() == dims_.size()); - for (size_t i = 0; i < dims_.size(); ++i) { - assert(0 <= idx[i] && idx[i] < dims_[i]); - flat_idx 
= flat_idx * dims_[i] + idx[i]; - } - assert(flat_idx < total_); - return flat_idx; - } -}; -} // namespace - - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -static -void refine_boxes(const float* boxes, const float* deltas, const float* weights, const float* scores, - float* refined_boxes, float* refined_boxes_areas, float* refined_scores, - const int rois_num, const int classes_num, - const float img_H, const float img_W, - const float max_delta_log_wh, - float coordinates_offset) { - Indexer box_idx({rois_num, 4}); - Indexer delta_idx({rois_num, classes_num, 4}); - Indexer score_idx({rois_num, classes_num}); - - Indexer refined_box_idx({classes_num, rois_num, 4}); - Indexer refined_score_idx({classes_num, rois_num}); - - for (int roi_idx = 0; roi_idx < rois_num; ++roi_idx) { - float x0 = boxes[box_idx({roi_idx, 0})]; - float y0 = boxes[box_idx({roi_idx, 1})]; - float x1 = boxes[box_idx({roi_idx, 2})]; - float y1 = boxes[box_idx({roi_idx, 3})]; - - if (x1 - x0 <= 0 || y1 - y0 <= 0) { - continue; - } - - // width & height of box - const float ww = x1 - x0 + coordinates_offset; - const float hh = y1 - y0 + coordinates_offset; - // center location of box - const float ctr_x = x0 + 0.5f * ww; - const float ctr_y = y0 + 0.5f * hh; - - for (int class_idx = 1; class_idx < classes_num; ++class_idx) { - const float dx = deltas[delta_idx({roi_idx, class_idx, 0})] / weights[0]; - const float dy = deltas[delta_idx({roi_idx, class_idx, 1})] / weights[1]; - const float d_log_w = deltas[delta_idx({roi_idx, class_idx, 2})] / weights[2]; - const float d_log_h = deltas[delta_idx({roi_idx, class_idx, 3})] / weights[3]; - - // new center location according to deltas (dx, dy) - const float pred_ctr_x = dx * ww + ctr_x; - const float pred_ctr_y = dy * hh + ctr_y; - // new width & height according to deltas d(log w), d(log h) - const float pred_w = std::exp((std::min)(d_log_w, max_delta_log_wh)) * ww; - const 
float pred_h = std::exp((std::min)(d_log_h, max_delta_log_wh)) * hh; - - // update upper-left corner location - float x0_new = pred_ctr_x - 0.5f * pred_w; - float y0_new = pred_ctr_y - 0.5f * pred_h; - // update lower-right corner location - float x1_new = pred_ctr_x + 0.5f * pred_w - coordinates_offset; - float y1_new = pred_ctr_y + 0.5f * pred_h - coordinates_offset; - - // adjust new corner locations to be within the image region, - x0_new = std::max(0.0f, x0_new); - y0_new = std::max(0.0f, y0_new); - x1_new = std::max(0.0f, x1_new); - y1_new = std::max(0.0f, y1_new); - - // recompute new width & height - const float box_w = x1_new - x0_new + coordinates_offset; - const float box_h = y1_new - y0_new + coordinates_offset; - - refined_boxes[refined_box_idx({class_idx, roi_idx, 0})] = x0_new; - refined_boxes[refined_box_idx({class_idx, roi_idx, 1})] = y0_new; - refined_boxes[refined_box_idx({class_idx, roi_idx, 2})] = x1_new; - refined_boxes[refined_box_idx({class_idx, roi_idx, 3})] = y1_new; - - refined_boxes_areas[refined_score_idx({class_idx, roi_idx})] = box_w * box_h; - - refined_scores[refined_score_idx({class_idx, roi_idx})] = scores[score_idx({roi_idx, class_idx})]; - } - } -} - -template -static bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - - -struct ConfidenceComparator { - explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} - - bool operator()(int idx1, int idx2) { - if (_conf_data[idx1] > _conf_data[idx2]) return true; - if (_conf_data[idx1] < _conf_data[idx2]) return false; - return idx1 < idx2; - } - - const float* _conf_data; -}; - -static inline float JaccardOverlap(const float *decoded_bbox, - const float *bbox_sizes, - const int idx1, - const int idx2, - const float coordinates_offset = 1) { - float xmin1 = decoded_bbox[idx1 * 4 + 0]; - float ymin1 = decoded_bbox[idx1 * 4 + 1]; - float xmax1 = decoded_bbox[idx1 * 4 + 2]; - float ymax1 = 
decoded_bbox[idx1 * 4 + 3]; - - float xmin2 = decoded_bbox[idx2 * 4 + 0]; - float ymin2 = decoded_bbox[idx2 * 4 + 1]; - float ymax2 = decoded_bbox[idx2 * 4 + 3]; - float xmax2 = decoded_bbox[idx2 * 4 + 2]; - - if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { - return 0.0f; - } - - float intersect_xmin = (std::max)(xmin1, xmin2); - float intersect_ymin = (std::max)(ymin1, ymin2); - float intersect_xmax = (std::min)(xmax1, xmax2); - float intersect_ymax = (std::min)(ymax1, ymax2); - - float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; - float intersect_height = intersect_ymax - intersect_ymin + coordinates_offset; - - if (intersect_width <= 0 || intersect_height <= 0) { - return 0.0f; - } - - float intersect_size = intersect_width * intersect_height; - float bbox1_size = bbox_sizes[idx1]; - float bbox2_size = bbox_sizes[idx2]; - - return intersect_size / (bbox1_size + bbox2_size - intersect_size); -} - - -static void nms_cf(const float* conf_data, - const float* bboxes, - const float* sizes, - int* buffer, - int* indices, - int& detections, - const int boxes_num, - const int pre_nms_topn, - const int post_nms_topn, - const float confidence_threshold, - const float nms_threshold) { - int count = 0; - for (int i = 0; i < boxes_num; ++i) { - if (conf_data[i] > confidence_threshold) { - indices[count] = i; - count++; - } - } - - int num_output_scores = (pre_nms_topn == -1 ? 
count : (std::min)(pre_nms_topn, count)); - - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, - ConfidenceComparator(conf_data)); - - detections = 0; - for (int i = 0; i < num_output_scores; ++i) { - const int idx = buffer[i]; - - bool keep = true; - for (int k = 0; k < detections; ++k) { - const int kept_idx = indices[k]; - float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); - if (overlap > nms_threshold) { - keep = false; - break; - } - } - if (keep) { - indices[detections] = idx; - detections++; - } - } - - detections = (post_nms_topn == -1 ? detections : (std::min)(post_nms_topn, detections)); -} - - -class ExperimentalDetectronDetectionOutputImpl: public ExtLayerBase { -private: - const int INPUT_ROIS {0}; - const int INPUT_DELTAS {1}; - const int INPUT_SCORES {2}; - const int INPUT_IM_INFO {3}; - - const int OUTPUT_BOXES {0}; - const int OUTPUT_CLASSES {1}; - const int OUTPUT_SCORES {2}; - -public: - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto doOp = ngraph::as_type_ptr(op); - if (!doOp) { - errorMessage = "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - - explicit ExperimentalDetectronDetectionOutputImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - auto doOp = ngraph::as_type_ptr(op); - auto attributes = doOp->get_attrs(); - - score_threshold_ = attributes.score_threshold; - nms_threshold_ = attributes.nms_threshold; - max_delta_log_wh_ = attributes.max_delta_log_wh; - classes_num_ = attributes.num_classes; - max_detections_per_class_ = attributes.post_nms_count; - max_detections_per_image_ = attributes.max_detections_per_image; - class_agnostic_box_regression_ = attributes.class_agnostic_box_regression; - deltas_weights_ = attributes.deltas_weights; - - std::vector inDataConfigurators(op->get_input_size(), {TensorDescCreatorTypes::ncsp, Precision::FP32}); - - addConfig(op, inDataConfigurators, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - const int rois_num = inputs[INPUT_ROIS]->getTensorDesc().getDims()[0]; - assert(classes_num_ == static_cast(inputs[INPUT_SCORES]->getTensorDesc().getDims()[1])); - assert(4 * classes_num_ == static_cast(inputs[INPUT_DELTAS]->getTensorDesc().getDims()[1])); - - const auto* boxes = inputs[INPUT_ROIS]->buffer().as(); - const auto* deltas = inputs[INPUT_DELTAS]->buffer().as(); - const auto* scores = inputs[INPUT_SCORES]->buffer().as(); - const auto* im_info = inputs[INPUT_IM_INFO]->buffer().as(); - - auto* output_boxes = outputs[OUTPUT_BOXES]->buffer().as(); - auto* output_scores = outputs[OUTPUT_SCORES]->buffer().as(); - auto* output_classes = outputs[OUTPUT_CLASSES]->buffer().as(); - - const float img_H = im_info[0]; - 
const float img_W = im_info[1]; - - // Apply deltas. - std::vector refined_boxes(classes_num_ * rois_num * 4, 0); - std::vector refined_scores(classes_num_ * rois_num, 0); - std::vector refined_boxes_areas(classes_num_ * rois_num, 0); - Indexer refined_box_idx({classes_num_, rois_num, 4}); - Indexer refined_score_idx({classes_num_, rois_num}); - - refine_boxes(boxes, deltas, &deltas_weights_[0], scores, - &refined_boxes[0], &refined_boxes_areas[0], &refined_scores[0], - rois_num, classes_num_, - img_H, img_W, - max_delta_log_wh_, - 1.0f); - - // Apply NMS class-wise. - std::vector buffer(rois_num, 0); - std::vector indices(classes_num_ * rois_num, 0); - std::vector detections_per_class(classes_num_, 0); - int total_detections_num = 0; - - for (int class_idx = 1; class_idx < classes_num_; ++class_idx) { - nms_cf(&refined_scores[refined_score_idx({class_idx, 0})], - &refined_boxes[refined_box_idx({class_idx, 0, 0})], - &refined_boxes_areas[refined_score_idx({class_idx, 0})], - &buffer[0], - &indices[total_detections_num], - detections_per_class[class_idx], - rois_num, - -1, - max_detections_per_class_, - score_threshold_, - nms_threshold_); - total_detections_num += detections_per_class[class_idx]; - } - - // Leave only max_detections_per_image_ detections. 
- // confidence, - std::vector>> conf_index_class_map; - - int indices_offset = 0; - for (int c = 0; c < classes_num_; ++c) { - int n = detections_per_class[c]; - for (int i = 0; i < n; ++i) { - int idx = indices[indices_offset + i]; - float score = refined_scores[refined_score_idx({c, idx})]; - conf_index_class_map.push_back(std::make_pair(score, std::make_pair(c, idx))); - } - indices_offset += n; - } - - assert(max_detections_per_image_ > 0); - if (total_detections_num > max_detections_per_image_) { - std::partial_sort(conf_index_class_map.begin(), - conf_index_class_map.begin() + max_detections_per_image_, - conf_index_class_map.end(), - SortScorePairDescend>); - conf_index_class_map.resize(max_detections_per_image_); - total_detections_num = max_detections_per_image_; - } - - // Fill outputs. - memset(output_boxes, 0, max_detections_per_image_ * 4 * sizeof(output_boxes[0])); - memset(output_scores, 0, max_detections_per_image_ * sizeof(output_scores[0])); - memset(output_classes, 0, max_detections_per_image_ * sizeof(output_classes[0])); - - int i = 0; - for (const auto & detection : conf_index_class_map) { - float score = detection.first; - int cls = detection.second.first; - int idx = detection.second.second; - output_boxes[4 * i + 0] = refined_boxes[refined_box_idx({cls, idx, 0})]; - output_boxes[4 * i + 1] = refined_boxes[refined_box_idx({cls, idx, 1})]; - output_boxes[4 * i + 2] = refined_boxes[refined_box_idx({cls, idx, 2})]; - output_boxes[4 * i + 3] = refined_boxes[refined_box_idx({cls, idx, 3})]; - output_scores[i] = score; - output_classes[i] = cls; - ++i; - } - - return OK; - } - -private: - float score_threshold_; - float nms_threshold_; - float max_delta_log_wh_; - int classes_num_; - int max_detections_per_class_; - int max_detections_per_image_; - bool class_agnostic_box_regression_; - std::vector deltas_weights_; -}; - - - -REG_FACTORY_FOR(ExperimentalDetectronDetectionOutputImpl, ExperimentalDetectronDetectionOutput); - -} // namespace Cpu -} 
// namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp b/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp deleted file mode 100644 index 4ea74721adca49..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class GatherTreeImpl: public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto gatherElementsOp = ngraph::as_type_ptr(op); - if (!gatherElementsOp) { - errorMessage = "Node is not an instance of the GatherTree operation from operation set v1."; - return false; - } - - auto precision = op->get_input_element_type(GATHER_TREE_STEP_IDX); - if (!MKLDNNPlugin::one_of(precision, ngraph::element::f32, ngraph::element::i32)) - precision = ngraph::element::f32; - if (op->get_input_element_type(GATHER_TREE_PARENT_IDX) != precision || - op->get_input_element_type(GATHER_TREE_MAX_SEQ_LEN) != precision || - op->get_input_element_type(GATHER_TREE_END_TOKEN) != precision || - op->get_output_element_type(0) != precision) { - errorMessage = "Node has incorrect input/output data precision. Must be the same."; - return false; - } - } catch (...) 
{ - return false; - } - - return true; - } - - explicit GatherTreeImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - std::string errorPrefix = std::string("Node GatherTree with name '") + op->get_friendly_name() + "'"; - if (op->get_input_size() != 4) - IE_THROW() << errorPrefix << " has incorrect number of input edges."; - if (op->get_output_size() != 1) - IE_THROW() << errorPrefix << " has incorrect number of output edges."; - - precision = details::convertPrecision(op->get_input_element_type(GATHER_TREE_STEP_IDX)); - if (!MKLDNNPlugin::one_of(precision, Precision::FP32, Precision::I32)) - precision = Precision::FP32; - - if (op->get_input_shape(GATHER_TREE_STEP_IDX).size() != 3) - IE_THROW() << errorPrefix << " step_idx vector should be 3 dimension"; - if (op->get_input_shape(GATHER_TREE_PARENT_IDX).size() != 3) - IE_THROW() << errorPrefix << " parent_idx vector should be 3 dimension"; - if (op->get_input_shape(GATHER_TREE_MAX_SEQ_LEN).size() != 1) - IE_THROW() << errorPrefix << " max_seq_len vector should be 1 dimension"; - if (op->get_input_shape(GATHER_TREE_END_TOKEN).size() != 0) - IE_THROW() << errorPrefix << " end_token should be 1 dimension"; - - addConfig(op, {{TensorDescCreatorTypes::ncsp, precision}, - {TensorDescCreatorTypes::ncsp, precision}, - {TensorDescCreatorTypes::ncsp, precision}, - {TensorDescCreatorTypes::ncsp, precision}}, - {{TensorDescCreatorTypes::ncsp, precision}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - if (precision == Precision::FP32) - return execute_impl(inputs, outputs, resp); - else - return execute_impl(inputs, outputs, resp); - } - - template - StatusCode execute_impl(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept { - const auto 
*step_idx = inputs[GATHER_TREE_STEP_IDX]->cbuffer().as() + - inputs[GATHER_TREE_STEP_IDX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const auto * const parent_idx = inputs[GATHER_TREE_PARENT_IDX]->cbuffer().as() + - inputs[GATHER_TREE_PARENT_IDX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const size_t parent_idx_size = inputs[GATHER_TREE_PARENT_IDX]->size() - - inputs[GATHER_TREE_PARENT_IDX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const auto *max_seq_len = inputs[GATHER_TREE_MAX_SEQ_LEN]->cbuffer().as() + - inputs[GATHER_TREE_MAX_SEQ_LEN]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto end_token = (inputs[GATHER_TREE_END_TOKEN]->cbuffer().as() + - inputs[GATHER_TREE_END_TOKEN]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; - auto * final_idx = outputs[0]->cbuffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - SizeVector step_idx_dims = inputs[GATHER_TREE_STEP_IDX]->getTensorDesc().getDims(); - SizeVector parent_idx_dims = inputs[GATHER_TREE_PARENT_IDX]->getTensorDesc().getDims(); - SizeVector max_seq_len_dims = inputs[GATHER_TREE_MAX_SEQ_LEN]->getTensorDesc().getDims(); - SizeVector final_idx_dims = outputs[0]->getTensorDesc().getDims(); - int32_t max_time = step_idx_dims[0]; - const size_t batch_size = step_idx_dims[1]; - const size_t beam_width = step_idx_dims[2]; - const size_t bb_size = batch_size * beam_width; - - if (max_time != static_cast(parent_idx_dims[0]) || max_time != static_cast(final_idx_dims[0]) || - batch_size != parent_idx_dims[1] || batch_size != final_idx_dims[1] || batch_size != max_seq_len_dims[0] || - beam_width != parent_idx_dims[2] || beam_width != final_idx_dims[2]) { - if (resp) { - std::string errorMsg = "Input/Output tensors dimensions mismatch"; - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return PARAMETER_MISMATCH; - } - - bool incorrect_result = false; - parallel_for2d(batch_size, beam_width, [&](size_t batch, size_t 
beam) { - int32_t max_sequence_in_beam = std::min(max_time, static_cast(max_seq_len[batch])); - if (max_sequence_in_beam > 0) { - int32_t time, idx = (max_time - 1) * bb_size + batch * beam_width; - for (time = (max_time - 1); time >= max_sequence_in_beam; time--, idx -= bb_size) - final_idx[idx + beam] = end_token; - - for (int32_t parent = static_cast(beam); time >= 0; time--, idx -= bb_size) { - if (parent < 0 - || parent >= static_cast(beam_width) - || idx + parent >= parent_idx_size) { - incorrect_result = true; - break; - } - final_idx[idx + beam] = step_idx[idx + parent]; - parent = static_cast(parent_idx[idx + parent]); - } - - bool finished = false; - auto *final = &final_idx[batch * beam_width + beam]; - for (time = 0; time < max_sequence_in_beam; time++, final += bb_size) { - if (finished) - (*final) = end_token; - else if ((*final) == end_token) - finished = true; - } - } - }); - - if (incorrect_result) { - if (resp) { - std::string errorMsg = "Wrong parent index, result is incorrect"; - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return OUT_OF_BOUNDS; - } - - return OK; - } - -private: - static const size_t GATHER_TREE_STEP_IDX = 0; - static const size_t GATHER_TREE_PARENT_IDX = 1; - static const size_t GATHER_TREE_MAX_SEQ_LEN = 2; - static const size_t GATHER_TREE_END_TOKEN = 3; - - InferenceEngine::Precision precision; -}; - -REG_FACTORY_FOR(GatherTreeImpl, GatherTree); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/grn.cpp b/inference-engine/src/mkldnn_plugin/nodes/grn.cpp deleted file mode 100644 index 6ee077fd52ff1e..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/grn.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include "ie_parallel.hpp" -#include - -using namespace MKLDNNPlugin; - -namespace 
InferenceEngine { -namespace Extensions { -namespace Cpu { - -class GRNImpl: public ExtLayerBase { - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto grn = std::dynamic_pointer_cast(op); - if (!grn) { - errorMessage = "Only opset1 GRN operation is supported"; - return false; - } - } catch (...) { - return false; - } - return true; - } - - std::string errorPrefix; - -public: - explicit GRNImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - errorPrefix = "GRN layer with name '" + op->get_friendly_name() + "'"; - const auto grn = std::dynamic_pointer_cast(op); - - if (op->get_input_size() != 1 || op->get_output_size() != 1) - IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; - - bias = grn->get_bias(); - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}, - {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - float* src_data = inputs[0]->buffer(); - float* dst_data = outputs[0]->buffer(); - - SizeVector dims = inputs[0]->getTensorDesc().getDims(); - - int N = static_cast((dims.size() > 0) ? dims[0] : 1); - int C = static_cast((dims.size() > 1) ? dims[1] : 1); - int H = static_cast((dims.size() > 2) ? dims[2] : 1); - int W = static_cast((dims.size() > 3) ? 
dims[3] : 1); - - parallel_for3d(N, H, W, [&](int b, int h, int w) { - double variance = 0; - for (int c = 0; c < C; c++) { - variance += std::pow(src_data[b*C*H*W + c*H*W + h*W + w], 2); - } - variance = std::pow(variance + bias, 0.5f); - for (int c = 0; c < C; c++) { - dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / static_cast(variance); - } - }); - return OK; - } - -private: - float bias = 1.0f; -}; - -REG_FACTORY_FOR(GRNImpl, GRN); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp index d06cefa7985ac2..d005c1e16b630d 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp @@ -7,24 +7,3 @@ # define MKLDNN_EXTENSION_NODE(__prim, __type) #endif -MKLDNN_EXTENSION_NODE(CTCLossImpl, CTCLoss); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronTopKROIsImpl, ExperimentalDetectronTopKROIs); -MKLDNN_EXTENSION_NODE(ExtractImagePatchesImpl, ExtractImagePatches); -MKLDNN_EXTENSION_NODE(ReverseSequenceImpl, ReverseSequence); -MKLDNN_EXTENSION_NODE(DetectionOutputImpl, DetectionOutput); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronDetectionOutputImpl, ExperimentalDetectronDetectionOutput); -MKLDNN_EXTENSION_NODE(LogSoftmaxImpl, LogSoftmax); -MKLDNN_EXTENSION_NODE(ReorgYoloImpl, ReorgYolo); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronROIFeatureExtractorImpl, ExperimentalDetectronROIFeatureExtractor); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronGenerateProposalsSingleImageImpl, ExperimentalDetectronGenerateProposalsSingleImage); -MKLDNN_EXTENSION_NODE(NonMaxSuppressionImpl, NonMaxSuppressionIEInternal); -MKLDNN_EXTENSION_NODE(TopKImpl, TopK); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronPriorGridGeneratorImpl, ExperimentalDetectronPriorGridGenerator); -MKLDNN_EXTENSION_NODE(GRNImpl, GRN); -MKLDNN_EXTENSION_NODE(BucketizeImpl, Bucketize); 
-MKLDNN_EXTENSION_NODE(CTCGreedyDecoderImpl, CTCGreedyDecoder); -MKLDNN_EXTENSION_NODE(CTCGreedyDecoderSeqLenImpl, CTCGreedyDecoderSeqLen); -MKLDNN_EXTENSION_NODE(ProposalImpl, Proposal); -MKLDNN_EXTENSION_NODE(RangeImpl, Range); -MKLDNN_EXTENSION_NODE(GatherTreeImpl, GatherTree); -MKLDNN_EXTENSION_NODE(CumSumImpl, CumSum); diff --git a/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp b/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp deleted file mode 100644 index 337549e3434be0..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include - -using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -class LogSoftmaxImpl: public ExtLayerBase { - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto logSoftMax = std::dynamic_pointer_cast(op); - if (!logSoftMax) { - errorMessage = "Only opset5 LogSoftmax operation is supported"; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - -public: - explicit LogSoftmaxImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - errorPrefix = "LogSoftmax layer with name '" + op->get_friendly_name() + "'"; - const auto logSoftMax = std::dynamic_pointer_cast(op); - - if (op->get_input_size() != 1 || op->get_output_size() != 1) - IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; - - SizeVector dims = op->get_input_shape(0); - if (!dims.size()) - dims = SizeVector(1, 1); - int axis = logSoftMax->get_axis(); - if (axis < 0) - axis += dims.size(); - - if (dims.size() < static_cast((size_t)(1) + axis)) - IE_THROW() << errorPrefix << " has incorrect input parameters dimensions and axis number!"; - - int j; - for (j = dims.size() - 1; j >= 0; j--) { - if (dims[j] != 1) break; - } - if (j == axis) is_last_dim = true; - - for (int i = 0; i < axis; i++) - axis_step *= dims[i]; - reduced_axis_size = dims[axis]; - for (size_t i = (axis + 1); i < dims.size(); i++) - reduced_axis_stride *= dims[i]; - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - const float *src_data = inputs[0]->cbuffer().as() + - inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* dst_data = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - if (is_last_dim) { - parallel_for(axis_step, [&](size_t i) { - const float *src_dataPtr = &src_data[i * reduced_axis_size]; - float *dst_dataPtr = &dst_data[i * reduced_axis_size]; - - float reduce_prod = 0.0f; - const float max = *std::max_element(src_dataPtr, src_dataPtr + reduced_axis_size); - for (size_t 
j = 0; j < reduced_axis_size; ++j) - reduce_prod += expf(src_dataPtr[j] - max); - - reduce_prod = logf(reduce_prod); - for (size_t j = 0; j < reduced_axis_size; ++j) - dst_dataPtr[j] = src_dataPtr[j] - max - reduce_prod; - }); - } else { - parallel_for2d(axis_step, reduced_axis_stride, [&](size_t k, size_t i) { - const float *src_dataPtr = &src_data[k * reduced_axis_stride * reduced_axis_size + i]; - float *dst_dataPtr = &dst_data[k * reduced_axis_stride * reduced_axis_size + i]; - - float reduce_prod = 0.0f; - float max = std::numeric_limits::min(); - for (size_t j = 0; j < reduced_axis_size; ++j) { - if (src_dataPtr[j * reduced_axis_stride] > max) - max = src_dataPtr[j * reduced_axis_stride]; - } - - for (size_t j = 0; j < reduced_axis_size; ++j) - reduce_prod += expf(src_dataPtr[j * reduced_axis_stride] - max); - - reduce_prod = logf(reduce_prod); - for (size_t j = 0; j < reduced_axis_size; ++j) - dst_dataPtr[j * reduced_axis_stride] = src_dataPtr[j * reduced_axis_stride] - max - reduce_prod; - }); - } - - return OK; - } - -private: - size_t reduced_axis_size; - size_t reduced_axis_stride = 1; - size_t axis_step = 1; - bool is_last_dim = false; - - std::string errorPrefix; -}; - -REG_FACTORY_FOR(LogSoftmaxImpl, LogSoftmax); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp new file mode 100644 index 00000000000000..c6c327a1993f3d --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp @@ -0,0 +1,218 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_bucketize_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNBucketizeNode::isSupportedOperation(const 
std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto bucketsize = std::dynamic_pointer_cast(op); + if (!bucketsize) { + errorMessage = "Only opset3 Bucketize operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNBucketizeNode::MKLDNNBucketizeNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "Bucketize layer with name '" + op->get_friendly_name() + "' "; + const auto bucketsize = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 2 || getOriginalOutputsNumber() != 1) { + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + } + + // check one attribute + with_right = bucketsize->get_with_right_bound(); + + // check dimensions of input tensors + SizeVector input_tensor_dims = op->get_input_shape(INPUT_TENSOR_PORT); + if (input_tensor_dims.size() < 1) { + IE_THROW() << errorPrefix << " has incorrect dimensions of the input."; + } + SizeVector input_bin_dims = op->get_input_shape(INPUT_BINS_PORT); + if (input_bin_dims.size() != 1) { + IE_THROW() << errorPrefix << " has incorrect dimensions of the boundaries tensor."; + } + if (input_bin_dims[0] != 0) { + with_bins = true; + } + num_bin_values = input_bin_dims[0]; + + num_values = std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), size_t(1), std::multiplies()); +} + +void MKLDNNBucketizeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + // check precisions for input and output tensors + input_precision = getOriginalInputPrecisionAtPort(INPUT_TENSOR_PORT); + if (input_precision != Precision::FP32 && input_precision != Precision::I32 && + input_precision != Precision::I64) { + input_precision = Precision::FP32; + } 
+ boundaries_precision = getOriginalInputPrecisionAtPort(INPUT_BINS_PORT); + if (boundaries_precision != Precision::FP32 && boundaries_precision != Precision::I32 && + boundaries_precision != Precision::I64) { + boundaries_precision = Precision::FP32; + } + output_precision = getOriginalOutputPrecisionAtPort(OUTPUT_TENSOR_PORT); + if (output_precision != Precision::I32 && output_precision != Precision::I64) { + output_precision = Precision::I32; + } + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, input_precision}, + {TensorDescCreatorTypes::ncsp, boundaries_precision}}, + {{TensorDescCreatorTypes::ncsp, output_precision}}, + impl_desc_type::ref_any); +} + +void MKLDNNBucketizeNode::execute(mkldnn::stream strm) { + auto precision_mask = getPrecisionMask(input_precision, boundaries_precision, output_precision); + + switch (precision_mask) { + case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I32): + bucketize::value_type, + 
PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I64, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I64, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I64, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I64, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + default: + IE_THROW() << errorPrefix << " has 
unsupported precision: " << precision_mask; + } +} + +template +void MKLDNNBucketizeNode::bucketize() { + const auto *input_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const auto *boundaries_data = reinterpret_cast(getParentEdgeAt(1)->getMemoryPtr()->GetPtr()); + auto *output_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + if (!with_bins) { + memset(output_data, 0, num_values * sizeof(T_IND)); + return; + } + + // boundaries are assumed to be sorted and to have unique elements + parallel_for(num_values, [&](size_t ind) { + T value = input_data[ind]; + if (with_right) { + auto low = std::lower_bound(boundaries_data, boundaries_data + num_bin_values, value); + output_data[ind] = static_cast(low - boundaries_data); + } else { + auto up = std::upper_bound(boundaries_data, boundaries_data + num_bin_values, value); + output_data[ind] = static_cast(up - boundaries_data); + } + }); +} + +bool MKLDNNBucketizeNode::created() const { + return getType() == Bucketize; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNBucketizeNode, Bucketize) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h new file mode 100644 index 00000000000000..472e6aee3cfb03 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNBucketizeNode : public MKLDNNNode { +public: + MKLDNNBucketizeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const 
std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + template + void bucketize(); + + const size_t INPUT_TENSOR_PORT = 0; + const size_t INPUT_BINS_PORT = 1; + const size_t OUTPUT_TENSOR_PORT = 0; + + size_t num_values = 0; + size_t num_bin_values = 0; + bool with_right = false; + bool with_bins = false; + + InferenceEngine::Precision input_precision; + InferenceEngine::Precision boundaries_precision; + InferenceEngine::Precision output_precision; + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp index ba760cae535806..4990a658d61f1c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp @@ -33,7 +33,7 @@ namespace { bool MKLDNNConcatNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto concatOp = ngraph::as_type_ptr(op); + const auto concatOp = ngraph::as_type_ptr(op); if (!concatOp) { errorMessage = "Node is not an instance of the Concat operation."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp new file mode 100644 index 00000000000000..34c9aaf191e697 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp @@ -0,0 +1,167 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_ctc_greedy_decoder_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCTCGreedyDecoderNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto greedyDecOp = 
ngraph::as_type_ptr(op); + if (!greedyDecOp) { + errorMessage = "Node is not an instance of the CTCGreedyDecoder operation from operation set v0."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNCTCGreedyDecoderNode::MKLDNNCTCGreedyDecoderNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "CTCGreedyDecoder layer with name '" + op->get_friendly_name() + "' "; + if (getOriginalInputsNumber() != 2) + IE_THROW() << errorPrefix << "has invalid number of input edges: " << getOriginalInputsNumber(); + if (getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << "has invalid number of outputs edges: " << getOriginalOutputsNumber(); + + if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0] && + op->get_input_shape(DATA_INDEX)[1] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[1]) + IE_THROW() << errorPrefix << "has invalid input shapes."; + + auto greedyDecOp = ngraph::as_type_ptr(op); + mergeRepeated = greedyDecOp->get_ctc_merge_repeated(); +} + +void MKLDNNCTCGreedyDecoderNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + Precision inDataPrecision = getOriginalInputPrecisionAtPort(DATA_INDEX); + if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) + IE_THROW() << errorPrefix << "has unsupported 'data' input precision: " << inDataPrecision; + + Precision seqLenPrecision = getOriginalInputPrecisionAtPort(SEQUENCE_LENGTH_INDEX); + if (seqLenPrecision != Precision::FP32 && seqLenPrecision != Precision::BF16) + IE_THROW() << errorPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + 
{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNCTCGreedyDecoderNode::execute(mkldnn::stream strm) { + const float* probabilities = reinterpret_cast(getParentEdgeAt(DATA_INDEX)->getMemoryPtr()->GetPtr()); + const float* sequenceMask = reinterpret_cast(getParentEdgeAt(SEQUENCE_LENGTH_INDEX)->getMemoryPtr()->GetPtr()); + float* outputSequences = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + const size_t T = getParentEdgeAt(DATA_INDEX)->getDims()[0]; + const size_t B = getParentEdgeAt(DATA_INDEX)->getDims()[1]; + const int C = getParentEdgeAt(DATA_INDEX)->getDims()[2]; + const size_t BC = B * C; + const size_t CB1 = C * (B - 1); + + const int blankIndex = C - 1; + + std::vector sequenceLengths(B, 0); + parallel_for(B, [&](size_t b) { + size_t t = 0; + for (; t < T; t++) { + if (sequenceMask[B * t + b] == 0.f) + break; + } + sequenceLengths[b] = t; + }); + + size_t workAmount = 0; + for (size_t b = 0; b < B; b++) { + workAmount += sequenceLengths[b]; + } + + // Parallelization could not be made directly by T due to output index depends on merged classes and + // blank index, thus could not be shared between threads. Better to divide operation on two steps. + // At the first stage find the maximum index. At second stage merge if needed. + // Such approach makes parallelization more efficient. 
+ auto threadBody = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(workAmount, nthr, ithr, start, end); + if (start >= end) + return; + size_t tStart = 0lu, bStart = 0lu; + for (; bStart < B; bStart++) { + tStart += sequenceLengths[bStart]; + if (tStart >= start) { + tStart = start - (tStart - sequenceLengths[bStart]); + break; + } + } + + size_t workCounter = start; + + for (size_t b = bStart; b < B; ++b) { + size_t outputIndex = b * T + tStart; + const float* probs = probabilities + b * C + BC * tStart; + size_t sequenceLength = sequenceLengths[b]; + + for (size_t t = tStart; t < sequenceLength; ++t) { + int maxClassIdx = 0; + + float maxProb = probs[0]; + ++probs; + + for (int c = 1; c < C; ++c, ++probs) { + if (*probs > maxProb) { + maxClassIdx = c; + maxProb = *probs; + } + } + probs += CB1; + outputSequences[outputIndex++] = static_cast(maxClassIdx); + + if (++workCounter >= end) { + return; + } + } + tStart = 0lu; + } + }; // thread body + + parallel_nt(0, threadBody); + + parallel_for(B, [&](size_t b) { + int prevClassIdx = -1; + size_t outputIndex = b * T; + const size_t sequenceLength = sequenceLengths[b]; + float* shiftedOut = outputSequences + b * T; + for (size_t t = 0; t < sequenceLength; ++t) { + if (*shiftedOut < blankIndex && + !(mergeRepeated && *shiftedOut == prevClassIdx)) { + outputSequences[outputIndex++] = *shiftedOut; + } + prevClassIdx = *shiftedOut; + shiftedOut++; + } + std::fill(outputSequences + outputIndex, outputSequences + (b + 1) * T, -1.f); + }); +} + +bool MKLDNNCTCGreedyDecoderNode::created() const { + return getType() == CTCGreedyDecoder; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCTCGreedyDecoderNode, CTCGreedyDecoder) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h new file mode 100644 index 00000000000000..26554ae7333dca --- /dev/null +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCTCGreedyDecoderNode : public MKLDNNNode { +public: + MKLDNNCTCGreedyDecoderNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t DATA_INDEX = 0lu; + const size_t SEQUENCE_LENGTH_INDEX = 1lu; + bool mergeRepeated; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp new file mode 100644 index 00000000000000..0eccdbfa1b5b07 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_ctc_greedy_decoder_seq_len_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCTCGreedyDecoderSeqLenNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto greedyDecOp = ngraph::as_type_ptr(op); + if (!greedyDecOp) { + errorMessage = "Node is not an instance of the CTCGreedyDecoderSeqLen operation from operation set v6."; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNCTCGreedyDecoderSeqLenNode::MKLDNNCTCGreedyDecoderSeqLenNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "CTCGreedyDecoderSeqLen layer with name '" + op->get_friendly_name() + "' "; + if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > 3) + IE_THROW() << errorPrefix << "has invalid number of input edges: " << getOriginalInputsNumber(); + if (getOriginalOutputsNumber() != 2) + IE_THROW() << errorPrefix << "has invalid number of outputs edges: " << getOriginalOutputsNumber(); + + if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0]) + IE_THROW() << errorPrefix << "has invalid input shapes."; + + auto greedyDecOp = ngraph::as_type_ptr(op); + mergeRepeated = greedyDecOp->get_merge_repeated(); +} + +void MKLDNNCTCGreedyDecoderSeqLenNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + Precision inDataPrecision = getOriginalInputPrecisionAtPort(DATA_INDEX); + if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) + IE_THROW() << errorPrefix << "has unsupported 'data' input precision: " << inDataPrecision; + + Precision seqLenPrecision = getOriginalInputPrecisionAtPort(SEQUENCE_LENGTH_INDEX); + if (seqLenPrecision != Precision::I32 && seqLenPrecision != Precision::I64) + IE_THROW() << errorPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + for (int i = 1; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc(inDataConf, + 
{{TensorDescCreatorTypes::ncsp, Precision::I32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}}, + impl_desc_type::ref_any); +} + +void MKLDNNCTCGreedyDecoderSeqLenNode::execute(mkldnn::stream strm) { + const float* probabilities = reinterpret_cast(getParentEdgeAt(DATA_INDEX)->getMemoryPtr()->GetPtr()); + const int* sequenceLengths = reinterpret_cast(getParentEdgeAt(SEQUENCE_LENGTH_INDEX)->getMemoryPtr()->GetPtr()); + int* decodedClasses = reinterpret_cast(getChildEdgesAtPort(DECODED_CLASSES_INDEX)[0]->getMemoryPtr()->GetPtr()); + int* decodedClassesLength = reinterpret_cast(getChildEdgesAtPort(DECODED_CLASSES_LENGTH_INDEX)[0]->getMemoryPtr()->GetPtr()); + + const size_t B = getParentEdgeAt(DATA_INDEX)->getDims()[0];; + const size_t T = getParentEdgeAt(DATA_INDEX)->getDims()[1];; + const int C = getParentEdgeAt(DATA_INDEX)->getDims()[2];; + const size_t TC = T * C; + + int blankIndex = C - 1; + if (inDims.size() > BLANK_INDEX) + blankIndex = (reinterpret_cast(getParentEdgeAt(BLANK_INDEX)->getMemoryPtr()->GetPtr()))[0]; + + size_t workAmount = 0; + for (size_t b = 0; b < B; b++) { + if (sequenceLengths[b] > T) { + std::string errorMsg = errorPrefix + + ". Sequence length " + std::to_string(sequenceLengths[b]) + + " cannot be greater than according decoded classes dimension size " + + std::to_string(getChildEdgesAtPort(DECODED_CLASSES_INDEX)[0]->getDims()[1]); + IE_THROW() << errorMsg; + } + workAmount += sequenceLengths[b]; + } + // Parallelization could not be made directly by T due to output index depends on merged classes and + // blank index, thus could not be shared between threads. Better to divide operation on two steps. + // At the first stage find the maximum index. At second stage merge if needed. + // Such approach makes parallelization more efficient. 
+ auto threadBody = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(workAmount, nthr, ithr, start, end); + if (start >= end) + return; + size_t tStart = 0lu, bStart = 0lu; + for (; bStart < B; bStart++) { + tStart += sequenceLengths[bStart]; + if (tStart >= start) { + tStart = start - (tStart - sequenceLengths[bStart]); + break; + } + } + + size_t workCounter = start; + + for (size_t b = bStart; b < B; ++b) { + size_t outputIndex = b * T + tStart; + const float* probs = probabilities + b * TC + C * tStart; + const size_t actualSeqLen = sequenceLengths[b]; + + for (size_t t = tStart; t < actualSeqLen; ++t) { + int maxClassIdx = 0; + float maxProb = probs[0]; + probs++; + + for (int c = 1; c < C; c++, probs++) { + if (*probs > maxProb) { + maxClassIdx = c; + maxProb = *probs; + } + } + decodedClasses[outputIndex++] = maxClassIdx; + + if (++workCounter >= end) { + return; + } + } + tStart = 0lu; + } + }; // thread body + + parallel_nt(0, threadBody); + + parallel_for(B, [&](size_t b) { + int prevClassIdx = -1; + size_t outputIndex = b * T; + const size_t actualSeqLen = sequenceLengths[b]; + int* shiftedOut = decodedClasses + b * T; + + for (size_t t = 0; t < actualSeqLen; ++t) { + if (*shiftedOut != blankIndex && + !(mergeRepeated && *shiftedOut == prevClassIdx)) { + decodedClasses[outputIndex++] = *shiftedOut; + } + prevClassIdx = *shiftedOut; + shiftedOut++; + } + std::fill(decodedClasses + outputIndex, decodedClasses + (b + 1) * T, -1); + decodedClassesLength[b] = outputIndex - b * T; + }); +} + +bool MKLDNNCTCGreedyDecoderSeqLenNode::created() const { + return getType() == CTCGreedyDecoderSeqLen; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCTCGreedyDecoderSeqLenNode, CTCGreedyDecoderSeqLen) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h new file mode 100644 index 00000000000000..b1d5ab6d9ffef3 --- /dev/null +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCTCGreedyDecoderSeqLenNode : public MKLDNNNode { +public: + MKLDNNCTCGreedyDecoderSeqLenNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t DATA_INDEX = 0lu; + const size_t SEQUENCE_LENGTH_INDEX = 1lu; + const size_t BLANK_INDEX = 2lu; + const size_t DECODED_CLASSES_INDEX = 0lu; + const size_t DECODED_CLASSES_LENGTH_INDEX = 1lu; + bool mergeRepeated; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp new file mode 100644 index 00000000000000..b355dcaefcd4b0 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp @@ -0,0 +1,279 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_ctc_loss_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCTCLossNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto ctcLossOp = ngraph::as_type_ptr(op); + if (!ctcLossOp) { + errorMessage = "Node is not an instance of the CTCLoss operation from operation set v4."; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNCTCLossNode::MKLDNNCTCLossNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = std::string("CTCLoss layer with name '") + op->get_friendly_name() + "'"; + + if (getOriginalInputsNumber() != 4 && getOriginalInputsNumber() != 5) + IE_THROW() << errorPrefix << " has invalid inputs number."; + + auto ctcLossOp = ngraph::as_type_ptr(op); + ctcMergeRepeated = ctcLossOp->get_ctc_merge_repeated(); + preprocessCollapseRepeated = ctcLossOp->get_preprocess_collapse_repeated(); + unique = ctcLossOp->get_unique(); +} + +void MKLDNNCTCLossNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + for (int i = 1; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNCTCLossNode::execute(mkldnn::stream strm) { + StatusCode returnCode = OK; + + const float* logits = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const int* logitsLength = reinterpret_cast(getParentEdgeAt(1)->getMemoryPtr()->GetPtr()); + const int* labels = reinterpret_cast(getParentEdgeAt(2)->getMemoryPtr()->GetPtr()); + const int* labelsLength = reinterpret_cast(getParentEdgeAt(3)->getMemoryPtr()->GetPtr()); + float* dstData = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + const size_t batchNum = getParentEdgeAt(0)->getDims()[0]; + const size_t maxTime = getParentEdgeAt(0)->getDims()[1]; + const size_t classesNum = 
getParentEdgeAt(0)->getDims()[2]; + + int blankIndex = classesNum - 1; + if (inDims.size() > 4) { + blankIndex = reinterpret_cast(getParentEdgeAt(4)->getMemoryPtr()->GetPtr())[0]; + } + + std::vector decodedTargetLenB(batchNum, 0); + std::vector> targetDB(batchNum); + std::vector>> logProbabilitiesB(batchNum); + std::vector errorMsgB(parallel_get_max_threads()); + + auto threadBody_1 = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(batchNum, nthr, ithr, start, end); + if (start >= end) + return; + + for (size_t b = start; b < end; b++) { + if (logitsLength[b] < 0 || labelsLength[b] < 0 || logitsLength[b] > maxTime || labelsLength[b] > logitsLength[b]) { + errorMsgB[ithr] = errorPrefix + ". Logit length cannot be greater than max sequence length. " + + "Label length cannot be greater than a logit length" + + " and both cannot be negative.\nMaxSeqLen: " + + std::to_string(maxTime) + "; Logit len: " + std::to_string(logitsLength[b]) + + "; Label len: " + std::to_string(labelsLength[b]); + returnCode = GENERAL_ERROR; + return; + } + const size_t actualLogitLen = logitsLength[b]; + const size_t actualTargetLen = labelsLength[b]; + size_t decodedTargetLen = 0lu; + + // Decoding target: merge repeated characters if preprocess_collapse_repeated == True, + // find unique elements if unique == True. + // Inserts blanks before each index and a blank at the end. 
+ const int* target = &labels[b * maxTime]; + targetDB[b].resize(actualTargetLen * 2 + 1); + auto& targetD = targetDB[b]; + if (unique) { + std::unordered_set uniqVals; + for (size_t t = 0lu; t < actualTargetLen; t++) { + if (uniqVals.find(target[t]) != uniqVals.end()) { + continue; + } + uniqVals.insert(target[t]); + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = target[t]; + } + targetD[decodedTargetLen++] = blankIndex; + } else if (preprocessCollapseRepeated) { + auto prevValue = target[0]; + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = target[0]; + for (size_t t = 1lu; t < actualTargetLen; t++) { + if (target[t] == prevValue) { + continue; + } + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = prevValue = target[t]; + } + targetD[decodedTargetLen++] = blankIndex; + } else { + for (size_t t = 0lu; t < actualTargetLen; t++) { + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = target[t]; + } + targetD[decodedTargetLen++] = blankIndex; + } + decodedTargetLenB[b] = decodedTargetLen; + + auto& logProbabilities = logProbabilitiesB[b]; + logProbabilities.resize(actualLogitLen); + for (size_t ll = 0; ll < actualLogitLen; ll++) { + logProbabilities[ll].resize(decodedTargetLen); + } + } // for batch + }; // threadBody_1 + + parallel_nt(0, threadBody_1); + if (returnCode != OK) { + std::string resErr(""); + for (auto& err : errorMsgB) { + if (!err.empty()) + resErr += err + "\n"; + } + IE_THROW() << resErr; + } + + const size_t TC = maxTime * classesNum; + + size_t workAmount2 = 0lu; + for (size_t b = 0; b < batchNum; b++) { + workAmount2 += logitsLength[b]; + } + + auto threadBody_2 = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + size_t sB(0lu), sT(0lu); + splitter(workAmount2, nthr, ithr, start, end); + if (start >= end) + return; + int64_t cw = 0, st = start; + for (; sB < batchNum; sB++) { + cw += logitsLength[sB]; + if (cw >= st) { + sT = 
logitsLength[sB] + st - cw; + break; + } + } + size_t workCounter = start; + + for (size_t b = sB; b < batchNum; b++) { + const size_t actualLogitLen = logitsLength[b]; + const size_t decodedTargetLen = decodedTargetLenB[b]; + auto& logProbabilities = logProbabilitiesB[b]; + auto& targetD = targetDB[b]; + + double expSum = 0.0; + size_t btcT = b * TC + sT * classesNum; + // logProbabilities = logSoftmax = logits[b][t][c] - ln(sum_c(exp(logits[b][t]))) + for (size_t t = sT; t < actualLogitLen; t++) { + expSum = 0.0; + for (size_t c = 0lu; c < classesNum; c++) { + expSum += std::exp(logits[btcT + c]); + } + for (size_t s = 0lu; s < decodedTargetLen; s++) { + logProbabilities[t][s] = logits[btcT + targetD[s]] - std::log(expSum); + } + btcT += classesNum; + if (++workCounter >= end) { + return; + } + } + sT = 0lu; + } // for batch + }; // threadBody_2 + + parallel_nt(0, threadBody_2); + + const auto float_inf = std::numeric_limits::infinity(); + + auto sumLogs = [&float_inf](float log1, float log2) { + if (log1 == -float_inf) { + return log2; + } else if (log2 == -float_inf) { + return log1; + } else { + if (log1 > log2) + return log1 + std::log1pf(std::exp(log2 - log1)); + else + return log2 + std::log1pf(std::exp(log1 - log2)); + } + }; + + auto threadBody_3 = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(batchNum, nthr, ithr, start, end); + if (start >= end) + return; + + // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: + // Graves et al., 2006, paragraph 4.1 (10) + for (size_t b = start; b < end; b++) { + auto& targetD = targetDB[b]; + auto& logProbabilities = logProbabilitiesB[b]; + const int actualLogitLen = logitsLength[b]; + const int decodedTargetLen = decodedTargetLenB[b]; + std::vector> logBwd(decodedTargetLen, std::vector(actualLogitLen, -float_inf)); + for (int s = decodedTargetLen - 2; s < decodedTargetLen; s++) + logBwd[s][actualLogitLen - 1] = 0.f; + + 
for (int t = actualLogitLen - 2; t >= 0; t--) { + const int t_1 = t + 1; + for (int s = std::max(0, decodedTargetLen - (2 * (actualLogitLen - t))); + s < std::min(decodedTargetLen, 2 * (t_1)); s++) { + if (ctcMergeRepeated || targetD[s] == blankIndex) { + logBwd[s][t] = sumLogs(logBwd[s][t], + logBwd[s][t_1] + logProbabilities[t_1][s]); + } + + if (s + 1 < decodedTargetLen) { + logBwd[s][t] = sumLogs(logBwd[s][t], + logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); + } + + if (s + 2 < decodedTargetLen) { + if (targetD[s] != blankIndex && (!ctcMergeRepeated || (targetD[s] != targetD[s + 2]))) { + logBwd[s][t] = sumLogs(logBwd[s][t], + logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); + } + } + } + } + + logBwd[0][0] += logProbabilities[0][0]; + logBwd[1][0] += logProbabilities[0][(decodedTargetLen > 1) ? 1 : 0]; + + dstData[b] = -sumLogs(logBwd[0][0], logBwd[1][0]); + } // for batch + }; // threadBody_3 + + parallel_nt(0, threadBody_3); +} + +bool MKLDNNCTCLossNode::created() const { + return getType() == CTCLoss; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCTCLossNode, CTCLoss) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h new file mode 100644 index 00000000000000..b46ff413e829be --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCTCLossNode : public MKLDNNNode { +public: + MKLDNNCTCLossNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, 
std::string& errorMessage) noexcept; + +private: + bool ctcMergeRepeated; + bool preprocessCollapseRepeated; + bool unique; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp new file mode 100644 index 00000000000000..3f6c8f903482ce --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp @@ -0,0 +1,279 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "list.hpp" +#include "base.hpp" + +#include +#include + +#include +#include +#include "ie_parallel.hpp" +#include "ie_precision.hpp" +#include +#include "mkldnn_cum_sum_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCumSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto cumsum = std::dynamic_pointer_cast(op); + if (!cumsum) { + errorMessage = "Only opset3 CumSum operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNCumSumNode::MKLDNNCumSumNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "CumSum layer with name '" + op->get_friendly_name() + "' "; + + if ((getOriginalInputsNumber() != numOfInputs && getOriginalInputsNumber() != (numOfInputs - 1)) || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + const auto &dataShape = op->get_input_shape(CUM_SUM_DATA); + if (dataShape.size() < 1) { + IE_THROW() << errorPrefix << " doesn't support 'data' input tensor with rank: " << dataShape.size(); + } + numOfDims = dataShape.size(); + + const auto cumsum = std::dynamic_pointer_cast(op); + exclusive = cumsum->is_exclusive(); + reverse = cumsum->is_reverse(); + + if (getOriginalInputsNumber() == numOfInputs) { + if (!ngraph::is_scalar(cumsum->get_input_shape(AXIS))) + IE_THROW() << errorPrefix << " doesn't support 'axis' input tensor with non scalar rank"; + } + + if (dataShape != cumsum->get_output_shape(0)) + IE_THROW() << errorPrefix << " has different 'data' input and output dimensions"; + + shape = dataShape; +} + +void MKLDNNCumSumNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + dataPrecision = getOriginalInputPrecisionAtPort(CUM_SUM_DATA); + if (dataPrecision != Precision::I8 && dataPrecision != Precision::U8 && dataPrecision != Precision::I16 && dataPrecision != Precision::I32 && + dataPrecision != Precision::FP32 && dataPrecision != Precision::I64 && dataPrecision != Precision::U64 && dataPrecision != Precision::BF16) + IE_THROW() << errorPrefix << " has unsupported 'data' input precision: " << dataPrecision.name(); + + if (getOriginalInputsNumber() == numOfInputs) { + const auto &axisTensorPrec = 
getOriginalInputPrecisionAtPort(AXIS); + if (axisTensorPrec != Precision::I32 && axisTensorPrec != Precision::I64) + IE_THROW() << errorPrefix << " has unsupported 'axis' input precision: " << axisTensorPrec.name(); + } + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, dataPrecision); + for (int i = 1; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, dataPrecision}}, + impl_desc_type::ref_any); +} + +void MKLDNNCumSumNode::execute(mkldnn::stream strm) { + if (inDims.size() == numOfInputs) + axis = getAxis(getParentEdgeAt(AXIS)->getBlob(), getParentEdgeAt(CUM_SUM_DATA)->getBlob()); + + switch (dataPrecision) { + case Precision::I8 : { + exec(); + break; + } + case Precision::U8 : { + exec(); + break; + } + case Precision::I16 : { + exec(); + break; + } + case Precision::I32 : { + exec(); + break; + } + case Precision::FP32 : { + exec(); + break; + } + case Precision::I64 : { + exec(); + break; + } + case Precision::U64 : { + exec(); + break; + } + default : { + std::string errorMsg = errorPrefix + " has unsupported 'data' input precision: " + dataPrecision.name(); + IE_THROW() << errorMsg; + } + } +} + + +template +void MKLDNNCumSumNode::exec() { + const auto *input = reinterpret_cast(getParentEdgeAt(CUM_SUM_DATA)->getMemoryPtr()->GetPtr()); + auto *output = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + const std::vector strides = getParentEdgeAt(CUM_SUM_DATA)->getDesc().getBlockingDesc().getStrides(); + + if (reverse) { + if (exclusive) { + cumSum(input, output, strides); + } else { + cumSum(input, output, strides); + } + } else { + if (exclusive) { + cumSum(input, output, strides); + } else { + cumSum(input, output, strides); + } + } +} + +template +void MKLDNNCumSumNode::cumSum(const dataType *input, dataType *output, const 
std::vector &strides) { + SizeVector iterationRange(numOfDims - 1); + size_t j = 0; + for (size_t i = 0; i < shape.size(); i++) { + if (i == axis) + continue; + iterationRange[j++] = shape[i]; + } + size_t work_amount_dst = std::accumulate(iterationRange.begin(), iterationRange.end(), 1, std::multiplies()); + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + SizeVector counters(numOfDims - 1, 0); + splitter(work_amount_dst, nthr, ithr, start, end); + + parallelItInit(start, counters, iterationRange); + + for (size_t iwork = start; iwork < end; ++iwork) { + std::vector forStartOffset(numOfDims); + forStartOffset[axis] = 0; + for (size_t offsetIdx = 0, countersIdx = 0; offsetIdx < numOfDims; ++offsetIdx) { + if (offsetIdx == axis) { + continue; + } + forStartOffset[offsetIdx] = counters[countersIdx++]; + } + + size_t startOffset = getStartOffset(forStartOffset, strides); + + const dataType *inputStart = input + startOffset; + dataType *outputStart = output + startOffset; + + size_t offset = strides[axis]; + if (reverse) { + if (exclusive) { + outputStart[offset*(shape[axis] - 1)] = 0; + for (int64_t i = shape[axis] - 2; i >= 0; i--) { + outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset]; + } + } else { + outputStart[offset*(shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; + for (int64_t i = shape[axis] - 2; i >= 0; i--) { + outputStart[i*offset] = inputStart[i*offset] + outputStart[(i+1)*offset]; + } + } + } else { + if (exclusive) { + outputStart[0] = 0; + for (size_t i = 1; i < shape[axis]; i++) { + outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset]; + } + } else { + outputStart[0] = inputStart[0]; + for (size_t i = 1; i < shape[axis]; i++) { + outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset]; + } + } + } + + parallelItStep(counters, iterationRange); + } + }); +} + +void MKLDNNCumSumNode::parallelItInit(size_t start, std::vector& counters, const 
std::vector& iterationRange) { + auto itCounter = counters.rbegin(); + auto itWork = iterationRange.rbegin(); + while (itCounter != counters.rend() && itWork != iterationRange.rend()) { + *itCounter = start % *itWork; + start /= *itWork; + ++itCounter; + ++itWork; + } +} + +inline void MKLDNNCumSumNode::parallelItStep(std::vector& counters, const std::vector& iterationRange) { + auto itCounter = counters.rbegin(); + auto itWork = iterationRange.rbegin(); + + while (itCounter != counters.rend() && itWork != iterationRange.rend()) { + *itCounter = (*itCounter + 1) % *itWork; + if (*itCounter != 0) { + break; + } + ++itCounter; + ++itWork; + } +} + +inline size_t MKLDNNCumSumNode::getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const { + size_t startOffset = 0; + for (size_t idx = 0; idx < forStartOffset.size(); ++idx) { + startOffset += forStartOffset[idx] * strides[idx]; + } + return startOffset; +} + +size_t MKLDNNCumSumNode::getAxis(const Blob::CPtr& _axis, const Blob::CPtr& _data) const { + const auto& axisPrecision = _axis->getTensorDesc().getPrecision(); + const int64_t dataShapeSize = static_cast(_data->getTensorDesc().getDims().size()); + int64_t axisValueFromBlob; + switch (axisPrecision) { + case Precision::I32 : { + const auto *axisPtr = _axis->cbuffer().as(); + axisValueFromBlob = static_cast(axisPtr[0]); + break; + } + case Precision::I64 : { + const auto *axisPtr = _axis->cbuffer().as(); + axisValueFromBlob = axisPtr[0]; + break; + } + default : { + IE_THROW() << errorPrefix << " doesn't support 'axis' input with precision: " << axisPrecision.name(); + } + } + if (axisValueFromBlob < -dataShapeSize || axisValueFromBlob > dataShapeSize - 1) + IE_THROW() << errorPrefix << " has axis with a value out of range: " << axisValueFromBlob; + return axisValueFromBlob >= 0 ? 
axisValueFromBlob : (axisValueFromBlob + dataShapeSize); +} + +bool MKLDNNCumSumNode::created() const { + return getType() == CumSum; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCumSumNode, CumSum) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h new file mode 100644 index 00000000000000..794d6bc73f1722 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCumSumNode : public MKLDNNNode { +public: + MKLDNNCumSumNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + template + void exec(); + + template + void cumSum(const dataType *input, dataType *output, const std::vector &strides); + + void parallelItInit(size_t start, std::vector& counters, const std::vector& iterationRange); + + inline void parallelItStep(std::vector& counters, const std::vector& iterationRange); + + inline size_t getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const; + + size_t getAxis(const InferenceEngine::Blob::CPtr& _axis, const InferenceEngine::Blob::CPtr& _data) const; + + enum { CUM_SUM_DATA, AXIS, numOfInputs }; + bool exclusive; + bool reverse; + size_t numOfDims; + size_t axis = 0; + std::vector shape; + + InferenceEngine::Precision dataPrecision; + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp index dde4d960c5897e..a2fae182a52f70 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp @@ -741,7 +741,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ bool MKLDNNDeformableConvolutionNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto defConvNode = ngraph::as_type_ptr(op); + const auto defConvNode = ngraph::as_type_ptr(op); if (!defConvNode) { errorMessage = "Node is not an instance of DeformableConvolution form the operation set v1."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp new file mode 100644 index 00000000000000..4b8c695a987315 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp @@ -0,0 +1,601 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_detection_output_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +template +static bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +bool MKLDNNDetectionOutputNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto doOp = ngraph::as_type_ptr(op); + if (!doOp) { + errorMessage = "Node is not an instance of the DetectionOutput from the operations set v0."; + return false; + } + if (!details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CENTER_SIZE") && + !details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CORNER")) { + errorMessage = "Unsupported 
code_type attribute: " + doOp->get_attrs().code_type; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNDetectionOutputNode::MKLDNNDetectionOutputNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "DetectionOutput layer with name '" + op->get_friendly_name() + "' "; + + if (getOriginalInputsNumber() != 3 && getOriginalInputsNumber() != 5) + IE_THROW() << errorPrefix << " has incorrect number of input edges."; + + if (getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of output edges."; + + auto doOp = ngraph::as_type_ptr(op); + auto attributes = doOp->get_attrs(); + + _num_classes = attributes.num_classes; + _background_label_id = attributes.background_label_id; + _top_k = attributes.top_k; + _variance_encoded_in_target = attributes.variance_encoded_in_target; + _keep_top_k = attributes.keep_top_k[0]; + _nms_threshold = attributes.nms_threshold; + _confidence_threshold = attributes.confidence_threshold; + _share_location = attributes.share_location; + _clip_before_nms = attributes.clip_before_nms; + _clip_after_nms = attributes.clip_after_nms; + _decrease_label_id = attributes.decrease_label_id; + _normalized = attributes.normalized; + _image_height = attributes.input_height; + _image_width = attributes.input_width; + _prior_size = _normalized ? 4 : 5; + _offset = _normalized ? 0 : 1; + _num_loc_classes = _share_location ? 1 : _num_classes; + + with_add_box_pred = getOriginalInputsNumber() == 5; + _objectness_score = attributes.objectness_score; + + _code_type = (details::CaselessEq()(attributes.code_type, "caffe.PriorBoxParameter.CENTER_SIZE") ? 
+ CodeType::CENTER_SIZE : CodeType::CORNER); + + _num_priors = static_cast(op->get_input_shape(idx_priors).back() / _prior_size); + _priors_batches = op->get_input_shape(idx_priors).front() != 1; + + if (_num_priors * _num_loc_classes * 4 != static_cast(op->get_input_shape(idx_location)[1])) + IE_THROW() << errorPrefix << " has incorrect number of priors must match number of location predictions (" + << _num_priors * _num_loc_classes * 4 << " vs " + << op->get_input_shape(idx_location)[1] << ")"; + + if (_num_priors * _num_classes != static_cast(op->get_input_shape(idx_confidence).back())) + IE_THROW() << " has incorrect number of priors must match number of confidence predictions."; + + if (_decrease_label_id && _background_label_id != 0) + IE_THROW() << errorPrefix << " cannot use decrease_label_id and background_label_id parameter simultaneously."; + + _num = static_cast(op->get_input_shape(idx_confidence)[0]); + + _decoded_bboxes.resize(_num * _num_classes * _num_priors * 4); + _buffer.resize(_num * _num_classes * _num_priors); + _indices.resize(_num * _num_classes * _num_priors); + _detections_count.resize(_num * _num_classes); + _bbox_sizes.resize(_num * _num_classes * _num_priors); + _num_priors_actual.resize(_num); + + const auto &confSize = op->get_input_shape(idx_confidence); + _reordered_conf.resize(std::accumulate(confSize.begin(), confSize.end(), 1, std::multiplies())); +} + +void MKLDNNDetectionOutputNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNDetectionOutputNode::execute(mkldnn::stream strm) { + float *dst_data = 
reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + const float *loc_data = reinterpret_cast(getParentEdgeAt(idx_location)->getMemoryPtr()->GetPtr()); + const float *conf_data = reinterpret_cast(getParentEdgeAt(idx_confidence)->getMemoryPtr()->GetPtr()); + const float *prior_data = reinterpret_cast(getParentEdgeAt(idx_priors)->getMemoryPtr()->GetPtr()); + const float *arm_conf_data = inDims.size() > 3 ? + reinterpret_cast(getParentEdgeAt(idx_arm_confidence)->getMemoryPtr()->GetPtr()) : nullptr; + const float *arm_loc_data = inDims.size() > 4 ? + reinterpret_cast(getParentEdgeAt(idx_arm_location)->getMemoryPtr()->GetPtr()) : nullptr; + + const int N = getParentEdgeAt(idx_confidence)->getDims()[0]; + + float *decoded_bboxes_data = _decoded_bboxes.data(); + float *reordered_conf_data = _reordered_conf.data(); + float *bbox_sizes_data = _bbox_sizes.data(); + int *detections_data = _detections_count.data(); + int *buffer_data = _buffer.data(); + int *indices_data = _indices.data(); + int *num_priors_actual = _num_priors_actual.data(); + + for (int n = 0; n < N; ++n) { + const float *ppriors = prior_data; + const float *prior_variances = prior_data + _num_priors*_prior_size; + if (_priors_batches) { + ppriors += _variance_encoded_in_target ? n*_num_priors*_prior_size : 2*n*_num_priors*_prior_size; + prior_variances += _variance_encoded_in_target ? 
0 : 2*n*_num_priors*_prior_size; + } + + if (_share_location) { + const float *ploc = loc_data + n*4*_num_priors; + float *pboxes = decoded_bboxes_data + n*4*_num_priors; + float *psizes = bbox_sizes_data + n*_num_priors; + + if (with_add_box_pred) { + const float *p_arm_loc = arm_loc_data + n*4*_num_priors; + decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); + } else { + decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + } + } else { + for (int c = 0; c < _num_loc_classes; ++c) { + if (c == _background_label_id) { + continue; + } + const float *ploc = loc_data + n*4*_num_loc_classes*_num_priors + c*4; + float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors + c*4*_num_priors; + float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors + c*_num_priors; + if (with_add_box_pred) { + const float *p_arm_loc = arm_loc_data + n*4*_num_loc_classes*_num_priors + c*4; + decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); + } else { + decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + } + } + } + } + + if (with_add_box_pred) { + for (int n = 0; n < N; ++n) { + for (int p = 0; p < _num_priors; ++p) { + if (arm_conf_data[n*_num_priors*2 + p * 2 + 1] < _objectness_score) { + for (int c = 0; c < _num_classes; ++c) { + reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = c == _background_label_id ? 
1.0f : 0.0f; + } + } else { + for (int c = 0; c < _num_classes; ++c) { + reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; + } + } + } + } + } else { + for (int n = 0; n < N; ++n) { + for (int c = 0; c < _num_classes; ++c) { + for (int p = 0; p < _num_priors; ++p) { + reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; + } + } + } + } + + memset(detections_data, 0, N*_num_classes*sizeof(int)); + + for (int n = 0; n < N; ++n) { + int detections_total = 0; + + if (!_decrease_label_id) { + // Caffe style + parallel_for(_num_classes, [&](int c) { + if (c != _background_label_id) { // Ignore background class + int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; + int *pbuffer = buffer_data + c*_num_priors; + int *pdetections = detections_data + n*_num_classes + c; + + const float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; + const float *pboxes; + const float *psizes; + if (_share_location) { + pboxes = decoded_bboxes_data + n*4*_num_priors; + psizes = bbox_sizes_data + n*_num_priors; + } else { + pboxes = decoded_bboxes_data + n*4*_num_classes*_num_priors + c*4*_num_priors; + psizes = bbox_sizes_data + n*_num_classes*_num_priors + c*_num_priors; + } + + nms_cf(pconf, pboxes, psizes, pbuffer, pindices, *pdetections, num_priors_actual[n]); + } + }); + } else { + // MXNet style + int *pindices = indices_data + n*_num_classes*_num_priors; + int *pbuffer = buffer_data; + int *pdetections = detections_data + n*_num_classes; + + const float *pconf = reordered_conf_data + n*_num_classes*_num_priors; + const float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors; + const float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors; + + nms_mx(pconf, pboxes, psizes, pbuffer, pindices, pdetections, _num_priors); + } + + for (int c = 0; c < 
_num_classes; ++c) { + detections_total += detections_data[n*_num_classes + c]; + } + + if (_keep_top_k > -1 && detections_total > _keep_top_k) { + std::vector>> conf_index_class_map; + + for (int c = 0; c < _num_classes; ++c) { + int detections = detections_data[n*_num_classes + c]; + int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; + + float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; + + for (int i = 0; i < detections; ++i) { + int idx = pindices[i]; + conf_index_class_map.push_back(std::make_pair(pconf[idx], std::make_pair(c, idx))); + } + } + + std::sort(conf_index_class_map.begin(), conf_index_class_map.end(), + SortScorePairDescend>); + conf_index_class_map.resize(_keep_top_k); + + // Store the new indices. + memset(detections_data + n*_num_classes, 0, _num_classes * sizeof(int)); + + for (size_t j = 0; j < conf_index_class_map.size(); ++j) { + int label = conf_index_class_map[j].second.first; + int idx = conf_index_class_map[j].second.second; + int *pindices = indices_data + n * _num_classes * _num_priors + label * _num_priors; + pindices[detections_data[n*_num_classes + label]] = idx; + detections_data[n*_num_classes + label]++; + } + } + } + + const int num_results = getChildEdgesAtPort(0)[0]->getDims()[2]; + const int DETECTION_SIZE = getChildEdgesAtPort(0)[0]->getDims()[3]; + if (DETECTION_SIZE != 7) { + IE_THROW() << NOT_IMPLEMENTED; + } + + int dst_data_size = 0; + if (_keep_top_k > 0) + dst_data_size = N * _keep_top_k * DETECTION_SIZE * sizeof(float); + else if (_top_k > 0) + dst_data_size = N * _top_k * _num_classes * DETECTION_SIZE * sizeof(float); + else + dst_data_size = N * _num_classes * _num_priors * DETECTION_SIZE * sizeof(float); + + if (dst_data_size > getChildEdgesAtPort(0)[0]->getBlob()->byteSize()) { + IE_THROW() << OUT_OF_BOUNDS; + } + memset(dst_data, 0, dst_data_size); + + int count = 0; + for (int n = 0; n < N; ++n) { + const float *pconf = reordered_conf_data + n * _num_priors 
* _num_classes; + const float *pboxes = decoded_bboxes_data + n*_num_priors*4*_num_loc_classes; + const int *pindices = indices_data + n*_num_classes*_num_priors; + + for (int c = 0; c < _num_classes; ++c) { + for (int i = 0; i < detections_data[n*_num_classes + c]; ++i) { + int idx = pindices[c*_num_priors + i]; + + dst_data[count * DETECTION_SIZE + 0] = static_cast(n); + dst_data[count * DETECTION_SIZE + 1] = static_cast(_decrease_label_id ? c-1 : c); + dst_data[count * DETECTION_SIZE + 2] = pconf[c*_num_priors + idx]; + + float xmin = _share_location ? pboxes[idx*4 + 0] : + pboxes[c*4*_num_priors + idx*4 + 0]; + float ymin = _share_location ? pboxes[idx*4 + 1] : + pboxes[c*4*_num_priors + idx*4 + 1]; + float xmax = _share_location ? pboxes[idx*4 + 2] : + pboxes[c*4*_num_priors + idx*4 + 2]; + float ymax = _share_location ? pboxes[idx*4 + 3] : + pboxes[c*4*_num_priors + idx*4 + 3]; + + if (_clip_after_nms) { + xmin = (std::max)(0.0f, (std::min)(1.0f, xmin)); + ymin = (std::max)(0.0f, (std::min)(1.0f, ymin)); + xmax = (std::max)(0.0f, (std::min)(1.0f, xmax)); + ymax = (std::max)(0.0f, (std::min)(1.0f, ymax)); + } + + dst_data[count * DETECTION_SIZE + 3] = xmin; + dst_data[count * DETECTION_SIZE + 4] = ymin; + dst_data[count * DETECTION_SIZE + 5] = xmax; + dst_data[count * DETECTION_SIZE + 6] = ymax; + + ++count; + } + } + } + + if (count < num_results) { + // marker at end of boxes list + dst_data[count * DETECTION_SIZE + 0] = -1; + } +} + +struct ConfidenceComparator { + explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} + + bool operator()(int idx1, int idx2) { + if (_conf_data[idx1] > _conf_data[idx2]) return true; + if (_conf_data[idx1] < _conf_data[idx2]) return false; + return idx1 < idx2; + } + + const float* _conf_data; +}; + +static inline float JaccardOverlap(const float *decoded_bbox, + const float *bbox_sizes, + const int idx1, + const int idx2) { + float xmin1 = decoded_bbox[idx1*4 + 0]; + float ymin1 = 
decoded_bbox[idx1*4 + 1]; + float xmax1 = decoded_bbox[idx1*4 + 2]; + float ymax1 = decoded_bbox[idx1*4 + 3]; + + float xmin2 = decoded_bbox[idx2*4 + 0]; + float ymin2 = decoded_bbox[idx2*4 + 1]; + float xmax2 = decoded_bbox[idx2*4 + 2]; + float ymax2 = decoded_bbox[idx2*4 + 3]; + + if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { + return 0.0f; + } + + float intersect_xmin = (std::max)(xmin1, xmin2); + float intersect_ymin = (std::max)(ymin1, ymin2); + float intersect_xmax = (std::min)(xmax1, xmax2); + float intersect_ymax = (std::min)(ymax1, ymax2); + + float intersect_width = intersect_xmax - intersect_xmin; + float intersect_height = intersect_ymax - intersect_ymin; + + if (intersect_width <= 0 || intersect_height <= 0) { + return 0.0f; + } + + float intersect_size = intersect_width * intersect_height; + float bbox1_size = bbox_sizes[idx1]; + float bbox2_size = bbox_sizes[idx2]; + + return intersect_size / (bbox1_size + bbox2_size - intersect_size); +} + +void MKLDNNDetectionOutputNode::decodeBBoxes(const float *prior_data, + const float *loc_data, + const float *variance_data, + float *decoded_bboxes, + float *decoded_bbox_sizes, + int* num_priors_actual, + int n, + const int& offs, + const int& pr_size, + bool decodeType) { + num_priors_actual[n] = _num_priors; + if (!_normalized && decodeType) { + int num = 0; + for (; num < _num_priors; ++num) { + float batch_id = prior_data[num * pr_size + 0]; + if (batch_id == -1.f) { + num_priors_actual[n] = num; + break; + } + } + } + parallel_for(num_priors_actual[n], [&](int p) { + float new_xmin = 0.0f; + float new_ymin = 0.0f; + float new_xmax = 0.0f; + float new_ymax = 0.0f; + + float prior_xmin = prior_data[p*pr_size + 0 + offs]; + float prior_ymin = prior_data[p*pr_size + 1 + offs]; + float prior_xmax = prior_data[p*pr_size + 2 + offs]; + float prior_ymax = prior_data[p*pr_size + 3 + offs]; + + float loc_xmin = loc_data[4*p*_num_loc_classes + 0]; + float loc_ymin = 
loc_data[4*p*_num_loc_classes + 1]; + float loc_xmax = loc_data[4*p*_num_loc_classes + 2]; + float loc_ymax = loc_data[4*p*_num_loc_classes + 3]; + + if (!_normalized) { + prior_xmin /= _image_width; + prior_ymin /= _image_height; + prior_xmax /= _image_width; + prior_ymax /= _image_height; + } + + if (_code_type == CodeType::CORNER) { + if (_variance_encoded_in_target) { + // variance is encoded in target, we simply need to add the offset predictions. + new_xmin = prior_xmin + loc_xmin; + new_ymin = prior_ymin + loc_ymin; + new_xmax = prior_xmax + loc_xmax; + new_ymax = prior_ymax + loc_ymax; + } else { + new_xmin = prior_xmin + variance_data[p*4 + 0] * loc_xmin; + new_ymin = prior_ymin + variance_data[p*4 + 1] * loc_ymin; + new_xmax = prior_xmax + variance_data[p*4 + 2] * loc_xmax; + new_ymax = prior_ymax + variance_data[p*4 + 3] * loc_ymax; + } + } else if (_code_type == CodeType::CENTER_SIZE) { + float prior_width = prior_xmax - prior_xmin; + float prior_height = prior_ymax - prior_ymin; + float prior_center_x = (prior_xmin + prior_xmax) / 2.0f; + float prior_center_y = (prior_ymin + prior_ymax) / 2.0f; + + float decode_bbox_center_x, decode_bbox_center_y; + float decode_bbox_width, decode_bbox_height; + + if (_variance_encoded_in_target) { + // variance is encoded in target, we simply need to restore the offset predictions. + decode_bbox_center_x = loc_xmin * prior_width + prior_center_x; + decode_bbox_center_y = loc_ymin * prior_height + prior_center_y; + decode_bbox_width = std::exp(loc_xmax) * prior_width; + decode_bbox_height = std::exp(loc_ymax) * prior_height; + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. 
+ decode_bbox_center_x = variance_data[p*4 + 0] * loc_xmin * prior_width + prior_center_x; + decode_bbox_center_y = variance_data[p*4 + 1] * loc_ymin * prior_height + prior_center_y; + decode_bbox_width = std::exp(variance_data[p*4 + 2] * loc_xmax) * prior_width; + decode_bbox_height = std::exp(variance_data[p*4 + 3] * loc_ymax) * prior_height; + } + + new_xmin = decode_bbox_center_x - decode_bbox_width / 2.0f; + new_ymin = decode_bbox_center_y - decode_bbox_height / 2.0f; + new_xmax = decode_bbox_center_x + decode_bbox_width / 2.0f; + new_ymax = decode_bbox_center_y + decode_bbox_height / 2.0f; + } + + if (_clip_before_nms) { + new_xmin = (std::max)(0.0f, (std::min)(1.0f, new_xmin)); + new_ymin = (std::max)(0.0f, (std::min)(1.0f, new_ymin)); + new_xmax = (std::max)(0.0f, (std::min)(1.0f, new_xmax)); + new_ymax = (std::max)(0.0f, (std::min)(1.0f, new_ymax)); + } + + decoded_bboxes[p*4 + 0] = new_xmin; + decoded_bboxes[p*4 + 1] = new_ymin; + decoded_bboxes[p*4 + 2] = new_xmax; + decoded_bboxes[p*4 + 3] = new_ymax; + + decoded_bbox_sizes[p] = (new_xmax - new_xmin) * (new_ymax - new_ymin); + }); +} + +void MKLDNNDetectionOutputNode::nms_cf(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int& detections, + int num_priors_actual) { + int count = 0; + for (int i = 0; i < num_priors_actual; ++i) { + if (conf_data[i] > _confidence_threshold) { + indices[count] = i; + count++; + } + } + + int num_output_scores = (_top_k == -1 ? 
count : (std::min)(_top_k, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + + bool keep = true; + for (int k = 0; k < detections; ++k) { + const int kept_idx = indices[k]; + float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); + if (overlap > _nms_threshold) { + keep = false; + break; + } + } + if (keep) { + indices[detections] = idx; + detections++; + } + } +} + +void MKLDNNDetectionOutputNode::nms_mx(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int* detections, + int num_priors_actual) { + int count = 0; + for (int i = 0; i < num_priors_actual; ++i) { + float conf = -1; + int id = 0; + for (int c = 1; c < _num_classes; ++c) { + float temp = conf_data[c*_num_priors + i]; + if (temp > conf) { + conf = temp; + id = c; + } + } + + if (id > 0 && conf >= _confidence_threshold) { + indices[count++] = id*_num_priors + i; + } + } + + int num_output_scores = (_top_k == -1 ? 
count : (std::min)(_top_k, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + const int cls = idx/_num_priors; + const int prior = idx%_num_priors; + + int &ndetection = detections[cls]; + int *pindices = indices + cls*_num_priors; + + bool keep = true; + for (int k = 0; k < ndetection; ++k) { + const int kept_idx = pindices[k]; + float overlap = 0.0f; + if (_share_location) { + overlap = JaccardOverlap(bboxes, sizes, prior, kept_idx); + } else { + overlap = JaccardOverlap(bboxes, sizes, cls*_num_priors + prior, cls*_num_priors + kept_idx); + } + if (overlap > _nms_threshold) { + keep = false; + break; + } + } + if (keep) { + pindices[ndetection++] = prior; + } + } +} + +bool MKLDNNDetectionOutputNode::created() const { + return getType() == DetectionOutput; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNDetectionOutputNode, DetectionOutput) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h new file mode 100644 index 00000000000000..dbf9bde760907c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h @@ -0,0 +1,86 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNDetectionOutputNode : public MKLDNNNode { +public: + MKLDNNDetectionOutputNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + 
+private: + const int idx_location = 0; + const int idx_confidence = 1; + const int idx_priors = 2; + const int idx_arm_confidence = 3; + const int idx_arm_location = 4; + + int _num_classes = 0; + int _background_label_id = 0; + int _top_k = 0; + int _variance_encoded_in_target = 0; + int _keep_top_k = 0; + int _code_type = 0; + + bool _share_location = false; + bool _clip_before_nms = false; // clip bounding boxes before nms step + bool _clip_after_nms = false; // clip bounding boxes after nms step + bool _decrease_label_id = false; + + bool with_add_box_pred = false; + + int _image_width = 0; + int _image_height = 0; + int _prior_size = 4; + bool _normalized = true; + int _offset = 0; + + float _nms_threshold = 0.0f; + float _confidence_threshold = 0.0f; + float _objectness_score = 0.0f; + + int _num = 0; + int _num_loc_classes = 0; + int _num_priors = 0; + bool _priors_batches = false; + + enum CodeType { + CORNER = 1, + CENTER_SIZE = 2, + }; + + void decodeBBoxes(const float *prior_data, const float *loc_data, const float *variance_data, + float *decoded_bboxes, float *decoded_bbox_sizes, int* num_priors_actual, int n, const int& offs, const int& pr_size, + bool decodeType = true); // after ARM = false + + void nms_cf(const float *conf_data, const float *bboxes, const float *sizes, + int *buffer, int *indices, int &detections, int num_priors_actual); + + void nms_mx(const float *conf_data, const float *bboxes, const float *sizes, + int *buffer, int *indices, int *detections, int num_priors_actual); + + std::vector _decoded_bboxes; + std::vector _buffer; + std::vector _indices; + std::vector _detections_count; + std::vector _reordered_conf; + std::vector _bbox_sizes; + std::vector _num_priors_actual; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h index 6b565370917db7..34e95d45ae06e8 100644 --- 
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h @@ -124,4 +124,3 @@ class MKLDNNEltwiseNode : public MKLDNNNode { }; } // namespace MKLDNNPlugin - diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp index c8810e4444b2a5..f59b69b023d99c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine; bool MKLDNNEmbeddingBagOffsetSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto embBagOffsetSumOp = ngraph::as_type_ptr(op); + const auto embBagOffsetSumOp = ngraph::as_type_ptr(op); if (!embBagOffsetSumOp) { errorMessage = "Node is not an instance of the EmbeddingBagOffsetsSum operation from opset v3."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp index 4d1b808b502fb5..3318e1089faeed 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine; bool MKLDNNEmbeddingBagPackedSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto embBagPackedSumOp = ngraph::as_type_ptr(op); + const auto embBagPackedSumOp = ngraph::as_type_ptr(op); if (!embBagPackedSumOp) { errorMessage = "Node is not an instance of the EmbeddingBagPackedSum operation from opset v3."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp index 798feecf7bd062..82eae04dcc2193 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine; bool MKLDNNEmbeddingSegmentsSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto embBagSegSumOp = ngraph::as_type_ptr(op); + const auto embBagSegSumOp = ngraph::as_type_ptr(op); if (!embBagSegSumOp) { errorMessage = "Node is not an instance of the EmbeddingSegmentsSum operation from opset v3."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp new file mode 100644 index 00000000000000..fe2362003f377a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp @@ -0,0 +1,369 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_experimental_detectron_detection_output_node.h" + + +struct Indexer { + const std::vector dims_; + int total_{1}; + + explicit Indexer(const std::vector& dims) : dims_(dims) { + total_ = 1; + for (size_t i = 0; i < dims_.size(); ++i) { + total_ *= dims_[i]; + } + } + + int operator()(const std::vector& idx) const { + int flat_idx = 0; + assert(idx.size() == dims_.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + assert(0 <= idx[i] && idx[i] < dims_[i]); + flat_idx = flat_idx * dims_[i] + idx[i]; + } + assert(flat_idx < total_); + return flat_idx; + } +}; + + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +static +void refine_boxes(const float* boxes, const 
float* deltas, const float* weights, const float* scores, + float* refined_boxes, float* refined_boxes_areas, float* refined_scores, + const int rois_num, const int classes_num, + const float img_H, const float img_W, + const float max_delta_log_wh, + float coordinates_offset) { + Indexer box_idx({rois_num, 4}); + Indexer delta_idx({rois_num, classes_num, 4}); + Indexer score_idx({rois_num, classes_num}); + + Indexer refined_box_idx({classes_num, rois_num, 4}); + Indexer refined_score_idx({classes_num, rois_num}); + + for (int roi_idx = 0; roi_idx < rois_num; ++roi_idx) { + float x0 = boxes[box_idx({roi_idx, 0})]; + float y0 = boxes[box_idx({roi_idx, 1})]; + float x1 = boxes[box_idx({roi_idx, 2})]; + float y1 = boxes[box_idx({roi_idx, 3})]; + + if (x1 - x0 <= 0 || y1 - y0 <= 0) { + continue; + } + + // width & height of box + const float ww = x1 - x0 + coordinates_offset; + const float hh = y1 - y0 + coordinates_offset; + // center location of box + const float ctr_x = x0 + 0.5f * ww; + const float ctr_y = y0 + 0.5f * hh; + + for (int class_idx = 1; class_idx < classes_num; ++class_idx) { + const float dx = deltas[delta_idx({roi_idx, class_idx, 0})] / weights[0]; + const float dy = deltas[delta_idx({roi_idx, class_idx, 1})] / weights[1]; + const float d_log_w = deltas[delta_idx({roi_idx, class_idx, 2})] / weights[2]; + const float d_log_h = deltas[delta_idx({roi_idx, class_idx, 3})] / weights[3]; + + // new center location according to deltas (dx, dy) + const float pred_ctr_x = dx * ww + ctr_x; + const float pred_ctr_y = dy * hh + ctr_y; + // new width & height according to deltas d(log w), d(log h) + const float pred_w = std::exp((std::min)(d_log_w, max_delta_log_wh)) * ww; + const float pred_h = std::exp((std::min)(d_log_h, max_delta_log_wh)) * hh; + + // update upper-left corner location + float x0_new = pred_ctr_x - 0.5f * pred_w; + float y0_new = pred_ctr_y - 0.5f * pred_h; + // update lower-right corner location + float x1_new = pred_ctr_x + 0.5f * pred_w - 
coordinates_offset; + float y1_new = pred_ctr_y + 0.5f * pred_h - coordinates_offset; + + // adjust new corner locations to be within the image region, + x0_new = std::max(0.0f, x0_new); + y0_new = std::max(0.0f, y0_new); + x1_new = std::max(0.0f, x1_new); + y1_new = std::max(0.0f, y1_new); + + // recompute new width & height + const float box_w = x1_new - x0_new + coordinates_offset; + const float box_h = y1_new - y0_new + coordinates_offset; + + refined_boxes[refined_box_idx({class_idx, roi_idx, 0})] = x0_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 1})] = y0_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 2})] = x1_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 3})] = y1_new; + + refined_boxes_areas[refined_score_idx({class_idx, roi_idx})] = box_w * box_h; + + refined_scores[refined_score_idx({class_idx, roi_idx})] = scores[score_idx({roi_idx, class_idx})]; + } + } +} + +template +static bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + + +struct ConfidenceComparator { + explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} + + bool operator()(int idx1, int idx2) { + if (_conf_data[idx1] > _conf_data[idx2]) return true; + if (_conf_data[idx1] < _conf_data[idx2]) return false; + return idx1 < idx2; + } + + const float* _conf_data; +}; + +static inline float JaccardOverlap(const float *decoded_bbox, + const float *bbox_sizes, + const int idx1, + const int idx2, + const float coordinates_offset = 1) { + float xmin1 = decoded_bbox[idx1 * 4 + 0]; + float ymin1 = decoded_bbox[idx1 * 4 + 1]; + float xmax1 = decoded_bbox[idx1 * 4 + 2]; + float ymax1 = decoded_bbox[idx1 * 4 + 3]; + + float xmin2 = decoded_bbox[idx2 * 4 + 0]; + float ymin2 = decoded_bbox[idx2 * 4 + 1]; + float ymax2 = decoded_bbox[idx2 * 4 + 3]; + float xmax2 = decoded_bbox[idx2 * 4 + 2]; + + if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { + return 0.0f; 
+ } + + float intersect_xmin = (std::max)(xmin1, xmin2); + float intersect_ymin = (std::max)(ymin1, ymin2); + float intersect_xmax = (std::min)(xmax1, xmax2); + float intersect_ymax = (std::min)(ymax1, ymax2); + + float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; + float intersect_height = intersect_ymax - intersect_ymin + coordinates_offset; + + if (intersect_width <= 0 || intersect_height <= 0) { + return 0.0f; + } + + float intersect_size = intersect_width * intersect_height; + float bbox1_size = bbox_sizes[idx1]; + float bbox2_size = bbox_sizes[idx2]; + + return intersect_size / (bbox1_size + bbox2_size - intersect_size); +} + + +static void nms_cf(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int& detections, + const int boxes_num, + const int pre_nms_topn, + const int post_nms_topn, + const float confidence_threshold, + const float nms_threshold) { + int count = 0; + for (int i = 0; i < boxes_num; ++i) { + if (conf_data[i] > confidence_threshold) { + indices[count] = i; + count++; + } + } + + int num_output_scores = (pre_nms_topn == -1 ? count : (std::min)(pre_nms_topn, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + detections = 0; + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + + bool keep = true; + for (int k = 0; k < detections; ++k) { + const int kept_idx = indices[k]; + float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); + if (overlap > nms_threshold) { + keep = false; + break; + } + } + if (keep) { + indices[detections] = idx; + detections++; + } + } + + detections = (post_nms_topn == -1 ? 
detections : (std::min)(post_nms_topn, detections)); +} + +bool MKLDNNExperimentalDetectronDetectionOutputNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto doOp = ngraph::as_type_ptr(op); + if (!doOp) { + errorMessage = "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronDetectionOutputNode::MKLDNNExperimentalDetectronDetectionOutputNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + auto doOp = ngraph::as_type_ptr(op); + auto attributes = doOp->get_attrs(); + + score_threshold_ = attributes.score_threshold; + nms_threshold_ = attributes.nms_threshold; + max_delta_log_wh_ = attributes.max_delta_log_wh; + classes_num_ = attributes.num_classes; + max_detections_per_class_ = attributes.post_nms_count; + max_detections_per_image_ = attributes.max_detections_per_image; + class_agnostic_box_regression_ = attributes.class_agnostic_box_regression; + deltas_weights_ = attributes.deltas_weights; +} + +void MKLDNNExperimentalDetectronDetectionOutputNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronDetectionOutputNode::execute(mkldnn::stream strm) { + const int 
rois_num = getParentEdgeAt(INPUT_ROIS)->getDims()[0]; + assert(classes_num_ == static_cast(getParentEdgeAt(INPUT_SCORES)->getDims()[1])); + assert(4 * classes_num_ == static_cast(getParentEdgeAt(INPUT_DELTAS)->getDims()[1])); + + const auto* boxes = reinterpret_cast(getParentEdgeAt(INPUT_ROIS)->getMemoryPtr()->GetPtr()); + const auto* deltas = reinterpret_cast(getParentEdgeAt(INPUT_DELTAS)->getMemoryPtr()->GetPtr()); + const auto* scores = reinterpret_cast(getParentEdgeAt(INPUT_SCORES)->getMemoryPtr()->GetPtr()); + const auto* im_info = reinterpret_cast(getParentEdgeAt(INPUT_IM_INFO)->getMemoryPtr()->GetPtr()); + + auto* output_boxes = reinterpret_cast(getChildEdgesAtPort(OUTPUT_BOXES)[0]->getMemoryPtr()->GetPtr()); + auto* output_scores = reinterpret_cast(getChildEdgesAtPort(OUTPUT_SCORES)[0]->getMemoryPtr()->GetPtr()); + auto* output_classes = reinterpret_cast(getChildEdgesAtPort(OUTPUT_CLASSES)[0]->getMemoryPtr()->GetPtr()); + + const float img_H = im_info[0]; + const float img_W = im_info[1]; + + // Apply deltas. + std::vector refined_boxes(classes_num_ * rois_num * 4, 0); + std::vector refined_scores(classes_num_ * rois_num, 0); + std::vector refined_boxes_areas(classes_num_ * rois_num, 0); + Indexer refined_box_idx({classes_num_, rois_num, 4}); + Indexer refined_score_idx({classes_num_, rois_num}); + + refine_boxes(boxes, deltas, &deltas_weights_[0], scores, + &refined_boxes[0], &refined_boxes_areas[0], &refined_scores[0], + rois_num, classes_num_, + img_H, img_W, + max_delta_log_wh_, + 1.0f); + + // Apply NMS class-wise. 
+ std::vector buffer(rois_num, 0); + std::vector indices(classes_num_ * rois_num, 0); + std::vector detections_per_class(classes_num_, 0); + int total_detections_num = 0; + + for (int class_idx = 1; class_idx < classes_num_; ++class_idx) { + nms_cf(&refined_scores[refined_score_idx({class_idx, 0})], + &refined_boxes[refined_box_idx({class_idx, 0, 0})], + &refined_boxes_areas[refined_score_idx({class_idx, 0})], + &buffer[0], + &indices[total_detections_num], + detections_per_class[class_idx], + rois_num, + -1, + max_detections_per_class_, + score_threshold_, + nms_threshold_); + total_detections_num += detections_per_class[class_idx]; + } + + // Leave only max_detections_per_image_ detections. + // confidence, + std::vector>> conf_index_class_map; + + int indices_offset = 0; + for (int c = 0; c < classes_num_; ++c) { + int n = detections_per_class[c]; + for (int i = 0; i < n; ++i) { + int idx = indices[indices_offset + i]; + float score = refined_scores[refined_score_idx({c, idx})]; + conf_index_class_map.push_back(std::make_pair(score, std::make_pair(c, idx))); + } + indices_offset += n; + } + + assert(max_detections_per_image_ > 0); + if (total_detections_num > max_detections_per_image_) { + std::partial_sort(conf_index_class_map.begin(), + conf_index_class_map.begin() + max_detections_per_image_, + conf_index_class_map.end(), + SortScorePairDescend>); + conf_index_class_map.resize(max_detections_per_image_); + total_detections_num = max_detections_per_image_; + } + + // Fill outputs. 
+ memset(output_boxes, 0, max_detections_per_image_ * 4 * sizeof(output_boxes[0])); + memset(output_scores, 0, max_detections_per_image_ * sizeof(output_scores[0])); + memset(output_classes, 0, max_detections_per_image_ * sizeof(output_classes[0])); + + int i = 0; + for (const auto & detection : conf_index_class_map) { + float score = detection.first; + int cls = detection.second.first; + int idx = detection.second.second; + output_boxes[4 * i + 0] = refined_boxes[refined_box_idx({cls, idx, 0})]; + output_boxes[4 * i + 1] = refined_boxes[refined_box_idx({cls, idx, 1})]; + output_boxes[4 * i + 2] = refined_boxes[refined_box_idx({cls, idx, 2})]; + output_boxes[4 * i + 3] = refined_boxes[refined_box_idx({cls, idx, 3})]; + output_scores[i] = score; + output_classes[i] = cls; + ++i; + } +} + +bool MKLDNNExperimentalDetectronDetectionOutputNode::created() const { + return getType() == ExperimentalDetectronDetectionOutput; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronDetectionOutputNode, ExperimentalDetectronDetectionOutput) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h new file mode 100644 index 00000000000000..2df28ce5c4983b --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronDetectionOutputNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronDetectionOutputNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) 
override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const int INPUT_ROIS {0}; + const int INPUT_DELTAS {1}; + const int INPUT_SCORES {2}; + const int INPUT_IM_INFO {3}; + + const int OUTPUT_BOXES {0}; + const int OUTPUT_CLASSES {1}; + const int OUTPUT_SCORES {2}; + + float score_threshold_; + float nms_threshold_; + float max_delta_log_wh_; + int classes_num_; + int max_detections_per_class_; + int max_detections_per_image_; + bool class_agnostic_box_regression_; + std::vector deltas_weights_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp new file mode 100644 index 00000000000000..255f8443765660 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp @@ -0,0 +1,429 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#if defined(HAVE_AVX2) +#include +#endif + +#include +#include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" +#include "mkldnn_experimental_detectron_generate_proposals_single_image_node.h" + +namespace { +struct Indexer4d { + int dim3_; + int dim23_; + int dim123_; + + explicit Indexer4d(int dim0, int dim1, int dim2, int dim3): + dim3_(dim3), dim23_(dim2 * dim3), dim123_(dim1 * dim2 * dim3) { + (void)dim0; + } + + int operator()(int i, int j, int k, int n) const { + return i * dim123_ + j * dim23_ + k * dim3_ + n; + } +}; +} // namespace + + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +static +void refine_anchors(const float* deltas, const float* scores, 
const float* anchors, + float* proposals, const int anchors_num, const int bottom_H, + const int bottom_W, const float img_H, const float img_W, + const float min_box_H, const float min_box_W, + const float max_delta_log_wh, + float coordinates_offset) { + Indexer4d delta_idx(anchors_num, 4, bottom_H, bottom_W); + Indexer4d score_idx(anchors_num, 1, bottom_H, bottom_W); + Indexer4d proposal_idx(bottom_H, bottom_W, anchors_num, 5); + Indexer4d anchor_idx(bottom_H, bottom_W, anchors_num, 4); + + parallel_for2d(bottom_H, bottom_W, [&](int h, int w) { + for (int anchor = 0; anchor < anchors_num; ++anchor) { + int a_idx = anchor_idx(h, w, anchor, 0); + float x0 = anchors[a_idx + 0]; + float y0 = anchors[a_idx + 1]; + float x1 = anchors[a_idx + 2]; + float y1 = anchors[a_idx + 3]; + + const float dx = deltas[delta_idx(anchor, 0, h, w)]; + const float dy = deltas[delta_idx(anchor, 1, h, w)]; + const float d_log_w = deltas[delta_idx(anchor, 2, h, w)]; + const float d_log_h = deltas[delta_idx(anchor, 3, h, w)]; + + const float score = scores[score_idx(anchor, 0, h, w)]; + + // width & height of box + const float ww = x1 - x0 + coordinates_offset; + const float hh = y1 - y0 + coordinates_offset; + // center location of box + const float ctr_x = x0 + 0.5f * ww; + const float ctr_y = y0 + 0.5f * hh; + + // new center location according to deltas (dx, dy) + const float pred_ctr_x = dx * ww + ctr_x; + const float pred_ctr_y = dy * hh + ctr_y; + // new width & height according to deltas d(log w), d(log h) + const float pred_w = std::exp(std::min(d_log_w, max_delta_log_wh)) * ww; + const float pred_h = std::exp(std::min(d_log_h, max_delta_log_wh)) * hh; + + // update upper-left corner location + x0 = pred_ctr_x - 0.5f * pred_w; + y0 = pred_ctr_y - 0.5f * pred_h; + // update lower-right corner location + x1 = pred_ctr_x + 0.5f * pred_w - coordinates_offset; + y1 = pred_ctr_y + 0.5f * pred_h - coordinates_offset; + + // adjust new corner locations to be within the image region, + x0 
= std::max(0.0f, std::min(x0, img_W - coordinates_offset)); + y0 = std::max(0.0f, std::min(y0, img_H - coordinates_offset)); + x1 = std::max(0.0f, std::min(x1, img_W - coordinates_offset)); + y1 = std::max(0.0f, std::min(y1, img_H - coordinates_offset)); + + // recompute new width & height + const float box_w = x1 - x0 + coordinates_offset; + const float box_h = y1 - y0 + coordinates_offset; + + int p_idx = proposal_idx(h, w, anchor, 0); + proposals[p_idx + 0] = x0; + proposals[p_idx + 1] = y0; + proposals[p_idx + 2] = x1; + proposals[p_idx + 3] = y1; + proposals[p_idx + 4] = (min_box_W <= box_w) * (min_box_H <= box_h) * score; + } + }); +} + +static void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int pre_nms_topn) { + parallel_for(pre_nms_topn, [&](size_t i) { + unpacked_boxes[0*pre_nms_topn + i] = p_proposals[5*i + 0]; + unpacked_boxes[1*pre_nms_topn + i] = p_proposals[5*i + 1]; + unpacked_boxes[2*pre_nms_topn + i] = p_proposals[5*i + 2]; + unpacked_boxes[3*pre_nms_topn + i] = p_proposals[5*i + 3]; + unpacked_boxes[4*pre_nms_topn + i] = p_proposals[5*i + 4]; + }); +} + +static +void nms_cpu(const int num_boxes, int is_dead[], + const float* boxes, int index_out[], int* const num_out, + const int base_index, const float nms_thresh, const int max_num_out, + float coordinates_offset) { + const int num_proposals = num_boxes; + int count = 0; + + const float* x0 = boxes + 0 * num_proposals; + const float* y0 = boxes + 1 * num_proposals; + const float* x1 = boxes + 2 * num_proposals; + const float* y1 = boxes + 3 * num_proposals; + + std::memset(is_dead, 0, num_boxes * sizeof(int)); + +#if defined(HAVE_AVX2) + __m256 vc_fone = _mm256_set1_ps(coordinates_offset); + __m256i vc_ione = _mm256_set1_epi32(1); + __m256 vc_zero = _mm256_set1_ps(0.0f); + + __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh); +#endif + + for (int box = 0; box < num_boxes; ++box) { + if (is_dead[box]) + continue; + + index_out[count++] = base_index + box; + if (count == 
max_num_out) + break; + + int tail = box + 1; + +#if defined(HAVE_AVX2) + __m256 vx0i = _mm256_set1_ps(x0[box]); + __m256 vy0i = _mm256_set1_ps(y0[box]); + __m256 vx1i = _mm256_set1_ps(x1[box]); + __m256 vy1i = _mm256_set1_ps(y1[box]); + + __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); + __m256 vA_height = _mm256_sub_ps(vy1i, vy0i); + __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); + + for (; tail <= num_boxes - 8; tail += 8) { + __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail); + __m256i vdst = _mm256_loadu_si256(pdst); + + __m256 vx0j = _mm256_loadu_ps(x0 + tail); + __m256 vy0j = _mm256_loadu_ps(y0 + tail); + __m256 vx1j = _mm256_loadu_ps(x1 + tail); + __m256 vy1j = _mm256_loadu_ps(y1 + tail); + + __m256 vx0 = _mm256_max_ps(vx0i, vx0j); + __m256 vy0 = _mm256_max_ps(vy0i, vy0j); + __m256 vx1 = _mm256_min_ps(vx1i, vx1j); + __m256 vy1 = _mm256_min_ps(vy1i, vy1j); + + __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); + __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone); + __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight)); + + __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); + __m256 vB_height = _mm256_sub_ps(vy1j, vy0j); + __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); + + __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea); + __m256 vintersection_area = _mm256_div_ps(varea, vdivisor); + + __m256 vcmp_0 = _mm256_cmp_ps(vx0i, vx1j, _CMP_LE_OS); + __m256 vcmp_1 = _mm256_cmp_ps(vy0i, vy1j, _CMP_LE_OS); + __m256 vcmp_2 = _mm256_cmp_ps(vx0j, vx1i, _CMP_LE_OS); + __m256 vcmp_3 = _mm256_cmp_ps(vy0j, vy1i, _CMP_LE_OS); + __m256 vcmp_4 = _mm256_cmp_ps(vc_nms_thresh, vintersection_area, _CMP_LT_OS); + + vcmp_0 = _mm256_and_ps(vcmp_0, vcmp_1); + vcmp_2 = _mm256_and_ps(vcmp_2, vcmp_3); + vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_0); + vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_2); + + 
_mm256_storeu_si256(pdst, _mm256_blendv_epi8(vdst, vc_ione, _mm256_castps_si256(vcmp_4))); + } +#endif + + for (; tail < num_boxes; ++tail) { + float res = 0.0f; + + const float x0i = x0[box]; + const float y0i = y0[box]; + const float x1i = x1[box]; + const float y1i = y1[box]; + + const float x0j = x0[tail]; + const float y0j = y0[tail]; + const float x1j = x1[tail]; + const float y1j = y1[tail]; + + if (x0i <= x1j && y0i <= y1j && x0j <= x1i && y0j <= y1i) { + // overlapped region (= box) + const float x0 = std::max(x0i, x0j); + const float y0 = std::max(y0i, y0j); + const float x1 = std::min(x1i, x1j); + const float y1 = std::min(y1i, y1j); + + // intersection area + const float width = std::max(0.0f, x1 - x0 + coordinates_offset); + const float height = std::max(0.0f, y1 - y0 + coordinates_offset); + const float area = width * height; + + // area of A, B + const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset); + const float B_area = (x1j - x0j + coordinates_offset) * (y1j - y0j + coordinates_offset); + + // IoU + res = area / (A_area + B_area - area); + } + + if (nms_thresh < res) + is_dead[tail] = 1; + } + } + + *num_out = count; +} + + +static +void fill_output_blobs(const float* proposals, const int* roi_indices, + float* rois, float* scores, + const int num_proposals, const int num_rois, const int post_nms_topn) { + const float *src_x0 = proposals + 0 * num_proposals; + const float *src_y0 = proposals + 1 * num_proposals; + const float *src_x1 = proposals + 2 * num_proposals; + const float *src_y1 = proposals + 3 * num_proposals; + const float *src_score = proposals + 4 * num_proposals; + + parallel_for(num_rois, [&](size_t i) { + int index = roi_indices[i]; + rois[i * 4 + 0] = src_x0[index]; + rois[i * 4 + 1] = src_y0[index]; + rois[i * 4 + 2] = src_x1[index]; + rois[i * 4 + 3] = src_y1[index]; + scores[i] = src_score[index]; + }); + + if (num_rois < post_nms_topn) { + for (int i = 4 * num_rois; i < 4 * post_nms_topn; 
i++) { + rois[i] = 0.f; + } + for (int i = num_rois; i < post_nms_topn; i++) { + scores[i] = 0.f; + } + } +} + +bool MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::isSupportedOperation + (const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto proposalOp = ngraph::as_type_ptr(op); + if (!proposalOp) { + errorMessage = "Node is not an instance of the Proposal from the operations set v0."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + auto proposalOp = ngraph::as_type_ptr(op); + auto proposalAttrs = proposalOp->get_attrs(); + + min_size_ = proposalAttrs.min_size; + nms_thresh_ = proposalAttrs.nms_threshold; + pre_nms_topn_ = proposalAttrs.pre_nms_count; + post_nms_topn_ = proposalAttrs.post_nms_count; + + coordinates_offset = 0.0f; + + roi_indices_.resize(post_nms_topn_); +} + +void MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::execute(mkldnn::stream strm) { + try { + if (inDims.size() != 4 || outDims.size() != 2) { + IE_THROW() << "Incorrect number of input or output edges!"; + } + + size_t 
anchor_dims_size = 1; + for (size_t i = 0; i < getParentEdgeAt(INPUT_ANCHORS)->getDims().ToSizeVector().size(); i++) { + anchor_dims_size *= getParentEdgeAt(INPUT_ANCHORS)->getDims().ToSizeVector()[i]; + } + + size_t deltas_dims_size = 1; + for (size_t i = 0; i < getParentEdgeAt(INPUT_DELTAS)->getDims().ToSizeVector().size(); i++) { + deltas_dims_size *= getParentEdgeAt(INPUT_DELTAS)->getDims().ToSizeVector()[i]; + } + if (anchor_dims_size != deltas_dims_size) + IE_THROW() << "'Anchors' blob size for ONNXProposal is incompatible with 'deltas' blob size!"; + + size_t score_dims_size = 1; + for (size_t i = 0; i < getParentEdgeAt(INPUT_SCORES)->getDims().ToSizeVector().size(); i++) { + score_dims_size *= getParentEdgeAt(INPUT_SCORES)->getDims().ToSizeVector()[i]; + } + if (deltas_dims_size != (4 * score_dims_size)) + IE_THROW() << "'Deltas' blob size for ONNXProposal is incompatible with 'scores' blob size!"; + + // Prepare memory + const float *p_deltas_item = reinterpret_cast(getParentEdgeAt(INPUT_DELTAS)->getMemoryPtr()->GetPtr()); + const float *p_scores_item = reinterpret_cast(getParentEdgeAt(INPUT_SCORES)->getMemoryPtr()->GetPtr()); + const float *p_anchors_item = reinterpret_cast(getParentEdgeAt(INPUT_ANCHORS)->getMemoryPtr()->GetPtr()); + const float *p_img_info_cpu = reinterpret_cast(getParentEdgeAt(INPUT_IM_INFO)->getMemoryPtr()->GetPtr()); + + float *p_roi_item = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + float *p_roi_score_item = reinterpret_cast(getChildEdgesAtPort(OUTPUT_SCORES)[0]->getMemoryPtr()->GetPtr()); + + const int anchors_num = getParentEdgeAt(INPUT_SCORES)->getDims()[0]; + + // bottom shape: (num_anchors) x H x W + const int bottom_H = getParentEdgeAt(INPUT_DELTAS)->getDims()[1]; + const int bottom_W = getParentEdgeAt(INPUT_DELTAS)->getDims()[2]; + + // input image height & width + const float img_H = p_img_info_cpu[0]; + const float img_W = p_img_info_cpu[1]; + + // scale factor for height & width + + // 
minimum box width & height + const float min_box_H = min_size_; + const float min_box_W = min_size_; + + // number of all proposals = num_anchors * H * W + const int num_proposals = anchors_num * bottom_H * bottom_W; + + // number of top-n proposals before NMS + const int pre_nms_topn = std::min(num_proposals, pre_nms_topn_); + + // number of final RoIs + int num_rois = 0; + + // enumerate all proposals + // num_proposals = num_anchors * H * W + // (x1, y1, x2, y2, score) for each proposal + // NOTE: for bottom, only foreground scores are passed + struct ProposalBox { + float x0; + float y0; + float x1; + float y1; + float score; + }; + std::vector proposals_(num_proposals); + std::vector unpacked_boxes(5 * pre_nms_topn); + std::vector is_dead(pre_nms_topn); + + // Execute + int batch_size = 1; // inputs[INPUT_DELTAS]->getTensorDesc().getDims()[0]; + for (int n = 0; n < batch_size; ++n) { + refine_anchors(p_deltas_item, p_scores_item, p_anchors_item, + reinterpret_cast(&proposals_[0]), anchors_num, bottom_H, + bottom_W, img_H, img_W, + min_box_H, min_box_W, + static_cast(log(1000. 
/ 16.)), + 1.0f); + std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(), + [](const ProposalBox &struct1, const ProposalBox &struct2) { + return (struct1.score > struct2.score); + }); + + unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn); + nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, + nms_thresh_, post_nms_topn_, coordinates_offset); + fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], p_roi_item, p_roi_score_item, + pre_nms_topn, num_rois, post_nms_topn_); + } + } catch (const std::exception &e) { + std::string errorMsg = e.what(); + IE_THROW() << errorMsg; + } +} + +bool MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::created() const { + return getType() == ExperimentalDetectronGenerateProposalsSingleImage; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode, ExperimentalDetectronGenerateProposalsSingleImage) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h new file mode 100644 index 00000000000000..b2f5f0bcd89fe1 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode(const std::shared_ptr& op, + const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void 
execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // Inputs: + // rois, shape [n, 4] + // rois_probs, shape [n] + // Outputs: + // top_rois, shape [max_rois, 4] + + const int INPUT_IM_INFO {0}; + const int INPUT_ANCHORS {1}; + const int INPUT_DELTAS {2}; + const int INPUT_SCORES {3}; + const int OUTPUT_ROIS {0}; + const int OUTPUT_SCORES {1}; + + float min_size_; + int pre_nms_topn_; + int post_nms_topn_; + float nms_thresh_; + float coordinates_offset; + + std::vector roi_indices_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp new file mode 100644 index 00000000000000..b5d073a0b3552e --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp @@ -0,0 +1,95 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_experimental_detectron_priorgridgenerator_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNExperimentalDetectronPriorGridGeneratorNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto priorGridGen = std::dynamic_pointer_cast(op); + if (!priorGridGen) { + errorMessage = "Only opset6 ExperimentalDetectronPriorGridGenerator operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNExperimentalDetectronPriorGridGeneratorNode::MKLDNNExperimentalDetectronPriorGridGeneratorNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "ExperimentalDetectronPriorGridGenerator layer with name '" + op->get_friendly_name() + "'"; + const auto priorGridGen = std::dynamic_pointer_cast(op); + if (getOriginalInputsNumber() != 3 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + if (op->get_input_shape(INPUT_PRIORS).size() != 2 || + op->get_input_shape(INPUT_FEATUREMAP).size() != 4 || + op->get_input_shape(INPUT_IMAGE).size() != 4) + IE_THROW() << errorPrefix << " has unsupported input shape"; + + const auto &attr = priorGridGen->get_attrs(); + grid_w_ = attr.w; + grid_h_ = attr.h; + stride_h_ = attr.stride_y; + stride_w_ = attr.stride_x; +} + +void MKLDNNExperimentalDetectronPriorGridGeneratorNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronPriorGridGeneratorNode::execute(mkldnn::stream strm) { + const int num_priors_ = getParentEdgeAt(INPUT_PRIORS)->getDims()[0]; + assert(getParentEdgeAt(INPUT_PRIORS)->getDims()[1] == 4); + + // Execute + const int layer_width = grid_w_ ? grid_w_ : getParentEdgeAt(INPUT_FEATUREMAP)->getDims()[3]; + const int layer_height = grid_h_ ? grid_h_ : getParentEdgeAt(INPUT_FEATUREMAP)->getDims()[2]; + const float step_w = stride_w_ ? 
stride_w_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getDims()[3]) / layer_width; + const float step_h = stride_h_ ? stride_h_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getDims()[2]) / layer_height; + + const auto *bottom_data_0 = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + auto *top_data_0 = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + + for (int h = 0; h < layer_height; ++h) { + for (int w = 0; w < layer_width; ++w) { + for (int s = 0; s < num_priors_; ++s) { + top_data_0[0] = bottom_data_0[4 * s + 0] + step_w * (w + 0.5f); + top_data_0[1] = bottom_data_0[4 * s + 1] + step_h * (h + 0.5f); + top_data_0[2] = bottom_data_0[4 * s + 2] + step_w * (w + 0.5f); + top_data_0[3] = bottom_data_0[4 * s + 3] + step_h * (h + 0.5f); + top_data_0 += 4; + } + } + } +} + +bool MKLDNNExperimentalDetectronPriorGridGeneratorNode::created() const { + return getType() == ExperimentalDetectronPriorGridGenerator; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronPriorGridGeneratorNode, ExperimentalDetectronPriorGridGenerator) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h new file mode 100644 index 00000000000000..9ef117f44e65f7 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronPriorGridGeneratorNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronPriorGridGeneratorNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() 
override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // Inputs: + // priors, shape [n, 4] + // [feature_map], shape [b, c, h, w] + // [im_data], shape [b, 3, im_h, im_w] + // Outputs: + // priors_grid, shape [m, 4] + + const int INPUT_PRIORS {0}; + const int INPUT_FEATUREMAP {1}; + const int INPUT_IMAGE {2}; + + const int OUTPUT_ROIS {0}; + + int grid_w_; + int grid_h_; + float stride_w_; + float stride_h_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp new file mode 100644 index 00000000000000..94e7f033a95548 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp @@ -0,0 +1,413 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" +#include "mkldnn_experimental_detectron_roifeatureextractor_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +// implementation taken from Caffe2 +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for 
(int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc.at(pre_calc_index) = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high = 0; + int x_high = 0; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = static_cast(1) - ly, hx = static_cast(1) - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward_cpu_kernel( + const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + const bool aligned, + T* top_data) { + int roi_cols = 4; + + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an 
element in the pooled output + parallel_for(n_rois, [&](size_t n) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = static_cast(offset_bottom_rois[0]); + offset_bottom_rois++; + } + + T offset = aligned ? (T)0.5 : (T)0.0; + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[0] * spatial_scale - offset; + T roi_start_h = offset_bottom_rois[1] * spatial_scale - offset; + T roi_end_w = offset_bottom_rois[2] * spatial_scale - offset; + T roi_end_h = offset_bottom_rois[3] * spatial_scale - offset; + + // Force malformed ROIs to be 1x1 + T roi_width = (std::max)(roi_end_w - roi_start_w, (T)1.); + T roi_height = (std::max)(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceil(roi_height / pooled_height)); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : static_cast(ceil(roi_width / pooled_width)); + + // We do average (integral) pooling inside a bin + const T count = static_cast(roi_bin_grid_h * roi_bin_grid_w); // e.g. 
= 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + }); +} + + +void redistribute_rois(const float* rois, int* level_ids, + const int num_rois, const int levels_num) { + const float canonical_scale = 224.0f; + const int canonical_level = 2; + + for (int i = 0; i < num_rois; ++i) { + const float x0 = rois[4 * i + 0]; + const float y0 = rois[4 * i + 1]; + const float x1 = rois[4 * i + 2]; + const float y1 = rois[4 * i + 3]; + + int target_level = levels_num; + float area = (x1 - x0) * (y1 - y0); + if (area > 0) { + area = std::sqrt(area) / canonical_scale; + area = std::log2(area + 1e-6f); + target_level = static_cast(std::floor(area + canonical_level)); + target_level = (std::max)(0, (std::min)(levels_num - 1, target_level)); + } + + level_ids[i] = 
target_level; + } +} + + +void reord(const float* src_data, const int* ranks, const int n, const int step, float* dst_data, + int* dst_mapping) { + std::iota(dst_mapping, dst_mapping + n, 0); + std::sort(dst_mapping, dst_mapping + n, [&ranks](size_t i1, size_t i2) {return ranks[i1] < ranks[i2];}); + for (int i = 0; i < n; ++i) { + const int j = dst_mapping[i]; + assert(0 <= j && j < n); + cpu_memcpy(dst_data + i * step, src_data + j * step, sizeof(float) * step); + } +} + +void split_points(const std::vector& ids, std::vector& rois_per_level, const int levels_num) { + rois_per_level.clear(); + rois_per_level.resize(levels_num, 0); + for (size_t i = 0; i < ids.size(); ++i) { + assert(0 <= ids[i] && ids[i] < levels_num); + rois_per_level[ids[i]]++; + } + for (int i = 1; i < levels_num; ++i) { + rois_per_level[i] += rois_per_level[i - 1]; + } + rois_per_level.insert(rois_per_level.begin(), 0); +} + + +void reorder_rois(const float *rois, const int* ids, int* mapping, const int rois_num, + float * reordered_rois, std::vector& rois_per_level, const int levels_num) { + rois_per_level.clear(); + rois_per_level.resize(levels_num, 0); + for (int i = 0; i < rois_num; ++i) { + assert(0 <= ids[i] && ids[i] < levels_num); + rois_per_level[ids[i]]++; + } + for (int i = 1; i < levels_num; ++i) { + rois_per_level[i] += rois_per_level[i - 1]; + } + rois_per_level.insert(rois_per_level.begin(), 0); + + std::vector level_counter = rois_per_level; + + for (int i = 0; i < rois_num; ++i) { + const int level = ids[i]; + assert(level < levels_num); + const int j = level_counter[level]; + assert(0 <= j && j < rois_num); + reordered_rois[j * 4 + 0] = rois[i * 4 + 0]; + reordered_rois[j * 4 + 1] = rois[i * 4 + 1]; + reordered_rois[j * 4 + 2] = rois[i * 4 + 2]; + reordered_rois[j * 4 + 3] = rois[i * 4 + 3]; + level_counter[level]++; + } +} + +bool MKLDNNExperimentalDetectronROIFeatureExtractorNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + 
const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); + if (!roiFeatureExtractor) { + errorMessage = "Only opset6 ExperimentalDetectronROIFeatureExtractor operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronROIFeatureExtractorNode::MKLDNNExperimentalDetectronROIFeatureExtractorNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); + const auto &attr = roiFeatureExtractor->get_attrs(); + output_dim_ = attr.output_size; + pyramid_scales_ = attr.pyramid_scales; + sampling_ratio_ = attr.sampling_ratio; + aligned_ = attr.aligned; + pooled_height_ = output_dim_; + pooled_width_ = output_dim_; +} + +void MKLDNNExperimentalDetectronROIFeatureExtractorNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronROIFeatureExtractorNode::execute(mkldnn::stream strm) { + const int levels_num = inDims.size() - INPUT_FEATURES_START; + const int num_rois = getParentEdgeAt(INPUT_ROIS)->getDims()[0]; + const int channels_num = getParentEdgeAt(INPUT_FEATURES_START)->getDims()[1]; + const int feaxels_per_roi = pooled_height_ * pooled_width_ * channels_num; + + auto *input_rois = reinterpret_cast(getParentEdgeAt(INPUT_ROIS)->getMemoryPtr()->GetPtr()); + auto *output_rois_features = 
reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROI_FEATURES)[0]->getMemoryPtr()->GetPtr()); + float *output_rois = nullptr; + if (OUTPUT_ROIS < outDims.size()) { + output_rois = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + } + + std::vector level_ids(num_rois, 0); + redistribute_rois(input_rois, reinterpret_cast(&level_ids[0]), num_rois, levels_num); + + std::vector reordered_rois(4 * num_rois, 0); + std::vector original_rois_mapping(num_rois, 0); + reord(input_rois, &level_ids[0], num_rois, 4, &reordered_rois[0], &original_rois_mapping[0]); + + std::vector rois_per_level; + split_points(level_ids, rois_per_level, levels_num + 1); + + std::vector output_rois_features_temp(feaxels_per_roi * num_rois, 0); + for (int i = 0; i < levels_num; ++i) { + const int level_rois_offset = rois_per_level[i]; + const int level_rois_num = rois_per_level[i + 1] - level_rois_offset; + if (level_rois_num > 0) { + auto *featuremap = reinterpret_cast(getParentEdgeAt(INPUT_FEATURES_START + i)->getMemoryPtr()->GetPtr()); + const int featuremap_height = getParentEdgeAt(INPUT_FEATURES_START + i)->getDims()[2]; + const int featuremap_width = getParentEdgeAt(INPUT_FEATURES_START + i)->getDims()[3]; + ROIAlignForward_cpu_kernel(feaxels_per_roi * level_rois_num, + featuremap, + 1.0f / pyramid_scales_[i], + channels_num, + featuremap_height, + featuremap_width, + pooled_height_, + pooled_width_, + sampling_ratio_, + &reordered_rois[4 * level_rois_offset], + aligned_, + &output_rois_features_temp[feaxels_per_roi * level_rois_offset]); + } + } + + std::vector dummy_mapping(num_rois, 0); + reord(&output_rois_features_temp[0], &original_rois_mapping[0], num_rois, feaxels_per_roi, + output_rois_features, &dummy_mapping[0]); + if (output_rois != nullptr) { + cpu_memcpy(output_rois, input_rois, 4 * num_rois * sizeof(float)); + } +} + +bool MKLDNNExperimentalDetectronROIFeatureExtractorNode::created() const { + return getType() == 
ExperimentalDetectronROIFeatureExtractor; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronROIFeatureExtractorNode, ExperimentalDetectronROIFeatureExtractor) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h new file mode 100644 index 00000000000000..bfcb9061f26fbe --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronROIFeatureExtractorNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronROIFeatureExtractorNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const int INPUT_ROIS {0}; + const int INPUT_FEATURES_START {1}; + + const int OUTPUT_ROI_FEATURES {0}; + const int OUTPUT_ROIS {1}; + + int output_dim_ = 0; + int pooled_height_ = 0; + int pooled_width_ = 0; + std::vector pyramid_scales_; + int sampling_ratio_ = 0; + bool aligned_ = false; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp new file mode 100644 index 00000000000000..d543658f78e724 --- /dev/null +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" +#include "mkldnn_experimental_detectron_topkrois_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNExperimentalDetectronTopKROIsNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto topKROI = std::dynamic_pointer_cast(op); + if (!topKROI) { + errorMessage = "Only opset6 ExperimentalDetectronTopKROIs operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronTopKROIsNode::MKLDNNExperimentalDetectronTopKROIsNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "ExperimentalDetectronTopKROIs layer with name '" + op->get_friendly_name() + "'"; + const auto topKROI = std::dynamic_pointer_cast(op); + if (getOriginalInputsNumber() != 2 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + if (op->get_input_shape(INPUT_ROIS).size() != 2 || op->get_input_shape(INPUT_PROBS).size() != 1) + IE_THROW() << errorPrefix << " has nsupported input shape"; + + max_rois_num_ = topKROI->get_max_rois(); +} + +void MKLDNNExperimentalDetectronTopKROIsNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + 
impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronTopKROIsNode::execute(mkldnn::stream strm) { + const int input_rois_num = getParentEdgeAt(INPUT_ROIS)->getDims()[0]; + const int top_rois_num = (std::min)(max_rois_num_, input_rois_num); + + auto *input_rois = reinterpret_cast(getParentEdgeAt(INPUT_ROIS)->getMemoryPtr()->GetPtr()); + auto *input_probs = reinterpret_cast(getParentEdgeAt(INPUT_PROBS)->getMemoryPtr()->GetPtr()); + auto *output_rois = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + + std::vector idx(input_rois_num); + iota(idx.begin(), idx.end(), 0); + // FIXME. partial_sort is enough here. + sort(idx.begin(), idx.end(), [&input_probs](size_t i1, size_t i2) {return input_probs[i1] > input_probs[i2];}); + + for (int i = 0; i < top_rois_num; ++i) { + cpu_memcpy(output_rois + 4 * i, input_rois + 4 * idx[i], 4 * sizeof(float)); + } +} + +bool MKLDNNExperimentalDetectronTopKROIsNode::created() const { + return getType() == ExperimentalDetectronTopKROIs; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronTopKROIsNode, ExperimentalDetectronTopKROIs) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h new file mode 100644 index 00000000000000..76171de71e473c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronTopKROIsNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronTopKROIsNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; 
+ void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // Inputs: + // rois, shape [n, 4] + // rois_probs, shape [n] + // Outputs: + // top_rois, shape [max_rois, 4] + + const int INPUT_ROIS {0}; + const int INPUT_PROBS {1}; + + const int OUTPUT_ROIS {0}; + int max_rois_num_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.cpp similarity index 66% rename from inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.cpp rename to inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.cpp index b0f0aa5d327ed8..d4c5d3037962b0 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.cpp @@ -1,22 +1,22 @@ -// Copyright (C) 2020-2021 Intel Corporation +// Copyright (C) 2018-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "extract_image_patches.hpp" -#include "caseless.hpp" -#include "ie_parallel.hpp" -#include "list.hpp" -#include +#include "base.hpp" + #include #include #include + #include +#include "ie_parallel.hpp" +#include "mkldnn_extract_image_patches_node.h" +#include "list.hpp" +#include +#include "caseless.hpp" using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { +using namespace InferenceEngine; using details::CaselessEq; @@ -266,11 +266,11 @@ struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_k align(64); L(gather_index_table); for (int32_t i = 0; i < vlen / sizeof(int32_t); i++) - dd(i * jpp.SW * jpp.dtype_size); + dd(i * jpp.SW * jpp.dtype_size); } }; -bool ExtractImagePatchesImpl::isSupportedOperation(const 
std::shared_ptr& op, std::string& errorMessage) noexcept { +bool MKLDNNExtractImagePatchesNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { const auto extImgPatcher = std::dynamic_pointer_cast(op); if (!extImgPatcher) { @@ -292,140 +292,141 @@ bool ExtractImagePatchesImpl::isSupportedOperation(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } +MKLDNNExtractImagePatchesNode::MKLDNNExtractImagePatchesNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } - errorPrefix = "ExtractImagePatches layer with name '" + op->get_friendly_name() + "' "; - const auto extImgPatcher = std::dynamic_pointer_cast(op); + errorPrefix = "ExtractImagePatches layer with name '" + op->get_friendly_name() + "' "; + const auto extImgPatcher = std::dynamic_pointer_cast(op); - if (op->get_input_size() != 1 || op->get_output_size() != 1) - IE_THROW() << errorPrefix << "has incorrect number of input or output edges!" - << " Input: " << op->get_input_size() << "; Output: " << op->get_output_size(); - - if (op->get_input_shape(0).size() != 4) - IE_THROW() << errorPrefix << "must have 4D input tensor. Actual: " << op->get_input_shape(0).size(); - - if (op->get_output_shape(0).size() != 4) - IE_THROW() << errorPrefix << "must have 4D output tensor. 
Actual: " << op->get_output_shape(0).size(); - - const auto precision = details::convertPrecision(op->get_input_element_type(0)); - if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end()) - IE_THROW() << errorPrefix << "has unsupported precision: " << precision.name(); - - auto ksizes = extImgPatcher->get_sizes(); - auto strides = extImgPatcher->get_strides(); - auto rates = extImgPatcher->get_rates(); - if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::VALID) { - _auto_pad = ExtImgPatcherPadType::VALID; - } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_LOWER) { - _auto_pad = ExtImgPatcherPadType::SAME_LOWER; - } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_UPPER) { - _auto_pad = ExtImgPatcherPadType::SAME_UPPER; - } else { - IE_THROW() << errorPrefix << "has unsupported pad type: " << extImgPatcher->get_auto_pad(); - } + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << "has incorrect number of input or output edges!" + << " Input: " << getOriginalInputsNumber() << "; Output: " << getOriginalOutputsNumber(); + + if (op->get_input_shape(0).size() != 4) + IE_THROW() << errorPrefix << "must have 4D input tensor. Actual: " << op->get_input_shape(0).size(); - if (ksizes.size() != 2 || strides.size() != 2 || rates.size() != 2) - IE_THROW() << errorPrefix << "must have the following attributes with shape {2}: sizes, strides, rates."; - _ksizes.clear(); - _strides.clear(); - _rates.clear(); - for (const auto& x : ksizes) { - if (x < 0) - IE_THROW() << "Kernel sizes must be non-negative, got '" << x << "'."; - _ksizes.push_back(static_cast(x)); + if (op->get_output_shape(0).size() != 4) + IE_THROW() << errorPrefix << "must have 4D output tensor. 
Actual: " << op->get_output_shape(0).size(); + + auto ksizes = extImgPatcher->get_sizes(); + auto strides = extImgPatcher->get_strides(); + auto rates = extImgPatcher->get_rates(); + if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::VALID) { + _auto_pad = ExtImgPatcherPadType::VALID; + } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_LOWER) { + _auto_pad = ExtImgPatcherPadType::SAME_LOWER; + } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_UPPER) { + _auto_pad = ExtImgPatcherPadType::SAME_UPPER; + } else { + IE_THROW() << errorPrefix << "has unsupported pad type: " << extImgPatcher->get_auto_pad(); + } + + if (ksizes.size() != 2 || strides.size() != 2 || rates.size() != 2) + IE_THROW() << errorPrefix << "must have the following attributes with shape {2}: sizes, strides, rates."; + _ksizes.clear(); + _strides.clear(); + _rates.clear(); + for (const auto& x : ksizes) { + if (x < 0) + IE_THROW() << "Kernel sizes must be non-negative, got '" << x << "'."; + _ksizes.push_back(static_cast(x)); + } + for (const auto& x : strides) { + if (x < 0) + IE_THROW() << "Strides must be non-negative, got '" << x << "'."; + _strides.push_back(static_cast(x)); + } + for (const auto& x : rates) { + if (x < 0) + IE_THROW() << "Rates must be non-negative, got '" << x << "'."; + _rates.push_back(static_cast(x)); + } + + SizeVector in_dims = op->get_input_shape(0); + _pad_left = 0; + _pad_top = 0; + jit_extract_image_patches_params jpp; + jpp.need_padding = false; + if (_auto_pad != ExtImgPatcherPadType::VALID) { + const size_t iheight = in_dims[2]; + const size_t iwidth = in_dims[3]; + const int64_t ihStep = _ksizes[0] + (_rates[0] - 1) * (_ksizes[0] - 1); + const int64_t iwStep = _ksizes[1] + (_rates[1] - 1) * (_ksizes[1] - 1); + + int64_t PW = (std::ceil(1.f * iwidth/_strides[1]) - 1) * _strides[1] + iwStep - iwidth; + int64_t PH = (std::ceil(1.f * iheight/_strides[0]) - 1) * _strides[0] + ihStep - iheight; + + int64_t increment_sign = 
0; + if (_auto_pad == ExtImgPatcherPadType::SAME_LOWER) { + increment_sign = 1; + } else if (_auto_pad == ExtImgPatcherPadType::SAME_UPPER) { + increment_sign = -1; } - for (const auto& x : strides) { - if (x < 0) - IE_THROW() << "Strides must be non-negative, got '" << x << "'."; - _strides.push_back(static_cast(x)); + + if ((PW > 0) && (PW < iwStep)) { + _pad_left = static_cast((PW + increment_sign * (PW % 2)) / 2); + jpp.need_padding = true; } - for (const auto& x : rates) { - if (x < 0) - IE_THROW() << "Rates must be non-negative, got '" << x << "'."; - _rates.push_back(static_cast(x)); + if ((PH > 0) && (PH < ihStep)) { + _pad_top = static_cast((PH + increment_sign * (PH % 2)) / 2); + jpp.need_padding = true; } + } - SizeVector in_dims = op->get_input_shape(0); - _pad_left = 0; - _pad_top = 0; - jit_extract_image_patches_params jpp; - jpp.need_padding = false; - if (_auto_pad != ExtImgPatcherPadType::VALID) { - const size_t iheight = in_dims[2]; - const size_t iwidth = in_dims[3]; - const int64_t ihStep = _ksizes[0] + (_rates[0] - 1) * (_ksizes[0] - 1); - const int64_t iwStep = _ksizes[1] + (_rates[1] - 1) * (_ksizes[1] - 1); - - int64_t PW = (std::ceil(1.f * iwidth/_strides[1]) - 1) * _strides[1] + iwStep - iwidth; - int64_t PH = (std::ceil(1.f * iheight/_strides[0]) - 1) * _strides[0] + ihStep - iheight; - - int64_t increment_sign = 0; - if (_auto_pad == ExtImgPatcherPadType::SAME_LOWER) { - increment_sign = 1; - } else if (_auto_pad == ExtImgPatcherPadType::SAME_UPPER) { - increment_sign = -1; - } + jpp.IW = in_dims[3]; + SizeVector out_dims = op->get_output_shape(0); + jpp.OH = out_dims[2]; + jpp.OW = out_dims[3]; + jpp.KH = _ksizes[0]; + jpp.KW = _ksizes[1]; + jpp.SH = _strides[0]; + jpp.SW = _strides[1]; + jpp.dtype_size = getOriginalInputPrecisionAtPort(0).size(); + jpp.block_size = 1; + + if (mayiuse(x64::avx512_common)) { + jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; + extract_image_patches_kernel.reset(new 
jit_extract_image_patches_kernel(jpp)); + } else if (mayiuse(x64::avx2)) { + jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; + extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); + } else if (mayiuse(x64::sse41)) { + jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; + extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); + } - if ((PW > 0) && (PW < iwStep)) { - _pad_left = static_cast((PW + increment_sign * (PW % 2)) / 2); - jpp.need_padding = true; - } - if ((PH > 0) && (PH < ihStep)) { - _pad_top = static_cast((PH + increment_sign * (PH % 2)) / 2); - jpp.need_padding = true; - } - } + if (extract_image_patches_kernel) + extract_image_patches_kernel->create_ker(); +} - jpp.IW = in_dims[3]; - SizeVector out_dims = op->get_output_shape(0); - jpp.OH = out_dims[2]; - jpp.OW = out_dims[3]; - jpp.KH = _ksizes[0]; - jpp.KW = _ksizes[1]; - jpp.SH = _strides[0]; - jpp.SW = _strides[1]; - jpp.dtype_size = precision.size(); - jpp.block_size = 1; - - if (mayiuse(x64::avx512_common)) { - jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; - extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); - } else if (mayiuse(x64::avx2)) { - jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; - extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); - } else if (mayiuse(x64::sse41)) { - jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; - extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); - } +void MKLDNNExtractImagePatchesNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; - if (extract_image_patches_kernel) - extract_image_patches_kernel->create_ker(); + precision = getOriginalInputPrecisionAtPort(0); + if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end()) + IE_THROW() << errorPrefix << "has unsupported precision: " << precision.name(); - 
addConfig(op, {{TensorDescCreatorTypes::ncsp, precision}}, - {{TensorDescCreatorTypes::ncsp, precision}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}}, + {{TensorDescCreatorTypes::ncsp, precision}}, + impl_desc_type::ref_any); } -StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept { - const char *src_data = inputs[0]->cbuffer().as() + - inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - char *dst_data = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const size_t dtype_size = inputs[0]->getTensorDesc().getPrecision().size(); +void MKLDNNExtractImagePatchesNode::execute(mkldnn::stream strm) { + const char *src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + char *dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + const size_t dtype_size = getOriginalInputPrecisionAtPort(0).size(); - const auto& inDims = inputs[0]->getTensorDesc().getDims(); + const auto& inDims = getParentEdgeAt(0)->getDims().ToSizeVector(); const size_t IC = inDims[1]; const size_t IH = inDims[2]; const size_t IW = inDims[3]; - const auto& outDims = outputs[0]->getTensorDesc().getDims(); + const auto& outDims = getChildEdgesAtPort(0)[0]->getDims().ToSizeVector(); const size_t OB = outDims[0]; const size_t OH = outDims[2]; const size_t OW = outDims[3]; @@ -435,8 +436,8 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: const size_t RH = _rates[0], RW = _rates[1]; const size_t PT = _pad_top, PL = _pad_left; - const std::vector istrides = inputs[0]->getTensorDesc().getBlockingDesc().getStrides(); - const std::vector ostrides = outputs[0]->getTensorDesc().getBlockingDesc().getStrides(); + const std::vector istrides = getParentEdgeAt(0)->getDesc().getBlockingDesc().getStrides(); + const std::vector 
ostrides = getChildEdgesAtPort(0)[0]->getDesc().getBlockingDesc().getStrides(); const std::vector ostrides_partial = {ostrides[0], KW * IC * ostrides[1], IC * ostrides[1], ostrides[1]}; if (extract_image_patches_kernel) { @@ -471,7 +472,7 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: const size_t iw_hpad = std::ceil((IW - 1.f * iw_start) / SW) > OW ? OW : std::ceil((IW - 1.f * iw_start) / SW); char *my_dst_ptr = dst_data + - (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]) * dtype_size; + (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]) * dtype_size; const char *my_src_ptr = src_data + (ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start) * dtype_size; size_t num_bytes_to_set = ih_lpad * OW * dtype_size; @@ -480,14 +481,14 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: const char* src_ptr_h_stop = my_src_ptr + ih_hpad * SH * IW * dtype_size; for (const char *src_h_ptr = my_src_ptr + ih_lpad * SH * IW * dtype_size; - src_h_ptr < src_ptr_h_stop; src_h_ptr += SH * IW * dtype_size) { + src_h_ptr < src_ptr_h_stop; src_h_ptr += SH * IW * dtype_size) { num_bytes_to_set = iw_lpad * dtype_size; memset(my_dst_ptr, 0, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; const char* src_ptr_w_stop = src_h_ptr + iw_hpad * SW * dtype_size; for (const char* src_w_ptr = src_h_ptr + iw_lpad * SW * dtype_size; - src_w_ptr < src_ptr_w_stop; src_w_ptr += SW * dtype_size) { + src_w_ptr < src_ptr_w_stop; src_w_ptr += SW * dtype_size) { num_bytes_to_set = dtype_size; memcpy(my_dst_ptr, src_w_ptr, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; @@ -500,11 +501,12 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: memset(my_dst_ptr, 0, num_bytes_to_set); }); } - return OK; } -const std::set ExtractImagePatchesImpl::_supported_precisions_sizes = {1, 2, 4}; +const std::set 
MKLDNNExtractImagePatchesNode::_supported_precisions_sizes = {1, 2, 4}; + +bool MKLDNNExtractImagePatchesNode::created() const { + return getType() == ExtractImagePatches; +} -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine +REG_MKLDNN_PRIM_FOR(MKLDNNExtractImagePatchesNode, ExtractImagePatches) diff --git a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.hpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.h similarity index 64% rename from inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.hpp rename to inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.h index 8ed62fbca89b0d..2990b12d08f2e3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.h @@ -1,16 +1,16 @@ -// Copyright (C) 2021 Intel Corporation +// Copyright (C) 2018-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // + #pragma once -#include "base.hpp" +#include +#include +#include +#include #include -#include -#include -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { +namespace MKLDNNPlugin { struct jit_extract_image_patches_params { size_t IW; @@ -40,12 +40,17 @@ struct jit_uni_extract_image_patches_kernel { virtual ~jit_uni_extract_image_patches_kernel() {} }; - -class ExtractImagePatchesImpl : public ExtLayerBase { +class MKLDNNExtractImagePatchesNode : public MKLDNNNode { public: - explicit ExtractImagePatchesImpl(const std::shared_ptr& op); - StatusCode execute(std::vector&, std::vector&, ResponseDesc*) noexcept override; - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + MKLDNNExtractImagePatchesNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void 
createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: enum class ExtImgPatcherPadType { @@ -63,12 +68,9 @@ class ExtractImagePatchesImpl : public ExtLayerBase { static const std::set _supported_precisions_sizes; ExtImgPatcherPadType _auto_pad; + InferenceEngine::Precision precision; std::string errorPrefix; }; -REG_FACTORY_FOR(ExtractImagePatchesImpl, ExtractImagePatches); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp index eabd4f52aac8b2..e3e14e356912db 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp @@ -18,7 +18,7 @@ using namespace InferenceEngine; bool MKLDNNGatherElementsNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto gatherElementsOp = ngraph::as_type_ptr(op); + const auto gatherElementsOp = ngraph::as_type_ptr(op); if (!gatherElementsOp) { errorMessage = "Node is not an instance of the GatherElements operation from operation set v6."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp index 3e858dd309d8ca..ee7623f9b4810b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp @@ -18,7 +18,7 @@ using namespace InferenceEngine; bool MKLDNNGatherNDNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto gatherElementsOp = ngraph::as_type_ptr(op); + const 
auto gatherElementsOp = ngraph::as_type_ptr(op); if (!gatherElementsOp) { errorMessage = "Node is not an instance of the GatherND operation from operation set v5."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp index 3bd50aadf3357e..ade92f6a4a0060 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp @@ -15,13 +15,13 @@ using namespace InferenceEngine; bool MKLDNNGatherNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto gatherOp = ngraph::as_type_ptr(op); + const auto gatherOp = ngraph::as_type_ptr(op); if (!gatherOp) { errorMessage = "Only opset7 Gather operation is supported"; return false; } - auto axesOp = gatherOp->get_input_node_shared_ptr(GATHER_AXIS); + const auto axesOp = gatherOp->get_input_node_shared_ptr(GATHER_AXIS); if (!ngraph::as_type_ptr(axesOp)) { errorMessage = "Only Constant operation on 'axis' input is supported"; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp new file mode 100644 index 00000000000000..ce396446df2418 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp @@ -0,0 +1,148 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_gather_tree_node.h" +#include + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNGatherTreeNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto gatherElementsOp = ngraph::as_type_ptr(op); + if (!gatherElementsOp) { + errorMessage = "Node is not an instance of the 
GatherTree operation from operation set v1."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNGatherTreeNode::MKLDNNGatherTreeNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = std::string("Node GatherTree with name '") + op->get_friendly_name() + "'"; + if (op->get_input_size() != 4) + IE_THROW() << errorPrefix << " has incorrect number of input edges."; + if (op->get_output_size() != 1) + IE_THROW() << errorPrefix << " has incorrect number of output edges."; + + if (op->get_input_shape(GATHER_TREE_STEP_IDX).size() != 3) + IE_THROW() << errorPrefix << " step_idx vector should be 3 dimension"; + if (op->get_input_shape(GATHER_TREE_PARENT_IDX).size() != 3) + IE_THROW() << errorPrefix << " parent_idx vector should be 3 dimension"; + if (op->get_input_shape(GATHER_TREE_MAX_SEQ_LEN).size() != 1) + IE_THROW() << errorPrefix << " max_seq_len vector should be 1 dimension"; + if (op->get_input_shape(GATHER_TREE_END_TOKEN).size() != 0) + IE_THROW() << errorPrefix << " end_token should be 1 dimension"; +} + +void MKLDNNGatherTreeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + precision = getOriginalInputPrecisionAtPort(GATHER_TREE_STEP_IDX); + if (!MKLDNNPlugin::one_of(precision, Precision::FP32, Precision::I32)) + precision = Precision::FP32; + + if (getOriginalInputPrecisionAtPort(GATHER_TREE_PARENT_IDX) != precision || + getOriginalInputPrecisionAtPort(GATHER_TREE_MAX_SEQ_LEN) != precision || + getOriginalInputPrecisionAtPort(GATHER_TREE_END_TOKEN) != precision || + getOriginalOutputPrecisionAtPort(0) != precision) { + IE_THROW() << errorPrefix << " has incorrect input/output data precision. 
Must be the same."; + } + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}, + {TensorDescCreatorTypes::ncsp, precision}, + {TensorDescCreatorTypes::ncsp, precision}, + {TensorDescCreatorTypes::ncsp, precision}}, + {{TensorDescCreatorTypes::ncsp, precision}}, + impl_desc_type::ref_any); +} + +void MKLDNNGatherTreeNode::execute(mkldnn::stream strm) { + if (precision == Precision::FP32) + return gatherTreeKernel(); + else + return gatherTreeKernel(); +} + +template +void MKLDNNGatherTreeNode::gatherTreeKernel() noexcept { + const auto *step_idx = reinterpret_cast(getParentEdgeAt(GATHER_TREE_STEP_IDX)->getMemoryPtr()->GetPtr()); + const auto * const parent_idx = reinterpret_cast(getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getMemoryPtr()->GetPtr()); + const size_t parent_idx_size = getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getDims().size() + - getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getDesc().getBlockingDesc().getOffsetPadding(); + const auto *max_seq_len = reinterpret_cast(getParentEdgeAt(GATHER_TREE_MAX_SEQ_LEN)->getMemoryPtr()->GetPtr()); + auto end_token = (reinterpret_cast(getParentEdgeAt(GATHER_TREE_END_TOKEN)->getMemoryPtr()->GetPtr()))[0]; + auto * final_idx = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + SizeVector step_idx_dims = getParentEdgeAt(GATHER_TREE_STEP_IDX)->getDims().ToSizeVector(); + SizeVector parent_idx_dims = getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getDims().ToSizeVector(); + SizeVector max_seq_len_dims = getParentEdgeAt(GATHER_TREE_MAX_SEQ_LEN)->getDims().ToSizeVector(); + SizeVector final_idx_dims = getChildEdgesAtPort(0)[0]->getDims().ToSizeVector(); + int32_t max_time = step_idx_dims[0]; + const size_t batch_size = step_idx_dims[1]; + const size_t beam_width = step_idx_dims[2]; + const size_t bb_size = batch_size * beam_width; + + if (max_time != static_cast(parent_idx_dims[0]) || max_time != static_cast(final_idx_dims[0]) || + batch_size != parent_idx_dims[1] || batch_size != final_idx_dims[1] || 
batch_size != max_seq_len_dims[0] || + beam_width != parent_idx_dims[2] || beam_width != final_idx_dims[2]) { + std::string errorMsg = "Input/Output tensors dimensions mismatch"; + IE_THROW() << errorMsg; + } + + bool incorrect_result = false; + parallel_for2d(batch_size, beam_width, [&](size_t batch, size_t beam) { + int32_t max_sequence_in_beam = std::min(max_time, static_cast(max_seq_len[batch])); + if (max_sequence_in_beam > 0) { + int32_t time, idx = (max_time - 1) * bb_size + batch * beam_width; + for (time = (max_time - 1); time >= max_sequence_in_beam; time--, idx -= bb_size) + final_idx[idx + beam] = end_token; + + for (int32_t parent = static_cast(beam); time >= 0; time--, idx -= bb_size) { + if (parent < 0 || parent >= static_cast(beam_width) || idx + parent >= parent_idx_size) { + incorrect_result = true; + break; + } + final_idx[idx + beam] = step_idx[idx + parent]; + parent = static_cast(parent_idx[idx + parent]); + } + + bool finished = false; + auto *final = &final_idx[batch * beam_width + beam]; + for (time = 0; time < max_sequence_in_beam; time++, final += bb_size) { + if (finished) + (*final) = end_token; + else if ((*final) == end_token) + finished = true; + } + } + }); + + if (incorrect_result) { + std::string errorMsg = "Wrong parent index, result is incorrect"; + IE_THROW() << errorMsg; + } +} + +bool MKLDNNGatherTreeNode::created() const { + return getType() == GatherTree; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNGatherTreeNode, GatherTree) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h new file mode 100644 index 00000000000000..63f34fe6d6e685 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNGatherTreeNode : public MKLDNNNode { 
+public: + MKLDNNGatherTreeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + template + void gatherTreeKernel() noexcept; + + private: + static const size_t GATHER_TREE_STEP_IDX = 0; + static const size_t GATHER_TREE_PARENT_IDX = 1; + static const size_t GATHER_TREE_MAX_SEQ_LEN = 2; + static const size_t GATHER_TREE_END_TOKEN = 3; + + InferenceEngine::Precision precision; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp new file mode 100644 index 00000000000000..0dbe8dee59ea51 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_grn_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNGRNNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto grn = std::dynamic_pointer_cast(op); + if (!grn) { + errorMessage = "Only opset1 GRN operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNGRNNode::MKLDNNGRNNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "GRN layer with name '" + op->get_friendly_name() + "'"; + const auto grn = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + bias = grn->get_bias(); +} + +void MKLDNNGRNNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}, + impl_desc_type::ref_any); +} + +void MKLDNNGRNNode::execute(mkldnn::stream strm) { + const float* src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + float* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + SizeVector dims = getParentEdgeAt(0)->getDims().ToSizeVector(); + + int N = static_cast((dims.size() > 0) ? dims[0] : 1); + int C = static_cast((dims.size() > 1) ? dims[1] : 1); + int H = static_cast((dims.size() > 2) ? dims[2] : 1); + int W = static_cast((dims.size() > 3) ? 
dims[3] : 1); + + parallel_for3d(N, H, W, [&](int b, int h, int w) { + double variance = 0; + for (int c = 0; c < C; c++) { + variance += std::pow(src_data[b*C*H*W + c*H*W + h*W + w], 2); + } + variance = std::pow(variance + bias, 0.5f); + for (int c = 0; c < C; c++) { + dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / static_cast(variance); + } + }); +} + +bool MKLDNNGRNNode::created() const { + return getType() == GRN; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNGRNNode, GRN) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h new file mode 100644 index 00000000000000..8fe8d9d75b04e7 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNGRNNode : public MKLDNNNode { +public: + MKLDNNGRNNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + float bias = 1.0f; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp new file mode 100644 index 00000000000000..5750f8517b0096 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp @@ -0,0 +1,116 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include "ie_parallel.hpp" +#include 
"mkldnn_log_softmax_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNLogSoftmaxNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto logSoftMax = std::dynamic_pointer_cast(op); + if (!logSoftMax) { + errorMessage = "Only opset5 LogSoftmax operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNLogSoftmaxNode::MKLDNNLogSoftmaxNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "LogSoftmax layer with name '" + op->get_friendly_name() + "'"; + const auto logSoftMax = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + SizeVector dims = op->get_input_shape(0); + if (!dims.size()) + dims = SizeVector(1, 1); + int axis = logSoftMax->get_axis(); + if (axis < 0) + axis += dims.size(); + + if (dims.size() < static_cast((size_t)(1) + axis)) + IE_THROW() << errorPrefix << " has incorrect input parameters dimensions and axis number!"; + + int j; + for (j = dims.size() - 1; j >= 0; j--) { + if (dims[j] != 1) break; + } + if (j == axis) isLastDim = true; + + for (int i = 0; i < axis; i++) + axisStep *= dims[i]; + reducedAxisSize = dims[axis]; + for (size_t i = (axis + 1); i < dims.size(); i++) + reducedAxisStride *= dims[i]; +} + +void MKLDNNLogSoftmaxNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNLogSoftmaxNode::execute(mkldnn::stream strm) { + const float 
*srcData = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + float* dstData = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + if (isLastDim) { + parallel_for(axisStep, [&](size_t i) { + const float *srcDataPtr = &srcData[i * reducedAxisSize]; + float *dstDataPtr = &dstData[i * reducedAxisSize]; + + float reduceProd = 0.0f; + const float max = *std::max_element(srcDataPtr, srcDataPtr + reducedAxisSize); + for (size_t j = 0; j < reducedAxisSize; ++j) + reduceProd += expf(srcDataPtr[j] - max); + + reduceProd = logf(reduceProd); + for (size_t j = 0; j < reducedAxisSize; ++j) + dstDataPtr[j] = srcDataPtr[j] - max - reduceProd; + }); + } else { + parallel_for2d(axisStep, reducedAxisStride, [&](size_t k, size_t i) { + const float *srcDataPtr = &srcData[k * reducedAxisStride * reducedAxisSize + i]; + float *dstDataPtr = &dstData[k * reducedAxisStride * reducedAxisSize + i]; + + float reduceProd = 0.0f; + float max = std::numeric_limits::min(); + for (size_t j = 0; j < reducedAxisSize; ++j) { + if (srcDataPtr[j * reducedAxisStride] > max) + max = srcDataPtr[j * reducedAxisStride]; + } + + for (size_t j = 0; j < reducedAxisSize; ++j) + reduceProd += expf(srcDataPtr[j * reducedAxisStride] - max); + + reduceProd = logf(reduceProd); + for (size_t j = 0; j < reducedAxisSize; ++j) + dstDataPtr[j * reducedAxisStride] = srcDataPtr[j * reducedAxisStride] - max - reduceProd; + }); + } +} + +bool MKLDNNLogSoftmaxNode::created() const { + return getType() == LogSoftmax; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNLogSoftmaxNode, LogSoftmax) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h new file mode 100644 index 00000000000000..456d7321efcdc4 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + 
+#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNLogSoftmaxNode : public MKLDNNNode { +public: + MKLDNNLogSoftmaxNode(const std::shared_ptr& op, + const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + size_t reducedAxisSize; + size_t reducedAxisStride = 1; + size_t axisStep = 1; + bool isLastDim = false; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp index ecfa4fbbd32468..908686bf6df1eb 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp @@ -43,17 +43,17 @@ MKLDNNMathNode::MKLDNNMathNode(const std::shared_ptr& op, const mk } initializers[op->get_type_info()](op, *this); - - size_t sizeVector = op->get_input_size(); - inDataConf.reserve(sizeVector); - for (int i = 0; i < sizeVector; ++i) - inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); } void MKLDNNMathNode::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + addSupportedPrimDesc(inDataConf, {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, impl_desc_type::ref_any); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h index a91cb3ae373d9c..28260dc476ec54 100644 --- 
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h @@ -28,7 +28,6 @@ class MKLDNNMathNode : public MKLDNNNode { float beta = 0.0f; float gamma = 0.0f; - std::vector inDataConf; std::string errorPrefix; }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp new file mode 100644 index 00000000000000..093127eada5f9a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp @@ -0,0 +1,406 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include + +#include "mkldnn_non_max_suppression_node.h" +#include "ie_parallel.hpp" +#include +#include "utils/general_utils.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNNonMaxSuppressionNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto nms = std::dynamic_pointer_cast(op); + if (!nms) { + errorMessage = "Only internal NonMaxSuppression operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNNonMaxSuppressionNode::MKLDNNNonMaxSuppressionNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "NMS layer with name '" + op->get_friendly_name() + "' "; + const auto nms = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > 6) + IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getOriginalInputsNumber(); + + if (getOriginalOutputsNumber() < 1 || getOriginalOutputsNumber() > 3) + IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getOriginalOutputsNumber(); + + boxEncodingType = nms->m_center_point_box ? boxEncoding::CENTER : boxEncoding::CORNER; + + sort_result_descending = nms->m_sort_result_descending; + + const SizeVector &boxes_dims = op->get_input_shape(NMS_BOXES); + num_batches = boxes_dims[0]; + num_boxes = boxes_dims[1]; + if (boxes_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input rank: " << boxes_dims.size(); + if (boxes_dims[2] != 4) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input 3rd dimension size: " << boxes_dims[2]; + + const SizeVector &scores_dims = op->get_input_shape(NMS_SCORES); + num_classes = scores_dims[1]; + if (scores_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'scores' input rank: " << scores_dims.size(); + + if (num_batches != scores_dims[0]) + IE_THROW() << errorPrefix << " num_batches is different in 'boxes' and 'scores' inputs"; + if (num_boxes != scores_dims[2]) + IE_THROW() << errorPrefix << " num_boxes is different in 'boxes' and 'scores' inputs"; + + numFiltBox.resize(num_batches); + for (auto & i : numFiltBox) + i.resize(num_classes); + + inputShape_MAXOUTPUTBOXESPERCLASS = 
op->get_input_shape(NMS_MAXOUTPUTBOXESPERCLASS); + inputShape_IOUTHRESHOLD = op->get_input_shape(NMS_IOUTHRESHOLD); + inputShape_SCORETHRESHOLD = op->get_input_shape(NMS_SCORETHRESHOLD); + if (getOriginalInputsNumber() > NMS_SOFTNMSSIGMA) { + inputShape_SOFTNMSSIGMA = op->get_input_shape(NMS_SOFTNMSSIGMA); + } + + outputShape_SELECTEDINDICES = op->get_output_shape(NMS_SELECTEDINDICES); + outputShape_SELECTEDSCORES = op->get_output_shape(NMS_SELECTEDSCORES); + + const SizeVector &valid_outputs_dims = op->get_input_shape(NMS_VALIDOUTPUTS); + if (valid_outputs_dims.size() != 1) + IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output rank: " << valid_outputs_dims.size(); + if (valid_outputs_dims[0] != 1) + IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output 1st dimension size: " << valid_outputs_dims[1]; +} + +void MKLDNNNonMaxSuppressionNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + const std::vector supportedFloatPrecision = {Precision::FP32, Precision::BF16}; + const std::vector supportedIntOutputPrecision = {Precision::I32, Precision::I64}; + + checkPrecision(getOriginalInputPrecisionAtPort(NMS_BOXES), supportedFloatPrecision, "boxes", inType); + checkPrecision(getOriginalInputPrecisionAtPort(NMS_SCORES), supportedFloatPrecision, "scores", inType); + checkPrecision(getOriginalInputPrecisionAtPort(NMS_VALIDOUTPUTS), supportedIntOutputPrecision, "valid_outputs", outType); + + const std::vector supportedPrecision = {Precision::I16, Precision::U8, Precision::I8, Precision::U16, Precision::I32, + Precision::U32, Precision::I64, Precision::U64}; + + check1DInput(inputShape_MAXOUTPUTBOXESPERCLASS, supportedPrecision, "max_output_boxes_per_class", NMS_MAXOUTPUTBOXESPERCLASS); + check1DInput(inputShape_IOUTHRESHOLD, supportedFloatPrecision, "iou_threshold", NMS_IOUTHRESHOLD); + check1DInput(inputShape_SCORETHRESHOLD, supportedFloatPrecision, "score_threshold", NMS_SCORETHRESHOLD); + + 
if (getOriginalInputsNumber() > NMS_SOFTNMSSIGMA) { + check1DInput(inputShape_SOFTNMSSIGMA, supportedFloatPrecision, "soft_nms_sigma", NMS_SOFTNMSSIGMA); + } + + checkOutput(outputShape_SELECTEDINDICES, supportedIntOutputPrecision, "selected_indices", NMS_SELECTEDINDICES); + checkOutput(outputShape_SELECTEDSCORES, supportedFloatPrecision, "selected_scores", NMS_SELECTEDSCORES); + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) { + Precision inPrecision = i == NMS_MAXOUTPUTBOXESPERCLASS ? Precision::I32 : Precision::FP32; + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, inPrecision); + } + + std::vector outDataConf; + outDataConf.reserve(getOriginalOutputsNumber()); + for (int i = 0; i < getOriginalOutputsNumber(); ++i) { + Precision outPrecision = i == NMS_SELECTEDSCORES ? Precision::FP32 : Precision::I32; + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, outPrecision); + } + + addSupportedPrimDesc(inDataConf, outDataConf, impl_desc_type::ref_any); +} + +void MKLDNNNonMaxSuppressionNode::execute(mkldnn::stream strm) { + const float *boxes = reinterpret_cast(getParentEdgeAt(NMS_BOXES)->getMemoryPtr()->GetPtr()); + const float *scores = reinterpret_cast(getParentEdgeAt(NMS_SCORES)->getMemoryPtr()->GetPtr()); + + max_output_boxes_per_class = outDims.size() > NMS_SELECTEDSCORES ? 0 : num_boxes; + if (inDims.size() > NMS_MAXOUTPUTBOXESPERCLASS) { + max_output_boxes_per_class = reinterpret_cast(getParentEdgeAt(NMS_MAXOUTPUTBOXESPERCLASS)->getMemoryPtr()->GetPtr())[0]; + } + + if (max_output_boxes_per_class == 0) + return; + + iou_threshold = outDims.size() > NMS_SELECTEDSCORES ? 
0.0f : 1.0f; + if (inDims.size() > NMS_IOUTHRESHOLD) + iou_threshold = reinterpret_cast(getParentEdgeAt(NMS_IOUTHRESHOLD)->getMemoryPtr()->GetPtr())[0]; + + score_threshold = 0.0f; + if (inDims.size() > NMS_SCORETHRESHOLD) + score_threshold = reinterpret_cast(getParentEdgeAt(NMS_SCORETHRESHOLD)->getMemoryPtr()->GetPtr())[0]; + + soft_nms_sigma = 0.0f; + if (inDims.size() > NMS_SOFTNMSSIGMA) + soft_nms_sigma = reinterpret_cast(getParentEdgeAt(NMS_SOFTNMSSIGMA)->getMemoryPtr()->GetPtr())[0]; + scale = 0.0f; + if (soft_nms_sigma > 0.0) { + scale = -0.5 / soft_nms_sigma; + } + + int *selected_indices = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getMemoryPtr()->GetPtr()); + + float *selected_scores = nullptr; + if (outDims.size() > NMS_SELECTEDSCORES) + selected_scores = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTEDSCORES)[0]->getMemoryPtr()->GetPtr()); + + int *valid_outputs = nullptr; + if (outDims.size() > NMS_VALIDOUTPUTS) + valid_outputs = reinterpret_cast(getChildEdgesAtPort(NMS_VALIDOUTPUTS)[0]->getMemoryPtr()->GetPtr()); + + auto boxesStrides = getParentEdgeAt(NMS_BOXES)->getDesc().getBlockingDesc().getStrides(); + auto scoresStrides = getParentEdgeAt(NMS_SCORES)->getDesc().getBlockingDesc().getStrides(); + + std::vector filtBoxes(max_output_boxes_per_class * num_batches * num_classes); + + if (soft_nms_sigma == 0.0f) { + nmsWithoutSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + } else { + nmsWithSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + } + + size_t startOffset = numFiltBox[0][0]; + for (size_t b = 0; b < numFiltBox.size(); b++) { + size_t batchOffset = b*num_classes*max_output_boxes_per_class; + for (size_t c = (b == 0 ? 
1 : 0); c < numFiltBox[b].size(); c++) { + size_t offset = batchOffset + c*max_output_boxes_per_class; + for (size_t i = 0; i < numFiltBox[b][c]; i++) { + filtBoxes[startOffset + i] = filtBoxes[offset + i]; + } + startOffset += numFiltBox[b][c]; + } + } + filtBoxes.resize(startOffset); + + // need more particular comparator to get deterministic behaviour + // escape situation when filtred boxes with same score have different position from launch to launch + if (sort_result_descending) { + parallel_sort(filtBoxes.begin(), filtBoxes.end(), + [](const filteredBoxes& l, const filteredBoxes& r) { + return (l.score > r.score) || + (l.score == r.score && l.batch_index < r.batch_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index); + }); + } + + const size_t selectedBoxesNum = getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getDims()[0]; + const size_t validOutputs = std::min(filtBoxes.size(), selectedBoxesNum); + + int selectedIndicesStride = getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getDesc().getBlockingDesc().getStrides()[0]; + int *selectedIndicesPtr = selected_indices; + float *selectedScoresPtr = selected_scores; + + size_t idx = 0lu; + for (; idx < validOutputs; idx++) { + selectedIndicesPtr[0] = filtBoxes[idx].batch_index; + selectedIndicesPtr[1] = filtBoxes[idx].class_index; + selectedIndicesPtr[2] = filtBoxes[idx].box_index; + selectedIndicesPtr += selectedIndicesStride; + if (outDims.size() > NMS_SELECTEDSCORES) { + selectedScoresPtr[0] = static_cast(filtBoxes[idx].batch_index); + selectedScoresPtr[1] = static_cast(filtBoxes[idx].class_index); + selectedScoresPtr[2] = static_cast(filtBoxes[idx].score); + selectedScoresPtr += selectedIndicesStride; + } + } + std::fill(selectedIndicesPtr, selectedIndicesPtr + (selectedBoxesNum - idx) * selectedIndicesStride, -1); + if (outDims.size() > 
NMS_SELECTEDSCORES) { + std::fill(selectedScoresPtr, selectedScoresPtr + (selectedBoxesNum - idx) * selectedIndicesStride, -1.f); + } + if (outDims.size() > NMS_VALIDOUTPUTS) + *valid_outputs = static_cast(validOutputs); +} + +bool MKLDNNNonMaxSuppressionNode::created() const { + return getType() == NonMaxSuppression; +} + +float MKLDNNNonMaxSuppressionNode::intersectionOverUnion(const float *boxesI, const float *boxesJ) { + float yminI, xminI, ymaxI, xmaxI, yminJ, xminJ, ymaxJ, xmaxJ; + if (boxEncodingType == boxEncoding::CENTER) { + // box format: x_center, y_center, width, height + yminI = boxesI[1] - boxesI[3] / 2.f; + xminI = boxesI[0] - boxesI[2] / 2.f; + ymaxI = boxesI[1] + boxesI[3] / 2.f; + xmaxI = boxesI[0] + boxesI[2] / 2.f; + yminJ = boxesJ[1] - boxesJ[3] / 2.f; + xminJ = boxesJ[0] - boxesJ[2] / 2.f; + ymaxJ = boxesJ[1] + boxesJ[3] / 2.f; + xmaxJ = boxesJ[0] + boxesJ[2] / 2.f; + } else { + // box format: y1, x1, y2, x2 + yminI = (std::min)(boxesI[0], boxesI[2]); + xminI = (std::min)(boxesI[1], boxesI[3]); + ymaxI = (std::max)(boxesI[0], boxesI[2]); + xmaxI = (std::max)(boxesI[1], boxesI[3]); + yminJ = (std::min)(boxesJ[0], boxesJ[2]); + xminJ = (std::min)(boxesJ[1], boxesJ[3]); + ymaxJ = (std::max)(boxesJ[0], boxesJ[2]); + xmaxJ = (std::max)(boxesJ[1], boxesJ[3]); + } + + float areaI = (ymaxI - yminI) * (xmaxI - xminI); + float areaJ = (ymaxJ - yminJ) * (xmaxJ - xminJ); + if (areaI <= 0.f || areaJ <= 0.f) + return 0.f; + + float intersection_area = + (std::max)((std::min)(ymaxI, ymaxJ) - (std::max)(yminI, yminJ), 0.f) * + (std::max)((std::min)(xmaxI, xmaxJ) - (std::max)(xminI, xminJ), 0.f); + return intersection_area / (areaI + areaJ - intersection_area); +} + +void MKLDNNNonMaxSuppressionNode::nmsWithSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes) { + auto less = [](const boxInfo& l, const boxInfo& r) { + return l.score < r.score || ((l.score == r.score) && 
(l.idx > r.idx)); + }; + + auto coeff = [&](float iou) { + const float weight = std::exp(scale * iou * iou); + return iou <= iou_threshold ? weight : 0.0f; + }; + + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + std::vector fb; + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::priority_queue, decltype(less)> sorted_boxes(less); + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] > score_threshold) + sorted_boxes.emplace(boxInfo({scoresPtr[box_idx], box_idx, 0})); + } + + fb.reserve(sorted_boxes.size()); + if (sorted_boxes.size() > 0) { + while (fb.size() < max_output_boxes_per_class && !sorted_boxes.empty()) { + boxInfo currBox = sorted_boxes.top(); + float origScore = currBox.score; + sorted_boxes.pop(); + + bool box_is_selected = true; + for (int idx = static_cast(fb.size()) - 1; idx >= currBox.suppress_begin_index; idx--) { + float iou = intersectionOverUnion(&boxesPtr[currBox.idx * 4], &boxesPtr[fb[idx].box_index * 4]); + currBox.score *= coeff(iou); + if (iou >= iou_threshold) { + box_is_selected = false; + break; + } + if (currBox.score <= score_threshold) + break; + } + + currBox.suppress_begin_index = fb.size(); + if (box_is_selected) { + if (currBox.score == origScore) { + fb.push_back({ currBox.score, batch_idx, class_idx, currBox.idx }); + continue; + } + if (currBox.score > score_threshold) { + sorted_boxes.push(currBox); + } + } + } + } + numFiltBox[batch_idx][class_idx] = fb.size(); + size_t offset = batch_idx*num_classes*max_output_boxes_per_class + class_idx*max_output_boxes_per_class; + for (size_t i = 0; i < fb.size(); i++) { + filtBoxes[offset + i] = fb[i]; + } + }); +} + +void MKLDNNNonMaxSuppressionNode::nmsWithoutSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes) { + int 
max_out_box = static_cast(max_output_boxes_per_class); + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::vector> sorted_boxes; + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] > score_threshold) + sorted_boxes.emplace_back(std::make_pair(scoresPtr[box_idx], box_idx)); + } + + int io_selection_size = 0; + if (sorted_boxes.size() > 0) { + parallel_sort(sorted_boxes.begin(), sorted_boxes.end(), + [](const std::pair& l, const std::pair& r) { + return (l.first > r.first || ((l.first == r.first) && (l.second < r.second))); + }); + int offset = batch_idx*num_classes*max_output_boxes_per_class + class_idx*max_output_boxes_per_class; + filtBoxes[offset + 0] = filteredBoxes(sorted_boxes[0].first, batch_idx, class_idx, sorted_boxes[0].second); + io_selection_size++; + for (size_t box_idx = 1; (box_idx < sorted_boxes.size()) && (io_selection_size < max_out_box); box_idx++) { + bool box_is_selected = true; + for (int idx = io_selection_size - 1; idx >= 0; idx--) { + float iou = intersectionOverUnion(&boxesPtr[sorted_boxes[box_idx].second * 4], &boxesPtr[filtBoxes[offset + idx].box_index * 4]); + if (iou >= iou_threshold) { + box_is_selected = false; + break; + } + } + + if (box_is_selected) { + filtBoxes[offset + io_selection_size] = filteredBoxes(sorted_boxes[box_idx].first, batch_idx, class_idx, sorted_boxes[box_idx].second); + io_selection_size++; + } + } + } + numFiltBox[batch_idx][class_idx] = io_selection_size; + }); +} + +void MKLDNNNonMaxSuppressionNode::checkPrecision(const Precision prec, const std::vector precList, + const std::string name, const std::string type) { + if (std::find(precList.begin(), precList.end(), prec) == precList.end()) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' " << type << " precision: " << prec; 
+} + +void MKLDNNNonMaxSuppressionNode::check1DInput(const SizeVector& dims, const std::vector precList, + const std::string name, const size_t port) { + checkPrecision(getOriginalInputPrecisionAtPort(port), precList, name, inType); + + if (dims.size() != 0 && dims.size() != 1) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' input rank: " << dims.size(); + if (dims.size() == 1) + if (dims[0] != 1) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' input 1st dimension size: " << dims[0]; +} + +void MKLDNNNonMaxSuppressionNode::checkOutput(const SizeVector& dims, const std::vector precList, + const std::string name, const size_t port) { + checkPrecision(getOriginalOutputPrecisionAtPort(port), precList, name, outType); + + if (dims.size() != 2) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' output rank: " << dims.size(); + if (dims[1] != 3) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' output 2nd dimension size: " << dims[1]; +} + + +REG_MKLDNN_PRIM_FOR(MKLDNNNonMaxSuppressionNode, NonMaxSuppression) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h new file mode 100644 index 00000000000000..4651da1f2e795c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h @@ -0,0 +1,102 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +using namespace InferenceEngine; + +namespace MKLDNNPlugin { + +class MKLDNNNonMaxSuppressionNode : public MKLDNNNode { +public: + MKLDNNNonMaxSuppressionNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream 
strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + struct filteredBoxes { + float score; + int batch_index; + int class_index; + int box_index; + filteredBoxes() = default; + filteredBoxes(float _score, int _batch_index, int _class_index, int _box_index) : + score(_score), batch_index(_batch_index), class_index(_class_index), box_index(_box_index) {} + }; + + struct boxInfo { + float score; + int idx; + int suppress_begin_index; + }; + + float intersectionOverUnion(const float *boxesI, const float *boxesJ); + + void nmsWithSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes); + + void nmsWithoutSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes); + +private: + // input + const size_t NMS_BOXES = 0; + const size_t NMS_SCORES = 1; + const size_t NMS_MAXOUTPUTBOXESPERCLASS = 2; + const size_t NMS_IOUTHRESHOLD = 3; + const size_t NMS_SCORETHRESHOLD = 4; + const size_t NMS_SOFTNMSSIGMA = 5; + + // output + const size_t NMS_SELECTEDINDICES = 0; + const size_t NMS_SELECTEDSCORES = 1; + const size_t NMS_VALIDOUTPUTS = 2; + + enum class boxEncoding { + CORNER, + CENTER + }; + boxEncoding boxEncodingType = boxEncoding::CORNER; + bool sort_result_descending = true; + + size_t num_batches; + size_t num_boxes; + size_t num_classes; + + size_t max_output_boxes_per_class = 0lu; + float iou_threshold = 0.0f; + float score_threshold = 0.0f; + float soft_nms_sigma = 0.0f; + float scale = 1.f; + + SizeVector inputShape_MAXOUTPUTBOXESPERCLASS; + SizeVector inputShape_IOUTHRESHOLD; + SizeVector inputShape_SCORETHRESHOLD; + SizeVector inputShape_SOFTNMSSIGMA; + + SizeVector outputShape_SELECTEDINDICES; + SizeVector outputShape_SELECTEDSCORES; + + std::string errorPrefix; + + std::vector> numFiltBox; + const 
std::string inType = "input", outType = "output"; + + void checkPrecision(const Precision prec, const std::vector precList, const std::string name, const std::string type); + void check1DInput(const SizeVector& dims, const std::vector precList, const std::string name, const size_t port); + void checkOutput(const SizeVector& dims, const std::vector precList, const std::string name, const size_t port); +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp new file mode 100644 index 00000000000000..584960373aeb2e --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp @@ -0,0 +1,198 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_proposal_node.h" + +static std::vector generate_anchors(proposal_conf &conf) { + auto base_size = conf.base_size_; + auto coordinates_offset = conf.coordinates_offset; + auto round_ratios = conf.round_ratios; + + auto num_ratios = conf.ratios.size(); + auto ratios = conf.ratios.data(); + + auto num_scales = conf.scales.size(); + auto scales = conf.scales.data(); + + std::vector anchors(num_scales * num_ratios * 4); + auto anchors_ptr = anchors.data(); + + // base box's width & height & center location + const float base_area = static_cast(base_size * base_size); + const float half_base_size = base_size * 0.5f; + const float center = 0.5f * (base_size - coordinates_offset); + + // enumerate all transformed boxes + for (int ratio = 0; ratio < num_ratios; ++ratio) { + // transformed width & height for given ratio factors + float ratio_w; + float ratio_h; + if (round_ratios) { + ratio_w = std::roundf(std::sqrt(base_area / ratios[ratio])); + ratio_h = std::roundf(ratio_w * ratios[ratio]); + } else { + ratio_w = std::sqrt(base_area / ratios[ratio]); 
+ ratio_h = ratio_w * ratios[ratio]; + } + + float * const p_anchors_wm = anchors_ptr + 0 * num_ratios * num_scales + ratio * num_scales; + float * const p_anchors_hm = anchors_ptr + 1 * num_ratios * num_scales + ratio * num_scales; + float * const p_anchors_wp = anchors_ptr + 2 * num_ratios * num_scales + ratio * num_scales; + float * const p_anchors_hp = anchors_ptr + 3 * num_ratios * num_scales + ratio * num_scales; + + for (int scale = 0; scale < num_scales; ++scale) { + // transformed width & height for given scale factors + const float scale_w = 0.5f * (ratio_w * scales[scale] - coordinates_offset); + const float scale_h = 0.5f * (ratio_h * scales[scale] - coordinates_offset); + + // (x1, y1, x2, y2) for transformed box + p_anchors_wm[scale] = center - scale_w; + p_anchors_hm[scale] = center - scale_h; + p_anchors_wp[scale] = center + scale_w; + p_anchors_hp[scale] = center + scale_h; + + if (conf.shift_anchors) { + p_anchors_wm[scale] -= half_base_size; + p_anchors_hm[scale] -= half_base_size; + p_anchors_wp[scale] -= half_base_size; + p_anchors_hp[scale] -= half_base_size; + } + } + } + return anchors; +} + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNProposalNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto proposal0Op = ngraph::as_type_ptr(op); + const auto proposal4Op = ngraph::as_type_ptr(op); + if (!proposal0Op && !proposal4Op) { + errorMessage = "Node is not an instance of the Proposal from the operations set v0 or v4."; + return false; + } + auto proposalOp = std::dynamic_pointer_cast(op); + if (proposalOp->get_attrs().framework != "tensorflow" && !proposalOp->get_attrs().framework.empty()) { + errorMessage = "Unsupported framework attribute: " + proposalOp->get_attrs().framework; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNProposalNode::MKLDNNProposalNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + auto proposalOp = std::dynamic_pointer_cast(op); + auto proposalAttrs = proposalOp->get_attrs(); + + conf.feat_stride_ = proposalAttrs.feat_stride; + conf.base_size_ = proposalAttrs.base_size; + conf.min_size_ = proposalAttrs.min_size; + conf.pre_nms_topn_ = proposalAttrs.pre_nms_topn; + conf.post_nms_topn_ = proposalAttrs.post_nms_topn; + conf.nms_thresh_ = proposalAttrs.nms_thresh; + conf.box_coordinate_scale_ = proposalAttrs.box_coordinate_scale; + conf.box_size_scale_ = proposalAttrs.box_size_scale; + conf.scales = proposalAttrs.scale; + conf.ratios = proposalAttrs.ratio; + conf.normalize_ = proposalAttrs.normalize; + conf.clip_before_nms = proposalAttrs.clip_before_nms; + conf.clip_after_nms = proposalAttrs.clip_after_nms; + conf.anchors_shape_0 = conf.ratios.size() * conf.scales.size(); + + if (proposalAttrs.framework == "tensorflow") { + conf.coordinates_offset = 0.0f; + conf.initial_clip = true; + conf.shift_anchors = true; + conf.round_ratios = false; + conf.swap_xy = true; + } else { + conf.coordinates_offset = 1.0f; + conf.initial_clip = false; + conf.shift_anchors = false; + conf.round_ratios = true; + conf.swap_xy = false; + } + + anchors = generate_anchors(conf); + roi_indices.resize(conf.post_nms_topn_); + + store_prob = op->get_output_size() == 2; +} + +void MKLDNNProposalNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + if (store_prob) { + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + 
{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); + } else { + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); + } +} + +void MKLDNNProposalNode::execute(mkldnn::stream strm) { + try { + const float* probabilitiesData = reinterpret_cast(getParentEdgeAt(PROBABILITIES_IN_IDX)->getMemoryPtr()->GetPtr()); + const float* anchorsData = reinterpret_cast(getParentEdgeAt(ANCHORS_IN_IDX)->getMemoryPtr()->GetPtr()); + const float* imgInfoData = reinterpret_cast(getParentEdgeAt(IMG_INFO_IN_IDX)->getMemoryPtr()->GetPtr()); + float* outRoiData = reinterpret_cast (getChildEdgesAtPort(ROI_OUT_IDX)[0]->getMemoryPtr()->GetPtr()); + float* outProbData = nullptr; + if (store_prob) + outProbData = reinterpret_cast (getChildEdgesAtPort(PROBABILITIES_OUT_IDX)[0]->getMemoryPtr()->GetPtr()); + + auto inProbDims = getParentEdgeAt(0)->getDims().ToSizeVector(); + const size_t imgInfoSize = getParentEdgeAt(2)->getDims()[0]; + + // input image height & width + const float imgHeight = imgInfoData[0]; + const float imgWidth = imgInfoData[1]; + if (!std::isnormal(imgHeight) || !std::isnormal(imgWidth) || (imgHeight < 0.f) || (imgWidth < 0.f)) { + IE_THROW() << "Proposal operation image info input must have positive image height and width."; + } + + // scale factor for height & width + const float scaleHeight = imgInfoData[2]; + const float scaleWidth = imgInfoSize == 4 ? 
imgInfoData[3] : scaleHeight; + if (!std::isfinite(scaleHeight) || !std::isfinite(scaleWidth) || (scaleHeight < 0.f) || (scaleWidth < 0.f)) { + IE_THROW() << "Proposal operation image info input must have non negative scales."; + } + + InferenceEngine::Extensions::Cpu::XARCH::proposal_exec(probabilitiesData, anchorsData, inProbDims, + {imgHeight, imgWidth, scaleHeight, scaleWidth}, anchors.data(), roi_indices.data(), outRoiData, outProbData, conf); + } catch (const InferenceEngine::Exception& e) { + std::string errorMsg = e.what(); + IE_THROW() << errorMsg; + } +} + +bool MKLDNNProposalNode::created() const { + return getType() == Proposal; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNProposalNode, Proposal) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h new file mode 100644 index 00000000000000..4fdb333b25921b --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "proposal_imp.hpp" + +using proposal_conf = InferenceEngine::Extensions::Cpu::proposal_conf; + +namespace MKLDNNPlugin { + +class MKLDNNProposalNode : public MKLDNNNode { +public: + MKLDNNProposalNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t PROBABILITIES_IN_IDX = 0lu; + const size_t ANCHORS_IN_IDX = 1lu; + const size_t IMG_INFO_IN_IDX = 2lu; + const size_t ROI_OUT_IDX = 0lu; + const size_t PROBABILITIES_OUT_IDX = 1lu; + + proposal_conf conf; + std::vector 
anchors; + std::vector roi_indices; + bool store_prob; // store blob with proposal probabilities + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp new file mode 100644 index 00000000000000..33e625fce6f88a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_range_node.h" +#include + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNRangeNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + if (!MKLDNNPlugin::one_of(op->get_type_info(), ngraph::op::v0::Range::type_info, ngraph::op::v4::Range::type_info)) { + errorMessage = "Only opset1 and opset4 Range operation is supported"; + return false; + } + if (std::dynamic_pointer_cast(op->get_input_node_shared_ptr(RANGE_START)) == nullptr || + std::dynamic_pointer_cast(op->get_input_node_shared_ptr(RANGE_LIMIT)) == nullptr || + std::dynamic_pointer_cast(op->get_input_node_shared_ptr(RANGE_DELTA)) == nullptr) { + errorMessage = "Only const inputs for Range operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNRangeNode::MKLDNNRangeNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "Range layer with name '" + op->get_friendly_name() + "'"; + + if (getOriginalInputsNumber() != 3 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + SizeVector start_dims = op->get_input_shape(RANGE_START); + if (ngraph::shape_size(start_dims) != 1) + IE_THROW() << errorPrefix << " has start scalar with more than 1 value"; + + SizeVector limit_dims = op->get_input_shape(RANGE_LIMIT); + if (ngraph::shape_size(limit_dims) != 1) + IE_THROW() << errorPrefix << " has limit scalar with more than 1 value"; + + SizeVector delta_dims = op->get_input_shape(RANGE_DELTA); + if (ngraph::shape_size(delta_dims) != 1) + IE_THROW() << errorPrefix << " has delta scalar with more than 1 value"; + + SizeVector dst_dims = op->get_output_shape(0); + if (dst_dims.size() > 1) + IE_THROW() << errorPrefix << " has unsupported rank for output: " << dst_dims.size(); +} + +void MKLDNNRangeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + std::vector outDataConf; + + if (!(getOriginalInputPrecisionAtPort(RANGE_START) == Precision::I32 && + getOriginalInputPrecisionAtPort(RANGE_LIMIT) == Precision::I32 && + getOriginalInputPrecisionAtPort(RANGE_DELTA) == Precision::I32 && + getOriginalOutputPrecisionAtPort(0) == Precision::I32) && + !(getOriginalInputPrecisionAtPort(RANGE_START) == Precision::FP32 && + getOriginalInputPrecisionAtPort(RANGE_LIMIT) == Precision::FP32 && + getOriginalInputPrecisionAtPort(RANGE_DELTA) == Precision::FP32 && + getOriginalOutputPrecisionAtPort(0) == Precision::FP32)) { + 
inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + outDataConf.reserve(1); + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + addSupportedPrimDesc(inDataConf, outDataConf, impl_desc_type::ref_any); + } else { + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp); + outDataConf.reserve(1); + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp); + addSupportedPrimDesc(inDataConf, outDataConf, impl_desc_type::ref_any); + } +} + +void MKLDNNRangeNode::execute(mkldnn::stream strm) { + StatusCode retcode = OK; + switch (getParentEdgeAt(0)->getDesc().getPrecision()) { + case Precision::FP32: + retcode = rangeKernel(); + break; + case Precision::I32: + retcode = rangeKernel(); + break; + default: + IE_THROW() << "Incorrect output precision. 
Only FP32 and I32 are supported!"; + } + if (retcode == PARAMETER_MISMATCH) { + std::string errorMsg = "Range indexes exceeds data tensor dimension"; + IE_THROW() << errorMsg; + } +} + +template +InferenceEngine::StatusCode MKLDNNRangeNode::rangeKernel() noexcept { + size_t dst_size = (getChildEdgesAtPort(0)[0]->getDims())[0]; + data_t* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + data_t start = reinterpret_cast(getParentEdgeAt(RANGE_START)->getMemoryPtr()->GetPtr())[0]; + data_t limit = reinterpret_cast(getParentEdgeAt(RANGE_LIMIT)->getMemoryPtr()->GetPtr())[0]; + data_t delta = reinterpret_cast(getParentEdgeAt(RANGE_DELTA)->getMemoryPtr()->GetPtr())[0]; + size_t work_amount_dst = static_cast(std::floor(std::abs((limit - start) / delta))); + if (work_amount_dst != dst_size) + return PARAMETER_MISMATCH; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t iwork = 0, end = 0; + splitter(work_amount_dst, nthr, ithr, iwork, end); + data_t dst_value = start + iwork * delta; + + for (; iwork < end; ++iwork, dst_value += delta) { + dst_data[iwork] = dst_value; + } + }); + return OK; +} + +bool MKLDNNRangeNode::created() const { + return getType() == Range; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNRangeNode, Range) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h new file mode 100644 index 00000000000000..b5584be6aa949c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNRangeNode : public MKLDNNNode { +public: + MKLDNNRangeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void 
createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + template + InferenceEngine::StatusCode rangeKernel() noexcept; +private: + static const size_t RANGE_START = 0; + static const size_t RANGE_LIMIT = 1; + static const size_t RANGE_DELTA = 2; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp new file mode 100644 index 00000000000000..3db7470e92fba9 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_reorg_yolo_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNReorgYoloNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto reorgYolo = std::dynamic_pointer_cast(op); + if (!reorgYolo) { + errorMessage = "Only opset2 ReorgYolo operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNReorgYoloNode::MKLDNNReorgYoloNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = std::string(op->get_type_name()) + " node with name '" + op->get_friendly_name() + "'"; + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + const auto reorgYolo = std::dynamic_pointer_cast(op); + const auto strides = reorgYolo->get_strides(); + if (strides.empty()) + IE_THROW() << errorPrefix << " has empty strides"; + stride = strides[0]; +} + +void MKLDNNReorgYoloNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNReorgYoloNode::execute(mkldnn::stream strm) { + const auto *src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + auto *dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + int IW = (getParentEdgeAt(0)->getDesc().getDims().size() > 3) ? getParentEdgeAt(0)->getDims()[3] : 1; + int IH = (getParentEdgeAt(0)->getDesc().getDims().size() > 2) ? getParentEdgeAt(0)->getDims()[2] : 1; + int IC = (getParentEdgeAt(0)->getDesc().getDims().size() > 1) ? getParentEdgeAt(0)->getDims()[1] : 1; + int B = (getParentEdgeAt(0)->getDesc().getDims().size() > 0) ? 
getParentEdgeAt(0)->getDims()[0] : 1; + + int ic_off = IC / (stride * stride); + int ih_off = IH * stride; + int iw_off = IW * stride; + for (int b = 0; b < B; b++) { + for (int ic = 0; ic < IC; ic++) { + for (int ih = 0; ih < IH; ih++) { + for (int iw = 0; iw < IW; iw++) { + int dstIndex = b * IC * IH * IW + ic * IH * IW + ih * IW + iw; + + int oc = ic % ic_off; + int offset = ic / ic_off; + + int ow = iw * stride + offset % stride; + int oh = ih * stride + offset / stride; + + int srcIndex = b * ic_off * ih_off * iw_off + oc * ih_off * iw_off + oh * iw_off + ow; + + dst_data[dstIndex] = src_data[srcIndex]; + } + } + } + } +} + +bool MKLDNNReorgYoloNode::created() const { + return getType() == ReorgYolo; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNReorgYoloNode, ReorgYolo) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h new file mode 100644 index 00000000000000..b88f19010e0491 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNReorgYoloNode : public MKLDNNNode { +public: + MKLDNNReorgYoloNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + int stride; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp new file mode 
100644 index 00000000000000..5f6e6083e90c4a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp @@ -0,0 +1,182 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_reverse_sequence_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNReverseSequenceNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto revSeq = std::dynamic_pointer_cast(op); + if (!revSeq) { + errorMessage = "Only opset1 ReverseSequence operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNReverseSequenceNode::MKLDNNReverseSequenceNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "ReverseSequence layer with name '" + op->get_friendly_name() + "'"; + const auto revSeq = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 2 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + src_dims = op->get_input_shape(REVERSESEQUENCE_DATA); + + SizeVector seq_lengths_dims = op->get_input_shape(REVERSESEQUENCE_LENGTHS); + if (seq_lengths_dims.size() != 1) + IE_THROW() << errorPrefix << " has incorrect 2nd input rank: " << seq_lengths_dims.size(); + + SizeVector dst_dims = op->get_output_shape(0); + if (src_dims.size() != dst_dims.size()) + IE_THROW() << errorPrefix << " has incorrect number of input/output sizes!"; + + for (size_t i = 0; i < dst_dims.size(); i++) { + if (src_dims[i] != dst_dims[i]) + IE_THROW() << errorPrefix << " has incorrect number of input/output dimension!"; + } + + 
seq_axis = revSeq->get_sequence_axis(); + + if (seq_axis < 0 || seq_axis >= static_cast(src_dims.size())) + IE_THROW() << errorPrefix << " has incorrect 'seq_axis' parameters dimensions and axis number!"; + + batch_axis = revSeq->get_batch_axis(); + + if (batch_axis < 0 || batch_axis >= static_cast(src_dims.size())) + IE_THROW() << errorPrefix << " has incorrect 'batch_axis' parameters dimensions and axis number!"; + + if (seq_lengths_dims[0] != dst_dims[batch_axis]) + IE_THROW() << errorPrefix << " has incorrect 'seq_lengths_dims' parameters dimension!"; + + srcStrides.resize(src_dims.size()); + srcStrides[srcStrides.size() - 1] = 1; + for (int i = srcStrides.size() - 2; i >= 0; i--) { + srcStrides[i] = srcStrides[i + 1] * src_dims[i + 1]; + } + + work_amount_dst = srcStrides[0] * src_dims[0]; +} + +void MKLDNNReverseSequenceNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + lengthsPrecision = getOriginalInputPrecisionAtPort(REVERSESEQUENCE_LENGTHS); + if (lengthsPrecision != Precision::I32 && lengthsPrecision != Precision::FP32) + lengthsPrecision = Precision::I32; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, lengthsPrecision}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNReverseSequenceNode::execute(mkldnn::stream strm) { + size_t i; + const float *src_data = reinterpret_cast(getParentEdgeAt(REVERSESEQUENCE_DATA)->getMemoryPtr()->GetPtr()); + float* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + switch (getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getDesc().getPrecision()) { + case Precision::FP32: { + float *seq_lengths_data = reinterpret_cast(getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getMemoryPtr()->GetPtr()); + for (i = 0; i < src_dims[batch_axis]; i++) { + if (static_cast(seq_lengths_data[i]) > static_cast(src_dims[seq_axis])) { + std::string errorMsg = 
"Incorrect input 'seq_lengths' values!"; + IE_THROW() << errorMsg; + } + } + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0, src_idx = 0; + SizeVector counters(src_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = src_dims.size() - 1, i = start; j >= 0; j--) { + counters[j] = i % src_dims[j]; + i /= src_dims[j]; + } + + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < src_dims.size(); ++i) { + size_t idx = counters[i]; + if (static_cast(i) == seq_axis && + static_cast(idx) < static_cast(seq_lengths_data[counters[batch_axis]])) { + idx = static_cast(seq_lengths_data[counters[batch_axis]]) - idx - 1; + } + src_idx += idx * srcStrides[i]; + } + dst_data[iwork] = src_data[src_idx]; + for (int j = src_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % src_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + case Precision::I32: { + int32_t *seq_lengths_data = reinterpret_cast(getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getMemoryPtr()->GetPtr()); + for (i = 0; i < src_dims[batch_axis]; i++) { + if (seq_lengths_data[i] > static_cast(src_dims[seq_axis])) { + std::string errorMsg = "Incorrect input 'seq_lengths' values!"; + IE_THROW() << errorMsg; + } + } + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0, src_idx = 0; + SizeVector counters(src_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = src_dims.size() - 1, i = start; j >= 0; j--) { + counters[j] = i % src_dims[j]; + i /= src_dims[j]; + } + + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < src_dims.size(); ++i) { + size_t idx = counters[i]; + if (static_cast(i) == seq_axis && + static_cast(idx) < seq_lengths_data[counters[batch_axis]]) { + idx = seq_lengths_data[counters[batch_axis]] - idx - 1; + } + src_idx += idx * srcStrides[i]; + } + dst_data[iwork] = src_data[src_idx]; + 
for (int j = src_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % src_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + default: + IE_THROW() << "ReverseSequence layer does not support " + << getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getDesc().getPrecision() << " precision"; + } +} + +bool MKLDNNReverseSequenceNode::created() const { + return getType() == ReverseSequence; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNReverseSequenceNode, ReverseSequence) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h new file mode 100644 index 00000000000000..4b3cf056c63afa --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNReverseSequenceNode : public MKLDNNNode { +public: + MKLDNNReverseSequenceNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t REVERSESEQUENCE_DATA = 0; + const size_t REVERSESEQUENCE_LENGTHS = 1; + + int seq_axis; + int batch_axis; + InferenceEngine::SizeVector src_dims; + InferenceEngine::SizeVector srcStrides; + size_t work_amount_dst; + + InferenceEngine::Precision lengthsPrecision; + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp index 
6d4c9a27dc4d8b..53dda785e69115 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp @@ -14,7 +14,7 @@ using namespace InferenceEngine; MKLDNNSoftMaxNode::MKLDNNSoftMaxNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { - auto softmaxOp = ngraph::as_type_ptr(op); + const auto softmaxOp = ngraph::as_type_ptr(op); if (softmaxOp) { axis = softmaxOp->get_axis(); } else { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp new file mode 100644 index 00000000000000..1c78c44b48df5a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp @@ -0,0 +1,478 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_topk_node.h" +#include "utils/general_utils.h" + +#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) +#include +#endif + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNTopKNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto topKOp = ngraph::as_type_ptr(op); + if (!topKOp) { + errorMessage = "Node is not an instance of the TopK from the operations set v1 or v3"; + return false; + } + if (topKOp->get_mode() != ngraph::op::TopKMode::MAX && + topKOp->get_mode() != ngraph::op::TopKMode::MIN) { + errorMessage = "Unsupported mode."; + return false; + } + if (!MKLDNNPlugin::one_of(topKOp->get_sort_type(), ngraph::op::TopKSortType::NONE, + ngraph::op::TopKSortType::SORT_VALUES, + ngraph::op::TopKSortType::SORT_INDICES)) { + errorMessage = "Unsupported sort type."; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNTopKNode::MKLDNNTopKNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + auto topK1Op = ngraph::as_type_ptr(op); + + SizeVector dstDims = topK1Op->get_output_shape(TOPK_VALUE); + src_dims = topK1Op->get_input_shape(TOPK_DATA); + + axis = topK1Op->get_axis(); + + if (topK1Op->get_mode() == ngraph::op::TopKMode::MAX) + mode_max = true; + else + mode_max = false; + + if (topK1Op->get_sort_type() == ngraph::op::TopKSortType::SORT_VALUES) + sort_value = true; + else + sort_value = false; + + int j; + for (j = src_dims.size() - 1; j >= 0; j--) { + if (src_dims[j] != 1) break; + } + if (static_cast(j) == axis) is_last_dim = true; + + for (size_t i = 0; i < axis; i++) { + axis_step *= src_dims[i]; + } + axis_dim = src_dims[axis]; + for (size_t i = (axis + 1); i < src_dims.size(); i++) { + axis_stride *= src_dims[i]; + } + dim = static_cast(src_dims[axis]); + before_num = count(src_dims, 0, axis); +} + +void MKLDNNTopKNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector outDataConf; + outDataConf.reserve(getOriginalOutputsNumber()); + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + for (int i = 1; i < getOriginalOutputsNumber(); ++i) + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}}, + outDataConf, + impl_desc_type::ref_any); +} + +void MKLDNNTopKNode::execute(mkldnn::stream strm) { + const float *src = reinterpret_cast(getParentEdgeAt(TOPK_DATA)->getMemoryPtr()->GetPtr()); + src_k = reinterpret_cast(getParentEdgeAt(TOPK_K)->getMemoryPtr()->GetPtr())[0]; + float* dst_data = nullptr; + int* 
dst_idx = nullptr; + + if (outDims.size() == 1) { + if (getOriginalOutputPrecisionAtPort(0) == Precision::FP32) { + dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + } else { + dst_idx = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + } + SizeVector dstDims = getChildEdgesAtPort(0)[0]->getDims().ToSizeVector(); + + if (dstDims[axis] != static_cast(src_k)) { + std::string errorMsg = "Output tensor dimension mismatch"; + IE_THROW() << errorMsg; + } + } else if (outDims.size() == 2) { + dst_data = reinterpret_cast(getChildEdgesAtPort(TOPK_VALUE)[0]->getMemoryPtr()->GetPtr()); + SizeVector dst_data_dims = getChildEdgesAtPort(TOPK_VALUE)[0]->getDims().ToSizeVector(); + + dst_idx = reinterpret_cast(getChildEdgesAtPort(TOPK_INDEX)[0]->getMemoryPtr()->GetPtr()); + SizeVector dst_idx_dims = getChildEdgesAtPort(TOPK_INDEX)[0]->getDims().ToSizeVector(); + + if (dst_idx_dims[axis] != static_cast(src_k) || dst_data_dims[axis] != static_cast(src_k)) { + std::string errorMsg = "Output tensors dimension mismatch"; + IE_THROW() << errorMsg; + } + } else { + std::string errorMsg = "Output tensors amount mismatch"; + IE_THROW() << errorMsg; + } + + if (src_dims[axis] < static_cast(src_k)) + src_k = src_dims[axis]; + + SizeVector in_dims = getParentEdgeAt(TOPK_DATA)->getDims().ToSizeVector(); + + if (src_k == 1) { + if (is_last_dim) { + if (mode_max) + top1(src, dst_data, dst_idx, in_dims); + else + top1(src, dst_data, dst_idx, in_dims); + } else { + if (mode_max) + top1_axis(src, dst_data, dst_idx, in_dims); + else + top1_axis(src, dst_data, dst_idx, in_dims); + } + } else { + if (is_last_dim) { + if (mode_max) + topk(src, dst_data, dst_idx, in_dims); + else + topk(src, dst_data, dst_idx, in_dims); + } else { + if (mode_max) + topk_axis(src, dst_data, dst_idx, in_dims); + else + topk_axis(src, dst_data, dst_idx, in_dims); + } + } +} + +bool MKLDNNTopKNode::created() const { + return getType() == TopK; +} + +template class 
Compare2> +void MKLDNNTopKNode::top1_axis(const float* src_data, float* dst_data, int* dst_idx, SizeVector in_dims) { + int after_num = count(in_dims, axis + 1, in_dims.size()); + int first_index = 0; + +#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + parallel_for2d(before_num, after_num / block_size, [&](int i0, int ib1) { + int s_index = i0 * dim * after_num + ib1 * block_size; + vec_type_f vmax_val = _mm_uni_loadu_ps(src_data + s_index); + vec_type_i vindex_max_val = _mm_uni_setzero_si(); + for (int i2 = 1; i2 < dim; i2++) { + s_index += after_num; + vec_type_f vsrc = _mm_uni_loadu_ps(src_data + s_index); + vmask_type vmask = Compare1::cmp_ps(vsrc, vmax_val); + vmax_val = _mm_uni_blendv_ps(vmax_val, vsrc, vmask); + + vec_type_i vindex_cur_val = _mm_uni_set1_epi32(i2); +#if defined(HAVE_AVX512F) + vindex_max_val = _mm512_mask_blend_epi32(vmask, vindex_max_val, vindex_cur_val); +#else + vindex_max_val = _mm_uni_blendv_epi8(vindex_max_val, vindex_cur_val, _mm_uni_castps_si(vmask)); +#endif + } + if (dst_data) + _mm_uni_storeu_ps(dst_data + i0 * after_num + ib1 * block_size, vmax_val); + if (dst_idx) + _mm_uni_storeu_si(reinterpret_cast(dst_idx + i0 * after_num + ib1 * block_size), vindex_max_val); + }); + first_index = after_num / block_size * block_size; +#endif + int rest = after_num - first_index; + parallel_for2d(before_num, rest, [&](int i0, int i1) { + int index_max_val = 0; + int s_index = i0 * dim * after_num + first_index + i1; + float max_val = src_data[s_index]; + for (int i2 = 1; i2 < dim; i2++) { + s_index += after_num; + if (Compare2()(src_data[s_index], max_val)) { + max_val = src_data[s_index]; + index_max_val = i2; + } + } + if (dst_data) + dst_data[i0 * after_num + first_index + i1] = max_val; + if (dst_idx) + dst_idx[i0 * after_num + first_index + i1] = index_max_val; + }); +} + +template