From 7fb9bac24a20baaf9bee13a37da0f71547010069 Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Mon, 31 May 2021 18:49:57 +0300 Subject: [PATCH] [CPU] Extend Concat node logic to avoid fallback on slow ref implementation. (#4129) --- .../mkldnn_plugin/mkldnn_extension_utils.cpp | 10 + .../mkldnn_plugin/mkldnn_extension_utils.h | 3 + .../nodes/mkldnn_concat_node.cpp | 476 ++++++------------ .../mkldnn_plugin/nodes/mkldnn_concat_node.h | 2 + .../nodes/mkldnn_shuffle_channels_node.cpp | 12 +- .../plugin/cpu/single_layer_tests/concat.cpp | 214 ++++++++ .../src/fuse_transpose_reorder.cpp | 1 + 7 files changed, 395 insertions(+), 323 deletions(-) create mode 100644 inference-engine/tests/functional/plugin/cpu/single_layer_tests/concat.cpp diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp index 4a8eef03944f85..2d7d4e5e6b61e2 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp @@ -134,6 +134,16 @@ PartialBlkDesc PartialBlkDesc::makeCBlocked(const InferenceEngine::SizeVector &d return res; } + +PartialBlkDesc PartialBlkDesc::makeTailC(const InferenceEngine::SizeVector &dims) { + PartialBlkDesc res = makePlain(dims); + if (dims.size() > 2) { + auto itr = res.outer_order.begin() + 1; + std::rotate(itr, itr + 1, res.outer_order.end()); + } + return res; +} + PartialBlkDesc PartialBlkDesc::extractFrom(const InferenceEngine::TensorDesc &desc) { if (desc.getLayout() == InferenceEngine::ANY) IE_THROW() << "Cannot extract partial blocked descriptor for `ANY` layout"; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h index 26fc09c92de36b..95e14a7afa2cb3 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h @@ -59,6 +59,9 @@ class PartialBlkDesc { /** Construct blocked Channel PartialBlkDesc based on dims information */ static PartialBlkDesc makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size); + /** Construct per Channel PartialBlkDesc based on dims information */ + static PartialBlkDesc makeTailC(const InferenceEngine::SizeVector &dims); + /** Compare operators. Allow to use it as key for std::map */ bool operator == (const PartialBlkDesc& it) const; bool operator < (const PartialBlkDesc& it) const; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp index aa9d7b8fd9850a..ba760cae535806 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp @@ -21,11 +21,15 @@ #include "mkldnn_eltwise_node.h" #include #include "common/cpu_memcpy.h" +#include "common/tensor_desc_creator.h" using namespace mkldnn; using namespace MKLDNNPlugin; using namespace InferenceEngine; +namespace { + constexpr size_t channelAxis = 1lu; +} bool MKLDNNConcatNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { @@ -89,308 +93,120 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() { } } - // MKLDNN doesn't support different precision on inputs so fallback on FP32 in such case + // Concat doesn't support different precision on inputs so fallback on FP32 in such case if (isMixedPrecision) inputPrecision = Precision::FP32; - // Concat node supports int8 implementations only for NHWC and NDHWC layouts - if (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) { - int ndims = getChildEdgeAt(0)->getDims().ndims(); - if (ndims != 2 && ndims != 4 && ndims != 5) - inputPrecision = Precision::FP32; - } - - // MKLDNN supports only equal precisions for inputs and output + // Concat supports only equal precisions for inputs and output outputPrecision = inputPrecision; - auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision); - auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision); - - MKLDNNDims dstDims = getChildEdgeAt(0)->getDims(); - InferenceEngine::LayerConfig config; - config.dynBatchSupport = true; - - for (size_t i = 0; i < getParentEdges().size(); i++) { - auto parentEdge = getParentEdgeAt(i); - - InferenceEngine::DataConfig dataConfig; - dataConfig.inPlace = -1; - dataConfig.constant = false; - auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? parentEdge->getDims().ndims() == 2 ? memory::format_tag::nc : - parentEdge->getDims().ndims() == 4 ? memory::format_tag::nhwc : - memory::format_tag::ndhwc - : memory::format_tag::any; + auto& dstDims = getChildEdgeAt(0)->getDims(); + std::vector tdCreatorTypes = {TensorDescCreatorTypes::ncsp, TensorDescCreatorTypes::nspc}; - dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(parentEdge->getDims(), inputDataType, fmt)); - config.inConfs.push_back(dataConfig); - } - - auto dims = getChildEdgeAt(0)->getDims(); + // check if blocked layouts are available the channels size should be evenly divided by the block size to avoid slow oneDNN ref implementation + if (dstDims.ndims() > channelAxis) { + for (auto item : { std::make_pair(8lu, TensorDescCreatorTypes::nCsp8c), std::make_pair(16lu, TensorDescCreatorTypes::nCsp16c)}) { + SizeVector blkDims = dstDims.ToSizeVector(); + if (blkDims[channelAxis] % item.first) + continue; - config.outConfs.resize(1); - config.outConfs[0].inPlace = -1; - config.outConfs[0].constant = false; - if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1) { - auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format_tag::nc : - dims.ndims() == 4 ? memory::format_tag::nhwc : - memory::format_tag::ndhwc - : MKLDNNMemory::GetPlainFormat(dims); - - config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, fmt)); - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, fmt); - - if (inputPrecision != Precision::U8 && inputPrecision != Precision::I8) { - if (dims.ndims() == 4) { - if (dims[1] % 8 == 0) { - config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc( - MKLDNNMemoryDesc(dims, outputDataType, memory::format_tag::nChw8c)); - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, memory::format_tag::nChw8c); - - if (dims[1] % 16 == 0) { - config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc( - MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nChw16c)); - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nChw16c); - } - } - } else if (dims.ndims() == 5) { - if (dims[1] % 8 == 0) { - config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc( - MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw8c)); - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw8c); - - if (dims[1] % 16 == 0) { - config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc( - MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw16c)); - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw16c); - } + bool blocked = true; + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto& srcDims = getParentEdgeAt(i)->getDims(); + if (srcDims[channelAxis] % item.first) { + blocked = false; + break; } } + if (blocked) { + tdCreatorTypes.push_back(item.second); + } } } - if (axis != 1) - return; - - auto numOfDim = static_cast(dstDims.ndims()); - - SizeVector order(numOfDim); - SizeVector offsets(numOfDim, 0lu); - size_t offset = (std::numeric_limits::max)(); - for (size_t i = 0; i < numOfDim; i++) { - order[i] = i; - } - - if (outputPrecision == Precision::I8 || outputPrecision == Precision::U8) { - if (numOfDim == 4) { - // Here we assume NHWC layout (channels are the last) - - order = {0, 2, 3, 1}; - offsets = {0, 0, 0, 0}; - - SizeVector blkDims = dstDims.ToSizeVector(); - blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] }; - - SizeVector strides(numOfDim); - strides.resize(numOfDim); - // C is the last in NHWC, so all strides are max() - for (size_t i = 0; i < numOfDim; i++) { - strides[i] = (std::numeric_limits::max)(); - } - - config.outConfs[0].desc = TensorDesc(outputPrecision, - dstDims.ToSizeVector(), - { blkDims, order, offset, offsets, strides }); - for (size_t i = 0; i < getParentEdges().size(); i++) { - auto parentEdge = getParentEdgeAt(i); - - SizeVector blkDims = parentEdge->getDims().ToSizeVector(); - blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] }; + std::vector pdIndexesToReuse; - config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NHWC in mkldnn + auto& creatorsMap = TensorDescCreator::getCommonCreators(); + auto itrRange = TensorDescCreator::makeFilteredRange(creatorsMap, static_cast(dstDims.ndims()), tdCreatorTypes); + for (auto itr = itrRange.first; itr != itrRange.second; ++itr) { + InferenceEngine::LayerConfig config; - config.inConfs[i].desc = TensorDesc(inputPrecision, parentEdge->getDims().ToSizeVector(), - {blkDims, order, offset, offsets, strides}); - } + config.dynBatchSupport = true; + config.outConfs.resize(1); + config.outConfs[0].inPlace = -1; + config.outConfs[0].constant = false; + config.outConfs[0].desc = itr->second->createDesc(outputPrecision, dstDims.ToSizeVector()); + memory::format_tag outFmt = MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat(); - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nhwc); + config.inConfs.resize(getParentEdges().size()); - return; - } else if (numOfDim == 5) { - // Here we assume NDHWC layout (channels are the last) + for (size_t i = 0; i < getParentEdges().size(); ++i) { + config.inConfs[i].inPlace = -1; + config.inConfs[i].constant = false; + config.inConfs[i].desc = MKLDNNExtensionUtils::getUninitTensorDesc( + itr->second->createDesc(inputPrecision, getParentEdgeAt(i)->getDims().ToSizeVector())); + } + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFmt); + if (itr->first != TensorDescCreatorTypes::nspc) { + pdIndexesToReuse.push_back(supportedPrimitiveDescriptors.size() - 1); + } + } - order = {0, 2, 3, 4, 1}; - offsets = {0, 0, 0, 0, 0}; + if (axis != channelAxis) + return; - SizeVector blkDims = dstDims.ToSizeVector(); - blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[4], blkDims[1] }; + // Optimized inplace case - SizeVector strides(numOfDim); - strides.resize(numOfDim); - // C is the last in NDHWC, so all strides are max() - for (size_t i = 0; i < numOfDim; i++) { - strides[i] = (std::numeric_limits::max)(); - } + for (auto refPdIndex : pdIndexesToReuse) { + const auto& refConfig = supportedPrimitiveDescriptors[refPdIndex].getConfig(); + auto config = refConfig; - config.outConfs[0].desc = TensorDesc(outputPrecision, - dstDims.ToSizeVector(), - { blkDims, order, offset, offsets, strides }); - for (size_t i = 0; i < getParentEdges().size(); i++) { - auto parentEdge = getParentEdgeAt(i); - - SizeVector blkDims = parentEdge->getDims().ToSizeVector(); - blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[4], blkDims[1] }; + const auto& order = refConfig.outConfs[0].desc.getBlockingDesc().getOrder(); + const auto& blkDims = refConfig.outConfs[0].desc.getBlockingDesc().getBlockDims(); + auto numOfDim = blkDims.size(); - config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NDHWC in mkldnn + SizeVector offsets(numOfDim, 0lu); + SizeVector strides(numOfDim); + strides.back() = 1lu; + size_t offset = (std::numeric_limits::max)(); - config.inConfs[i].desc = TensorDesc(inputPrecision, parentEdge->getDims().ToSizeVector(), - {blkDims, order, offset, offsets, strides}); + for (size_t i = 2; i <= numOfDim; i++) { + if (numOfDim - i < axis) { + strides[numOfDim - i] = (std::numeric_limits::max)(); + } else { + strides[numOfDim - i] = strides[numOfDim - i + 1] * blkDims[numOfDim - i + 1]; } - - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::ndhwc); - - return; - } - } - - SizeVector strides(numOfDim); - strides[numOfDim - 1] = 1; - for (size_t i = 2; i <= numOfDim; i++) { - if (numOfDim - i < axis) { - strides[numOfDim - i] = (std::numeric_limits::max)(); - } else { - strides[numOfDim - i] = strides[numOfDim - i + 1] * dstDims[numOfDim - i + 1]; } - } - config.outConfs[0].desc = TensorDesc( - MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType), - dstDims.ToSizeVector(), - {dstDims.ToSizeVector(), order, offset, offsets, strides}); - for (size_t i = 0; i < getParentEdges().size(); i++) { - auto parentEdge = getParentEdgeAt(i); - config.inConfs[i].inPlace = 0; - config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(), - {parentEdge->getDims().ToSizeVector(), order, offset, offsets, strides}); - } + config.outConfs[0].desc = TensorDesc(outputPrecision, dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides}); + memory::format_tag outFmt = MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat(); - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, MKLDNNMemory::Convert(config.outConfs[0].desc.getLayout())); + for (size_t i = 0; i < getParentEdges().size(); i++) { + const auto& srcBlkDims = refConfig.inConfs[i].desc.getBlockingDesc().getBlockDims(); + const auto& dims = refConfig.inConfs[i].desc.getDims(); - if (numOfDim == 4lu || numOfDim == 5lu) { - size_t blkDimsLen = numOfDim + 1; - order.resize(blkDimsLen); - for (size_t i = 0; i < numOfDim; i++) { - order[i] = i; - } - order[numOfDim] = 1lu; - offsets = SizeVector(blkDimsLen, 0lu); - - // nChw8c, nChw16c, nCdhw8c, nCdhw16c - for (size_t sizeS : {8lu, 16lu}) { - SizeVector blkDims = dstDims.ToSizeVector(); - if (blkDims[1] % sizeS) - continue; - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu); - blkDims.push_back(sizeS); - - strides.resize(blkDimsLen); - strides[blkDimsLen - 1] = 1; - for (size_t i = 2lu; i <= blkDimsLen; i++) { - if (blkDimsLen - i < axis) { - strides[blkDimsLen - i] = (std::numeric_limits::max)(); - } else { - strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1]; - } - } - config.outConfs[0].desc = TensorDesc( - MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType), - dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides}); - - bool canInplace = true; - for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) { - auto parentEdge = getParentEdgeAt(i); - blkDims = parentEdge->getDims().ToSizeVector(); - if (blkDims[1] % sizeS) - canInplace = false; - - blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu); - blkDims.push_back(sizeS); - config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(), - {blkDims, order, offset, offsets, strides}); - } - if (canInplace) { - auto dstFormat = numOfDim == 4lu ? sizeS == 8lu ? mkldnn::memory::format_tag::nChw8c : mkldnn::memory::format_tag::nChw16c - : sizeS == 8lu ? mkldnn::memory::format_tag::nCdhw8c : mkldnn::memory::format_tag::nCdhw16c; - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, dstFormat); - } + config.inConfs[i].inPlace = 0; + config.inConfs[i].desc = TensorDesc(inputPrecision, dims, {srcBlkDims, order, offset, offsets, strides}); } + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFmt); } } void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() { - bool hasUnknown = false; std::vector canSelectPrimitive; - for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) { - bool hasAny = true; - auto &primDescInfo = supportedPrimitiveDescriptors[i]; - if (primDescInfo.getImplementationType() != impl_desc_type::unknown || - primDescInfo.getConfig().inConfs[0].inPlace < 0) - continue; - hasUnknown = true; - for (auto iInfo : primDescInfo.getConfig().inConfs) { - if (iInfo.desc.getLayout() != InferenceEngine::Layout::ANY) { - hasAny = false; - break; - } - } - if (hasAny) { - for (auto oInfo : primDescInfo.getConfig().outConfs) { - if (oInfo.desc.getLayout() != InferenceEngine::Layout::ANY) { - hasAny = false; - break; - } - } - } - - if (!hasAny) { - canSelectPrimitive.push_back(i); - } - } + bool canOptimize = true; - bool hasDoubleConnection = false; + // The double connection marks that some tensor should + // be replicated. Inplace approach is not applicable + // for that case. for (int i = 0; i < getParentEdges().size(); i++) { for (int j = i + 1; j < getParentEdges().size(); j++) { - if (getParentEdgeAt(i) == getParentEdgeAt(j)) hasDoubleConnection = true; + if (getParentEdgeAt(i) == getParentEdgeAt(j)) canOptimize = false; } } - if (hasDoubleConnection) { - // The double connection marks that some tensor should - // be replicated. Inplace approach is not applicable - // for that case. Descriptor with index 0 is pure copy - // implementation - selectPrimitiveDescriptorByIndex(0); - return; - } - - bool canOptimize = true; - for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) { - const auto& parent = getParentEdgeAt(i)->getParent(); - for (size_t j = 0; canOptimize && j < parent->getChildEdges().size(); j++) { - const auto& child = parent->getChildEdgeAt(j)->getChild(); - const auto* childConcat = dynamic_cast(child.get()); - if (!childConcat || childConcat == this) - continue; - if (childConcat->isOptimized()) - canOptimize = false; - } - } - if (hasUnknown && axis == 1) { - if (canSelectPrimitive.size() == 1) { - selectPrimitiveDescriptorByIndex(static_cast(canSelectPrimitive[0])); - return; - } - } else { + if (axis != channelAxis) { canOptimize = false; } @@ -432,44 +248,57 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() { } size_t maxCount = 0; - auto convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector()); + auto outDims = getChildEdgeAt(0)->getDims().ToSizeVector(); + auto convertTo = PartialBlkDesc::makePlain(outDims); for (auto &it : formatFrequency) { if (it.second > maxCount) { maxCount = it.second; convertTo = it.first; + } else if (it.second == maxCount) { + if (isInQuantizedGraph && it.first == PartialBlkDesc::makeTailC(outDims)) { + convertTo = it.first; + } else if (it.first == PartialBlkDesc::makeCBlocked(outDims, 8) || it.first == PartialBlkDesc::makeCBlocked(outDims, 16)) { + convertTo = it.first; + } } } - if (canOptimize && convertTo.isAutoExtendedWith(getChildEdgeAt(0)->getDims().ToSizeVector())) - convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector()); - for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) { + if (convertTo.isAutoExtendedWith(outDims)) + convertTo = PartialBlkDesc::makePlain(outDims); + for (size_t i = 0; i < getParentEdges().size(); i++) { if (convertTo.isAutoExtendedWith(getParentEdgeAt(i)->getDims().ToSizeVector())) - convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector()); + convertTo = PartialBlkDesc::makePlain(outDims); } - for (auto supportedPdIndex : canSelectPrimitive) { - if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc) == convertTo) { - selectPrimitiveDescriptorByIndex(static_cast(supportedPdIndex)); + for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) { + if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc) == convertTo) { + if (IMPLICATION(supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown, canOptimize)) { + canSelectPrimitive.push_back(i); + } + } + } + + if (canSelectPrimitive.size() == 1) { + selectPrimitiveDescriptorByIndex(static_cast(canSelectPrimitive[0])); + return; + } + + // if there are more then one PD with similar data layouts - select the optimized one + for (auto indx : canSelectPrimitive) { + if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::unknown) { + selectPrimitiveDescriptorByIndex(static_cast(indx)); return; } } + // if there are no matching data layouts, select first optimized implementation for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) { - auto &primDescInfo = supportedPrimitiveDescriptors[i]; - if (primDescInfo.getImplementationType() == impl_desc_type::unknown) - continue; - if (convertTo == PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc)) { - size_t num = 0; - for (num = 0; num < getParentEdges().size(); num++) { - if (convertTo.isAutoExtendedWith(getParentEdgeAt(num)->getDims().ToSizeVector())) - break; - } - if (num == getParentEdges().size()) { - selectPrimitiveDescriptorByIndex(i); - return; - } + if (canOptimize && supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown) { + selectPrimitiveDescriptorByIndex(static_cast(i)); + return; } } + selectPrimitiveDescriptorByIndex(0); } @@ -491,6 +320,12 @@ void MKLDNNConcatNode::createPrimitive() { if (getSelectedPrimitiveDescriptor() == nullptr) IE_THROW() << "Preferable primitive descriptor is not set."; + //check if selected Tensor descriptor has nspc layout and concat axis is C + if (axis == channelAxis && getChildEdgeAt(0)->getMemory().GetDesc().isTailCFormat()) { + canOptimizeNspc = true; + return; + } + std::vector srcs_d; for (size_t i = 0; i < getParentEdges().size(); i++) { @@ -540,7 +375,7 @@ void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() { if (!isInitConfig(config)) { for (size_t i = 0; i < config.inConfs.size(); i++) { config.inConfs[i].desc = getConfiguredInputDesc(config, i); - // MKLDNN doesn't support different precision on inputs + // Concat doesn't support different precision on inputs config.inConfs[i].desc.setPrecision(inputPrecision); } @@ -560,8 +395,7 @@ void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() { return; for (size_t i = 0; i < config.outConfs.size(); i++) { - if (config.outConfs[i].desc.getLayout() == InferenceEngine::Layout::ANY || - !isUninitTensorDesc(config.outConfs[i].desc)) + if (!isUninitTensorDesc(config.outConfs[i].desc)) continue; int num = getChildEdgeAt(i)->getOutputNum(); @@ -621,49 +455,53 @@ void MKLDNNConcatNode::execute(mkldnn::stream strm) { return; } + if (canOptimizeNspc) { + execNspcSpecCase(); + return; + } + const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory(); - const mkldnn::memory::data_type data_type = dst_memory.GetDataType(); const size_t num_src = getParentEdges().size(); + std::unordered_map mem_ags {{DNNL_ARG_DST, dst_memory.GetPrimitive()}}; + for (int i = 0; i < num_src; i++) + mem_ags[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive(); - const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8); - - if (isInt8) { - uint8_t* dst_ptr = reinterpret_cast(dst_memory.GetData()); - - std::vector channels; - size_t channels_size = 0; - std::vector src_ptrs; - std::vector dst_ptrs; + (*prim).execute(strm, mem_ags); +} - for (size_t i = 0; i < num_src; i++) { - const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory(); - const size_t num_channels = src_mem.GetDims()[1]; +InferenceEngine::Precision MKLDNNConcatNode::getRuntimePrecision() const { + return MKLDNNExtensionUtils::getMaxPrecision(getInputPrecisions()); +} - channels.push_back(num_channels); - src_ptrs.push_back(reinterpret_cast(src_mem.GetData())); - dst_ptrs.push_back(dst_ptr + channels_size); - channels_size += num_channels; - } +void MKLDNNConcatNode::execNspcSpecCase() { + const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory(); + const size_t num_src = getParentEdges().size(); + uint8_t* dst_ptr = reinterpret_cast(dst_memory.GetData()); + const size_t dataSize = MKLDNNExtensionUtils::sizeOfDataType(dst_memory.GetDataType()); - const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channels[0]; + std::vector channelsDataSize; + size_t channels_size = 0; + std::vector src_ptrs; + std::vector dst_ptrs; - parallel_for(iter_count, [&](int i) { - const size_t dst_off = i * channels_size; - for (int j = 0; j < num_src; j++) { - cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channels[j], channels[j]); - } - }); - } else { - std::unordered_map mem_ags {{DNNL_ARG_DST, dst_memory.GetPrimitive()}}; - for (int i = 0; i < num_src; i++) - mem_ags[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive(); + for (size_t i = 0; i < num_src; i++) { + const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory(); + const size_t num_channels = src_mem.GetDims()[channelAxis]; - (*prim).execute(strm, mem_ags); + channelsDataSize.push_back(num_channels * dataSize); + src_ptrs.push_back(reinterpret_cast(src_mem.GetData())); + dst_ptrs.push_back(dst_ptr + channels_size); + channels_size += num_channels * dataSize; } -} -InferenceEngine::Precision MKLDNNConcatNode::getRuntimePrecision() const { - return MKLDNNExtensionUtils::getMaxPrecision(getInputPrecisions()); + const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channelsDataSize[0]; + + parallel_for(iter_count, [&](int i) { + const size_t dst_off = i * channels_size; + for (int j = 0; j < num_src; j++) { + cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channelsDataSize[j], channelsDataSize[j]); + } + }); } REG_MKLDNN_PRIM_FOR(MKLDNNConcatNode, Concatenation); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h index 234eeb4e5314fd..f29c1feca90c15 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h @@ -30,8 +30,10 @@ class MKLDNNConcatNode : public MKLDNNNode { private: size_t axis = 0; + bool canOptimizeNspc = false; size_t inverseOrder(const InferenceEngine::SizeVector& order, size_t axis); + void execNspcSpecCase(); InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::FP32; InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shuffle_channels_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shuffle_channels_node.cpp index 10d59bf09776b9..95b00af386be31 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shuffle_channels_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_shuffle_channels_node.cpp @@ -94,11 +94,15 @@ void MKLDNNShuffleChannelsNode::initSupportedPrimitiveDescriptors() { impl_type = impl_desc_type::ref; } - addSupportedPrimDesc({{TensorDescCreatorTypes::nspc, precision}}, - {{TensorDescCreatorTypes::nspc, precision}}, + // use ncsp as default for non-quantized networks and nspc for quantized + auto firstCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::nspc : TensorDescCreatorTypes::ncsp; + auto secondCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::ncsp : TensorDescCreatorTypes::nspc; + + addSupportedPrimDesc({{firstCreatorType, precision}}, + {{firstCreatorType, precision}}, impl_type, supportDynamicBatch_); - addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}}, - {{TensorDescCreatorTypes::ncsp, precision}}, + addSupportedPrimDesc({{secondCreatorType, precision}}, + {{secondCreatorType, precision}}, impl_type, supportDynamicBatch_); // canUseBlocked if (axis_ != 1) { diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/concat.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/concat.cpp new file mode 100644 index 00000000000000..32b7b6d60910d0 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/concat.cpp @@ -0,0 +1,214 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + size_t, // Concat axis + std::vector>, // Input shapes + InferenceEngine::Precision, // Network precision + std::string, // Device name + CPUSpecificParams +> concatCPUTestParams; + +class ConcatLayerCPUTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + int axis; + std::vector> inputShapes; + InferenceEngine::Precision netPrecision; + std::string targetName; + CPUSpecificParams cpuParams; + std::tie(axis, inputShapes, netPrecision, targetName, cpuParams) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_"; + result << "axis=" << axis << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "trgDev=" << targetName << "_"; + result << CPUTestsBase::getTestCaseName(cpuParams); + return result.str(); + } +protected: + void SetUp() override { + int axis; + std::vector> inputShape; + InferenceEngine::Precision netPrecision; + CPUSpecificParams cpuParams; + std::tie(axis, inputShape, netPrecision, targetDevice, cpuParams) = this->GetParam(); + inPrc = outPrc = netPrecision; + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + selectedType += std::string("_") + inPrc.name(); + + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, inputShape); + auto paramOuts = ngraph::helpers::convert2OutputVector( + ngraph::helpers::castOps2Nodes(params)); + auto concat = std::make_shared(paramOuts, axis); + + function = makeNgraphFunction(ngPrc, params, concat, "concat"); + } +}; + +TEST_P(ConcatLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckPluginRelatedResults(executableNetwork, "Concatenation"); +} + +namespace { +const auto planar_4D_ref = CPUSpecificParams{{nchw}, {nchw}, {"ref"}, "ref"}; +const auto planar_5D_ref = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref"}, "ref"}; + +const auto planar_4D = CPUSpecificParams{{nchw}, {nchw}, {}, "unknown"}; +const auto planar_5D = CPUSpecificParams{{ncdhw}, {ncdhw}, {}, "unknown"}; + +const auto planarChannels_4D = CPUSpecificParams{{nhwc}, {nhwc}, {}, "ref"}; +const auto planarChannels_5D = CPUSpecificParams{{ndhwc}, {ndhwc}, {}, "ref"}; + +const auto blocked8_4D = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "unknown"}; +const auto blocked8_5D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "unknown"}; + +const auto blocked8_4D_ref = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "ref"}; +const auto blocked8_5D_ref = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "ref"}; + +const auto blocked16_4D = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "unknown"}; +const auto blocked16_5D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "unknown"}; + +const auto blocked16_4D_ref = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "ref"}; +const auto blocked16_5D_ref = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "ref"}; + +// List of precisions natively supported by mkldnn. +const std::vector netPrecisions = { + Precision::I8, + Precision::I32, + Precision::FP32, + Precision::BF16 +}; + +INSTANTIATE_TEST_CASE_P(concat_Concat4D_CPU_Block8inPlace, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(1), + ::testing::Values(std::vector>{{1, 8, 3, 5}, + {1, 16, 3, 5}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(planar_4D, planarChannels_4D, blocked8_4D)), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block8, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(0, 2, 3), + ::testing::Values(std::vector>{{2, 16, 3, 5}, + {2, 16, 3, 5}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(planar_4D_ref, planarChannels_4D, blocked8_4D_ref)), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16inPlace, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(1), + ::testing::Values(std::vector>{{2, 16, 3, 5}, + {2, 32, 3, 5}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(blocked16_4D)), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(0, 2, 3), + ::testing::Values(std::vector>{{2, 32, 3, 5}, + {2, 32, 3, 5}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(blocked16_4D_ref)), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(concat_Concat5D_CPU_Block8inPlace, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(1), + ::testing::Values(std::vector>{{1, 8, 3, 5, 7}, + {1, 16, 3, 5, 7}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(planar_5D, planarChannels_5D, blocked8_5D)), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block8, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(0, 2, 3, 4), + ::testing::Values(std::vector>{{2, 16, 3, 5, 7}, + {2, 16, 3, 5, 7}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(planar_5D_ref, planarChannels_5D, blocked8_5D_ref)), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16inPlace, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(1), + ::testing::Values(std::vector>{{2, 16, 3, 5, 7}, + {2, 32, 3, 5, 7}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(blocked16_5D)), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(0, 2, 3, 4), + ::testing::Values(std::vector>{{2, 32, 3, 5, 7}, + {2, 32, 3, 5, 7}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(blocked16_5D_ref)), + ConcatLayerCPUTest::getTestCaseName); + + +INSTANTIATE_TEST_CASE_P(smoke_Concat_inPlace, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(1), + ::testing::Values(std::vector>{{2, 3, 5}, + {2, 4, 5}}, + std::vector>{{2, 3}, + {2, 4}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_Concat3D, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(0, 2), + ::testing::Values(std::vector>{{2, 4, 5}, + {2, 4, 5}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + ConcatLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_Concat_1D_2D, ConcatLayerCPUTest, + ::testing::Combine( + ::testing::Values(0), + ::testing::Values(std::vector>{{2, 4}, + {3, 4}}, + std::vector>{{2}, {3}}), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + ConcatLayerCPUTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_transpose_reorder.cpp b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_transpose_reorder.cpp index 6cefb1b5be81c9..839378de0dccb6 100644 --- a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_transpose_reorder.cpp +++ b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_transpose_reorder.cpp @@ -222,6 +222,7 @@ void FuseTransposeAndReorderTest2::CreateGraph() { transpose2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {}); auto concat = ngraph::builder::makeConcat({transpose1, transpose2}, 1); + concat->get_rt_info() = makeCPUInfo({memFmt1, memFmt1}, {memFmt1}, {}); ngraph::ResultVector results{std::make_shared(concat)}; function = std::make_shared(results, params, "Transpose_Transpose_Concat");