diff --git a/inference-engine/src/legacy_api/src/ie_cnn_layer_builder_ngraph.cpp b/inference-engine/src/legacy_api/src/ie_cnn_layer_builder_ngraph.cpp index fb97e194024514..014dff3720f046 100644 --- a/inference-engine/src/legacy_api/src/ie_cnn_layer_builder_ngraph.cpp +++ b/inference-engine/src/legacy_api/src/ie_cnn_layer_builder_ngraph.cpp @@ -353,6 +353,9 @@ CNNLayer::Ptr NodeConverter::createLayer(const std::shared_ case Precision::FP16: precision_str = "FP16"; break; + case Precision::BF16: + precision_str = "BF16"; + break; case Precision::FP32: precision_str = "FP32"; break; diff --git a/inference-engine/src/legacy_api/src/ngraph_ops/interp.cpp b/inference-engine/src/legacy_api/src/ngraph_ops/interp.cpp old mode 100644 new mode 100755 diff --git a/inference-engine/src/mkldnn_plugin/bf16transformer.cpp b/inference-engine/src/mkldnn_plugin/bf16transformer.cpp index 0d8ef1d76ef953..0ddaf3fdbd0f9e 100644 --- a/inference-engine/src/mkldnn_plugin/bf16transformer.cpp +++ b/inference-engine/src/mkldnn_plugin/bf16transformer.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "ngraph/type/bfloat16.hpp" using namespace MKLDNNPlugin; @@ -23,7 +24,7 @@ void precisionColoringBF16(const CNNLayerPtr layer, if (layer && !layer->insData.empty() && layer->input()) { printed_properties.insert(printed_properties.begin(), std::pair("Precision", - layer->input()->getPrecision() == Precision::FP32 ? "FP32" : "BF16")); + layer->input()->getPrecision() == Precision::FP32 ? "FP32" : "BF16")); if (layer->input()->getPrecision() == Precision::FP32) { node_properties.emplace_back("fillcolor", "#5A5DF0"); @@ -55,20 +56,31 @@ void BF16Transformer::convertToBFloat16(InferenceEngine::CNNNetwork &network) { InputsDataMap inputs = network.getInputsInfo(); OutputsDataMap outputs = network.getOutputsInfo(); for (auto iter : sortedLayers) { + if (CaselessEq()(iter->type, "convolution")) { + auto dims = iter->insData[0].lock()->getDims(); + if ((dims.size() == 4 || dims.size() == 5) && (dims[1] == 1 || dims[1] == 3)) + continue; + } + // check, if memory output node needs to be transformed if (iter->type == "Memory" && iter->outData.size() == 0 && iter->insData[0].lock()->getPrecision() == Precision::FP32) { - auto curPrec = iter->insData[0].lock()->getPrecision(); iter->insData[0].lock()->setPrecision(Precision::BF16); } + for (size_t o = 0; o < iter->outData.size(); o++) { if (inputs.find(iter->outData[o]->getName()) == inputs.end() && outputs.find(iter->outData[o]->getName()) == outputs.end() + && !CaselessEq()(iter->type, "const") && iter->outData[o]->getPrecision() == Precision::FP32) { iter->outData[o]->setPrecision(Precision::BF16); } } } + + // insert convert after input if necessary + insertConvertAfterInput(network); + // convert all edges back to FP32 on demand optimizeToFloat(network); } @@ -255,3 +267,120 @@ InferenceEngine::MemoryBlob::Ptr BF16Transformer::convertBF16ToFloat(InferenceEn } return weightsFP32; } +void BF16Transformer::addLayerToCNNNetworkAfterData( + DataPtr parentOutData, + CNNLayer::Ptr layer, + const std::string& nextLayerName, + ICNNNetwork& net, + const int childInsDataIndex) { + CNNNetworkImpl* netImpl = dynamic_cast(&net); + if (netImpl == nullptr) { + THROW_IE_EXCEPTION << "unexpected network type"; + } + + CNNLayerPtr nextLayer; + if (!nextLayerName.empty()) { + netImpl->getLayerByName(nextLayerName.c_str(), nextLayer, nullptr); + } + + if (layer && (nextLayerName.empty() || (parentOutData == nullptr) || (childInsDataIndex != -1) || + 
(getInputTo(parentOutData).find(nextLayerName) != getInputTo(parentOutData).end()))) { + auto getTensorDesc = [](CNNLayerPtr& nextLayer) { + const DataPtr insData = nextLayer->insData[0].lock(); + return insData->getTensorDesc(); + }; + + const TensorDesc& parentTensorDesc = parentOutData != nullptr ? parentOutData->getTensorDesc() : getTensorDesc(nextLayer); + DataPtr newEdgeAfterLayer(new Data(layer->name, parentTensorDesc)); + newEdgeAfterLayer->setName(layer->name); + getCreatorLayer(newEdgeAfterLayer) = layer; + getInputTo(newEdgeAfterLayer).clear(); + + + if (netImpl == nullptr) { + THROW_IE_EXCEPTION << "unexpected network type"; + } + netImpl->addData(layer->name.c_str(), newEdgeAfterLayer); + IE_SUPPRESS_DEPRECATED_START + netImpl->addLayer(layer); + IE_SUPPRESS_DEPRECATED_END + + if (parentOutData != nullptr) { + getInputTo(parentOutData)[layer->name] = layer; + layer->insData.push_back(parentOutData); + } + layer->outData.push_back(newEdgeAfterLayer); + + if (!nextLayerName.empty()) { + // CNNLayerPtr nextLayer = getInputTo(parentOutData)[nextLayerName]; + getInputTo(newEdgeAfterLayer)[nextLayerName] = nextLayer; + + if (parentOutData != nullptr) { + getInputTo(parentOutData).erase(nextLayerName); + + if (childInsDataIndex == -1) { + for (size_t i = 0; i < nextLayer->insData.size(); i++) { + if (nextLayer->insData[i].lock() == parentOutData) { + nextLayer->insData[i] = newEdgeAfterLayer; + } + } + } else { + nextLayer->insData[childInsDataIndex] = newEdgeAfterLayer; + } + } else { + nextLayer->insData.push_back(newEdgeAfterLayer); + } + } else { + CNNLayerPtr parent = getCreatorLayer(parentOutData).lock(); + if (parent == nullptr) { + THROW_IE_EXCEPTION << "parent data is absent"; + } + netImpl->removeOutput(parent->name); + netImpl->addData(layer->name.c_str(), newEdgeAfterLayer); + netImpl->addOutput(layer->name); + } + } else { + THROW_IE_EXCEPTION << "Invalid argument"; + } +} + +void BF16Transformer::insertConvertAfterInput(InferenceEngine::CNNNetwork &network) { + auto inputLayers = InferenceEngine::CNNNetGetAllInputLayers(network); + for (auto inputIter : inputLayers) { + for (size_t o = 0; o < inputIter->outData.size(); o++) { + for (auto bfInitIter : getInputTo(inputIter->outData[o])) { + if (inputIter->outData[o]->getPrecision() == Precision::BF16) { + // we don't need to enforce bf16-mode for the next layer + break; + } + auto bfInitLayer = bfInitIter.second; + if (_initbf16.find(bfInitLayer->type) != _initbf16.end()) { + if (CaselessEq()(bfInitLayer->type, "convolution")) { + // TODO: have to be removed after adding suitable implementation for convolution + break; + } + // insert convert + std::string layerName = inputIter->outData[o]->getName(); + LayerParams cnnLayerParams{layerName, "Convert", Precision::FP32}; + auto lay = std::make_shared(cnnLayerParams); + std::map par = {{"name", layerName}, + {"type", "Convert"}, + {"precision", "FP32"}}; + lay->params = par; + CNNLayerPtr convertLayer(lay); + BF16Transformer::addLayerToCNNNetworkAfterData(inputIter->outData[o], convertLayer, bfInitLayer->name, + network); + // compute input port id for bfInitLayer + for (size_t i = 0; i < bfInitLayer->insData.size(); i++) { + if (bfInitLayer->insData[i].lock()->getName() == inputIter->outData[o]->getName()) { + // set conv input as bf + bfInitLayer->insData[i].lock()->setPrecision(Precision::BF16); + break; + } + } + break; + } + } + } + } +} \ No newline at end of file diff --git a/inference-engine/src/mkldnn_plugin/bf16transformer.h 
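// Editor's aside: addLayerToCNNNetworkAfterData() above splices a freshly created layer into
// an existing producer -> consumer edge of the CNNNetwork. Below is a minimal sketch of that
// re-wiring on a hypothetical adjacency structure -- Node and spliceAfter are illustrative
// names, not Inference Engine API.
#include <algorithm>
#include <memory>
#include <string>
#include <vector>

struct Node {
    std::string name;
    std::vector<std::shared_ptr<Node>> inputs;   // producers of this node
    std::vector<std::shared_ptr<Node>> outputs;  // consumers of this node
};

// Turn parent -> child into parent -> mid -> child.
void spliceAfter(const std::shared_ptr<Node>& parent,
                 const std::shared_ptr<Node>& mid,
                 const std::shared_ptr<Node>& child) {
    std::replace(parent->outputs.begin(), parent->outputs.end(), child, mid);
    std::replace(child->inputs.begin(), child->inputs.end(), parent, mid);
    mid->inputs.push_back(parent);
    mid->outputs.push_back(child);
}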
b/inference-engine/src/mkldnn_plugin/bf16transformer.h index 6ff30cdcae3482..3f302348e4778f 100644 --- a/inference-engine/src/mkldnn_plugin/bf16transformer.h +++ b/inference-engine/src/mkldnn_plugin/bf16transformer.h @@ -8,15 +8,22 @@ #include #include #include +#include namespace MKLDNNPlugin { class BF16Transformer { const InferenceEngine::details::caseless_set _initbf16 = - { "convolution", "fullyconnected", "innerproduct", "gemm" }; + { "convolution", "fullyconnected", "innerproduct", "gemm", "RegionYolo" }; const InferenceEngine::details::caseless_set _complementbf16 = - { "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "logistic", - "exp", "gelu", "clamp", "swish", "prelu", "pooling", "norm", "gather", "memory" }; + { "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "normalize", + "sigmoid", "ReLU6", "not", "activation", "HSwish", "mish", "logistic", "mod", "resample", + "exp", "gelu", "clamp", "swish", "prelu", "pooling", "norm", "gather", "memory", "mvn", "crop", "activation", + "broadcast", "convert", "BatchToSpace", "DepthToSpace", "ExtractImagePatches", "concat", "power", "lrn", + "permute", "ScatterUpdate", "ScatterElementsUpdate", "ScatterNDUpdate", "depthwise", + "select", "ShuffleChannels", "SpaceToBatch", "SpaceToDepth", "squeeze", "StridedSlice", "unsqueeze", "eltwise", + "ReduceAnd", "ReduceOr", "ReduceMax", "ReduceMin" }; + const InferenceEngine::details::caseless_set _multiinput = { "concat", "eltwise" }; // prevent fallback to fp32 without considering both input and output nodes @@ -33,6 +40,13 @@ class BF16Transformer { */ bool tryToMarkFP32(InferenceEngine::DataPtr data, const std::set &immutable); + /** + * Because of singularity of input node, layer, following input doesn't support bf16 itself. + * We fix it by insertion of convert layer, which has to be replaced to reorder in graph optimizer. 
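// Editor's aside: the two caseless sets above play different roles. _initbf16 lists layer
// types that natively execute in bfloat16 and therefore start a bf16 region; _complementbf16
// lists layers that merely keep bf16 tensors flowing and must not force a fallback to FP32.
// The snippet below only illustrates those roles with hypothetical names and a handful of
// types -- it is not the transformer's actual marking pass, which marks eagerly and then
// rolls edges back to FP32 in optimizeToFloat().
#include <set>
#include <string>

static const std::set<std::string> initBF16   = {"Convolution", "FullyConnected", "GEMM"};
static const std::set<std::string> followBF16 = {"ReLU", "Pooling", "Concat", "Eltwise"};

// A layer's output may stay in bf16 if the layer either initiates bf16 execution itself,
// or only passes data through and its producer already runs in bf16.
static bool outputMayStayBF16(const std::string& type, bool producerRunsBF16) {
    return initBF16.count(type) != 0 || (followBF16.count(type) != 0 && producerRunsBF16);
}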
+ * + */ + void insertConvertAfterInput(InferenceEngine::CNNNetwork &network); + public: /** * Restores Float point data types on edges which goes to non supported layers @@ -61,6 +75,16 @@ class BF16Transformer { */ void convertToBFloat16(InferenceEngine::CNNNetwork &network); + /** + * inserts given layer after current tensor + */ + static void addLayerToCNNNetworkAfterData( + InferenceEngine::DataPtr parentOutData, + InferenceEngine::CNNLayerPtr layer, + const std::string& nextLayerName, + InferenceEngine::ICNNNetwork& net, + const int childInsDataIndex = -1); + InferenceEngine::MemoryBlob::Ptr convertBF16ToFloat(InferenceEngine::MemoryBlob::Ptr); }; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp index ebda5795690100..d5c4e4db1db20c 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp @@ -145,6 +145,9 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap graph.RemoveDroppedNodes(); #if defined (COMPILED_CPU_MKLDNN_REORDER_NODE) + ChangeConvertToReorder(graph); + graph.RemoveDroppedNodes(); + DropDoubleReorders(graph); graph.RemoveDroppedNodes(); @@ -1918,6 +1921,55 @@ void MKLDNNGraphOptimizer::DropConvertReorder(MKLDNNGraph& graph) { } } } + +void MKLDNNGraphOptimizer::ChangeConvertToReorder(MKLDNNGraph& graph) { + std::vector continuousPrecisions{ + Precision::BF16, + Precision::FP32 + }; + for (int ind = 0; ind < graph.GetNodes().size(); ind++) { + auto convertCandidate = graph.GetNodes().at(ind); + std::string nodeType = convertCandidate->getTypeStr(); + if (!InferenceEngine::details::CaselessEq()(nodeType, "convert")) { + continue; + } + auto inputPrecision = convertCandidate->getCnnLayer()->insData[0].lock()->getPrecision(); + auto outputPrecision = convertCandidate->getCnnLayer()->outData[0]->getPrecision(); + if (std::find(continuousPrecisions.begin(), continuousPrecisions.end(), inputPrecision) == continuousPrecisions.end() || + std::find(continuousPrecisions.begin(), continuousPrecisions.end(), outputPrecision) == continuousPrecisions.end()) { + continue; + } + std::unordered_set uniqueLayerNames; + for (auto node : graph.GetNodes()) { + uniqueLayerNames.insert(node->getCnnLayer()->name); + } + auto parentEdge = convertCandidate->getParentEdges()[0].lock(); + auto parentNode = parentEdge->getParent(); + auto &childEdge = convertCandidate->getChildEdgeAt(0); + auto childNode = childEdge->getChild(); + std::string basicLayerName = childEdge->getParent()->getName() + "_" + + MKLDNNExtensionUtils::getReorderArgs(convertCandidate->getCnnLayer()->insData[0].lock()->getTensorDesc(), + convertCandidate->getCnnLayer()->outData[0]->getTensorDesc()) + + "_" + childEdge->getChild()->getName(); + std::string layerName = basicLayerName; + int idx = 0; + while (uniqueLayerNames.find(layerName) != uniqueLayerNames.end()) { + idx++; + layerName = basicLayerName + "_" + std::to_string(idx); + } + // create temporary edge + auto oldParentOutputPort = parentEdge->getInputNum(); + auto oldChildInputPort = childEdge->getOutputNum(); + MKLDNNEdgePtr tempEdge(new MKLDNNEdge(parentNode, childNode, oldParentOutputPort, oldChildInputPort)); + + graph.InsertReorder(tempEdge, layerName, convertCandidate->getCnnLayer()->insData[0].lock()->getTensorDesc(), + convertCandidate->getCnnLayer()->outData[0]->getTensorDesc(), false); + parentNode->removeEdge(parentEdge); + parentEdge->drop(); + 
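// Editor's aside: ChangeConvertToReorder() looks for standalone Convert nodes whose input and
// output precisions are both "continuous" floating formats (FP32, BF16) and re-expresses them
// as MKLDNN reorders, since a reorder can change the data type while copying. The selection
// check is sketched below on a hypothetical NodeInfo record; names are illustrative only.
#include <algorithm>
#include <string>
#include <vector>

struct NodeInfo {
    std::string type;     // e.g. "Convert"
    std::string inPrc;    // e.g. "FP32"
    std::string outPrc;   // e.g. "BF16"
};

static bool convertibleToReorder(const NodeInfo& n) {
    static const std::vector<std::string> continuous = {"BF16", "FP32"};
    auto known = [&](const std::string& p) {
        return std::find(continuous.begin(), continuous.end(), p) != continuous.end();
    };
    return n.type == "Convert" && known(n.inPrc) && known(n.outPrc);
}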
childEdge->drop(); + graph.DropNode(convertCandidate); + } +} #endif void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h index 481ca61d0562fe..025b79c9b7e864 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h @@ -46,6 +46,7 @@ class MKLDNNGraphOptimizer { #if defined (COMPILED_CPU_MKLDNN_REORDER_NODE) void DropDoubleReorders(MKLDNNGraph& graph); void DropConvertReorder(MKLDNNGraph& graph); + void ChangeConvertToReorder(MKLDNNGraph &graph); #endif void FuseConvolutionAndZeroPoints(MKLDNNGraph &graph); void FuseBroadcastAndEltwise(MKLDNNGraph &graph); diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp index ae7db8843395d4..54856e5a4cff8d 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp @@ -105,6 +105,7 @@ void MKLDNNPlugin::MKLDNNInferRequest::PushInputData() { // these precisions are supported by mkldnn, so we push the blob directly case InferenceEngine::Precision::I8: case InferenceEngine::Precision::I32: + case InferenceEngine::Precision::BF16: case InferenceEngine::Precision::FP32: { break; } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp index fc1fe7972de010..e66af3faa7b1e9 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp @@ -278,6 +278,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::ICNNNetwork &network, const st input_precision != InferenceEngine::Precision::I16 && input_precision != InferenceEngine::Precision::I8 && input_precision != InferenceEngine::Precision::U8 && + input_precision != InferenceEngine::Precision::BF16 && input_precision != InferenceEngine::Precision::BOOL && input_precision != InferenceEngine::Precision::I64 && input_precision != InferenceEngine::Precision::U64) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/argmax.cpp b/inference-engine/src/mkldnn_plugin/nodes/argmax.cpp index 449168f504cb10..63fa62a58074e8 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/argmax.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/argmax.cpp @@ -27,7 +27,7 @@ class ArgMaxImpl: public ExtLayerBase { conf.axis_index_ = conf.has_axis_ ? 
std::stoi(layer->params.at("axis")) :0; - addConfig(layer, {DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)}); + addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/base.hpp b/inference-engine/src/mkldnn_plugin/nodes/base.hpp index f31812e4cbd720..b9b650b3eca616 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/base.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/base.hpp @@ -60,8 +60,8 @@ class ExtLayerBase: public ILayerExecImpl { explicit DataConfigurator(ConfLayout l): layout(l) {} - DataConfigurator(ConfLayout l, bool constant, int inplace = -1): - layout(l), constant(constant), inplace(inplace) {} + DataConfigurator(ConfLayout l, bool constant, int inplace = -1, Precision::ePrecision prc = Precision::UNSPECIFIED): + layout(l), constant(constant), inplace(inplace), prc(prc) {} DataConfigurator(ConfLayout l, Precision::ePrecision prc): layout(l), prc(prc) {} @@ -128,14 +128,7 @@ class ExtLayerBase: public ILayerExecImpl { conf.layout = ConfLayout::PLN; } - // All extension layers support only FP32 precision! - // fixing of BF16 precisions where they are - layers naturally support only FP32 - // if we see BF16, that means another floating point format which will be converted by reorder - // added by current mkl-dnn cpu plugin when it figure out diff in data types on input and output of edges InferenceEngine::Precision precision = (conf.prc == Precision::UNSPECIFIED) ? data_desc.getPrecision() : Precision(conf.prc); - if (precision == Precision::BF16) { - precision = Precision::FP32; - } if (conf.layout == ConfLayout::ANY) { dataConfig.desc = TensorDesc(precision, data_dims, InferenceEngine::Layout::ANY); } else { diff --git a/inference-engine/src/mkldnn_plugin/nodes/broadcast.cpp b/inference-engine/src/mkldnn_plugin/nodes/broadcast.cpp index 5734cf23808c58..f975202b078c93 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/broadcast.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/broadcast.cpp @@ -31,7 +31,7 @@ class BroadcastImpl: public ExtLayerBase { LayerConfig config; DataConfig dataConfig, shapeConfig; - Precision dataPrecision = layer->outData[0]->getTensorDesc().getPrecision(); + Precision dataPrecision = layer->insData[BROADCAST_INPUT].lock()->getTensorDesc().getPrecision(); const SizeVector& data_dims = layer->insData[BROADCAST_INPUT].lock()->getTensorDesc().getDims(); dataConfig.desc = TensorDesc(dataPrecision, data_dims, layer->insData[BROADCAST_INPUT].lock()->getTensorDesc().getLayout()); diff --git a/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp b/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp index 5886c16ab6604b..e27a1b83c279de 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp @@ -41,19 +41,16 @@ class BucketizeImpl : public ExtLayerBase { input_precision = input->getTensorDesc().getPrecision(); if (input_precision != Precision::FP32 && input_precision != Precision::I32 && input_precision != Precision::I64) { - THROW_IE_EXCEPTION << layer->name - << " Incorrect input precision of the input. 
Only FP32, I32 and I64 are supported!"; + input_precision = Precision::FP32; } boundaries_precision = boundaries->getTensorDesc().getPrecision(); if (boundaries_precision != Precision::FP32 && boundaries_precision != Precision::I32 && boundaries_precision != Precision::I64) { - THROW_IE_EXCEPTION << layer->name - << " Incorrect input precision of the boundaries tensor. Only FP32, I32 and I64 are supported!"; + boundaries_precision = Precision::FP32; } output_precision = layer->outData[OUTPUT_TENSOR_PORT]->getTensorDesc().getPrecision(); if (output_precision != Precision::I32 && output_precision != Precision::I64) { - THROW_IE_EXCEPTION << layer->name - << " Incorrect precision of the output tensor. Only I32 and I64 are supported!"; + output_precision = Precision::I32; } // check dimensions of input tensors @@ -73,8 +70,8 @@ class BucketizeImpl : public ExtLayerBase { num_values = std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), 1, std::multiplies()); addConfig(layer, - { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); + { DataConfigurator(ConfLayout::PLN, input_precision), DataConfigurator(ConfLayout::PLN, boundaries_precision) }, + { DataConfigurator(ConfLayout::PLN, output_precision) }); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/cpu_convert.cpp b/inference-engine/src/mkldnn_plugin/nodes/common/cpu_convert.cpp index aa2cefaa618dac..17a79325f2f649 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/cpu_convert.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/common/cpu_convert.cpp @@ -4,13 +4,14 @@ #include "cpu_convert.h" #include "cpu_memcpy.h" +#include "utils/bfloat16.hpp" #include #include using namespace InferenceEngine; template -void convert(void *srcPtr, void *dstPtr, const size_t size) { +void convert(const void *srcPtr, void *dstPtr, const size_t size) { if (std::is_same::value) { cpu_memcpy(dstPtr, srcPtr, size*sizeof(dstType)); } else { @@ -24,7 +25,7 @@ void convert(void *srcPtr, void *dstPtr, const size_t size) { } template -void convertFrom(void *srcPtr, void *dstPtr, Precision dstPrc, const size_t size) { +void convertFrom(const void *srcPtr, void *dstPtr, Precision dstPrc, const size_t size) { switch (dstPrc) { case Precision::U8: convert::value_type>(srcPtr, dstPtr, size); @@ -50,6 +51,9 @@ void convertFrom(void *srcPtr, void *dstPtr, Precision dstPrc, const size_t size case Precision::FP32: convert::value_type>(srcPtr, dstPtr, size); break; + case Precision::BF16: + convert(srcPtr, dstPtr, size); + break; case Precision::BOOL: convert::value_type>(srcPtr, dstPtr, size); break; @@ -58,7 +62,7 @@ void convertFrom(void *srcPtr, void *dstPtr, Precision dstPrc, const size_t size } } -void cpu_convert(void *srcPtr, void *dstPtr, Precision srcPrc, Precision dstPrc, const size_t size) { +void cpu_convert(const void *srcPtr, void *dstPtr, Precision srcPrc, Precision dstPrc, const size_t size) { if (srcPtr == nullptr || dstPtr == nullptr) THROW_IE_EXCEPTION << "cpu_convert has null data pointer"; @@ -92,6 +96,9 @@ void cpu_convert(void *srcPtr, void *dstPtr, Precision srcPrc, Precision dstPrc, case Precision::FP32: convertFrom::value_type>(srcPtr, dstPtr, dstPrc, size); break; + case Precision::BF16: + convertFrom(srcPtr, dstPtr, dstPrc, size); + break; case Precision::BOOL: convertFrom::value_type>(srcPtr, dstPtr, dstPrc, size); break; diff --git 
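// Editor's aside: the new BF16 branches above rely on bfloat16 being the upper 16 bits of an
// IEEE-754 binary32 value. A self-contained sketch of that round trip follows; it assumes
// round-to-nearest-even on the downcast (what vcvtneps2bf16 does), omits NaN special cases,
// and does not use the plugin's utils/bfloat16.hpp type.
#include <cstdint>
#include <cstring>

static uint16_t f32_to_bf16(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    const uint32_t rounding = 0x7FFFu + ((bits >> 16) & 1u);  // round to nearest even
    return static_cast<uint16_t>((bits + rounding) >> 16);
}

static float bf16_to_f32(uint16_t h) {
    const uint32_t bits = static_cast<uint32_t>(h) << 16;  // zero-fill the dropped mantissa bits
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}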
a/inference-engine/src/mkldnn_plugin/nodes/common/cpu_convert.h b/inference-engine/src/mkldnn_plugin/nodes/common/cpu_convert.h index 8c2baa37929f11..5ace2e7cd6a2ef 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/cpu_convert.h +++ b/inference-engine/src/mkldnn_plugin/nodes/common/cpu_convert.h @@ -20,4 +20,4 @@ * @return none. */ -void cpu_convert(void *srcPtr, void *dstPtr, InferenceEngine::Precision srcPrc, InferenceEngine::Precision dstPrc, const size_t size); +void cpu_convert(const void *srcPtr, void *dstPtr, InferenceEngine::Precision srcPrc, InferenceEngine::Precision dstPrc, const size_t size); diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/softmax.cpp b/inference-engine/src/mkldnn_plugin/nodes/common/softmax.cpp index f9b4f57a5b2e2d..bd625795203490 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/softmax.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/common/softmax.cpp @@ -3,25 +3,35 @@ // #include -#include #include +#include #include "jit_generator.hpp" #include "jit_uni_eltwise.hpp" +#include "utils/bfloat16.hpp" #include "softmax.h" using namespace InferenceEngine; +using namespace MKLDNNPlugin; +using namespace mkldnn; using namespace mkldnn::impl::cpu; using namespace mkldnn::impl::utils; #define GET_OFF(field) offsetof(jit_args_softmax, field) struct jit_args_softmax { - const float* src; - const float* dst; - size_t stride; + const void* src; + void* dst; + size_t src_stride; + size_t dst_stride; size_t work_amount; }; +struct jit_softmax_config_params { + Precision src_dt; + Precision dst_dt; +}; + + struct jit_uni_softmax_kernel { void (*ker_)(const jit_args_softmax *); @@ -35,14 +45,15 @@ template struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_softmax_kernel_f32) - jit_uni_softmax_kernel_f32() : jit_uni_softmax_kernel(), jit_generator() { + jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jit_uni_softmax_kernel(), jit_generator() { exp_injector.reset(new jit_uni_eltwise_injector_f32(this, alg_kind::eltwise_exp, 0.f, 0.f)); this->preamble(); mov(reg_src, ptr[reg_params + GET_OFF(src)]); mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - mov(reg_stride, ptr[reg_params + GET_OFF(stride)]); + mov(reg_src_stride, ptr[reg_params + GET_OFF(src_stride)]); + mov(reg_dst_stride, ptr[reg_params + GET_OFF(dst_stride)]); mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); Xbyak::Label max_loop_label; @@ -54,12 +65,12 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge mov(aux_reg_work_amount, reg_work_amount); mov(aux_reg_src, reg_src); - uni_vmovups(vmm_max, ptr[aux_reg_src]); + load_vector(vmm_max, ptr[aux_reg_src], jcp.src_dt); L(max_loop_label); { cmp(aux_reg_work_amount, 0); jle(max_loop_end_label, T_NEAR); - uni_vmovups(vmm_val, ptr[aux_reg_src]); + load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt); if (isa == sse42) { uni_vmovups(vmm_mask, vmm_val); @@ -77,7 +88,7 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge uni_vblendvps(vmm_max, vmm_max, vmm_val, vmm_mask); } - add(aux_reg_src, reg_stride); + add(aux_reg_src, reg_src_stride); sub(aux_reg_work_amount, 1); jmp(max_loop_label, T_NEAR); @@ -93,16 +104,16 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge cmp(aux_reg_work_amount, 0); jle(exp_loop_end_label, T_NEAR); - uni_vmovups(vmm_val, ptr[aux_reg_src]); + load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt); uni_vsubps(vmm_val, 
vmm_val, vmm_max); exp_injector->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1); uni_vaddps(vmm_exp_sum, vmm_exp_sum, vmm_val); - uni_vmovups(ptr[aux_reg_dst], vmm_val); + store_vector(ptr[aux_reg_dst], vmm_val, jcp.dst_dt); - add(aux_reg_src, reg_stride); - add(aux_reg_dst, reg_stride); + add(aux_reg_src, reg_src_stride); + add(aux_reg_dst, reg_dst_stride); sub(aux_reg_work_amount, 1); jmp(exp_loop_label, T_NEAR); @@ -116,13 +127,13 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge cmp(aux_reg_work_amount, 0); jle(div_loop_end_label, T_NEAR); - uni_vmovups(vmm_val, ptr[aux_reg_dst]); + load_vector(vmm_val, ptr[aux_reg_dst], jcp.dst_dt); uni_vdivps(vmm_val, vmm_val, vmm_exp_sum); - uni_vmovups(ptr[aux_reg_dst], vmm_val); + store_vector(ptr[aux_reg_dst], vmm_val, jcp.dst_dt); - add(aux_reg_dst, reg_stride); + add(aux_reg_dst, reg_dst_stride); sub(aux_reg_work_amount, 1); jmp(div_loop_label, T_NEAR); @@ -147,7 +158,8 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge Xbyak::Reg64 aux_reg_dst = r15; Xbyak::Reg64 reg_work_amount = r11; Xbyak::Reg64 aux_reg_work_amount = r12; - Xbyak::Reg64 reg_stride = r14; + Xbyak::Reg64 reg_src_stride = r14; + Xbyak::Reg64 reg_dst_stride = r10; Xbyak::Reg64 reg_params = abi_param1; Vmm vmm_mask = Vmm(0); @@ -158,23 +170,64 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge const Xbyak::Opmask k_mask = Xbyak::Opmask(1); std::shared_ptr> exp_injector; + + inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_dt) { + switch (src_dt) { + case Precision::FP32: + uni_vmovups(vmm_src, op); + break; + case Precision::BF16: + vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; + default: + assert(!"unknown src_dt"); + } + } + inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, Precision dst_dt) { + Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx()); + + switch (dst_dt) { + case Precision::FP32: + uni_vmovups(op, vmm_dst); + break; + case Precision::BF16: + vcvtneps2bf16(ymm_dst, vmm_dst); + uni_vmovups(op, ymm_dst); + break; + default: + assert(!"unknown dst_dt"); + } + } }; -SoftmaxGeneric::SoftmaxGeneric() { +SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc) + : input_prec(inpPrc), output_prec(outPrc) { + if (Precision::BF16 == output_prec) { + if (!mayiuse(avx512_core_bf16)) { + THROW_IE_EXCEPTION << "SoftmaxGeneric doesn't support BF16 precision on this target."; + } + } + block_size = 1; + auto jcp = jit_softmax_config_params(); + jcp.src_dt = inpPrc; + jcp.dst_dt = outPrc; + if (mayiuse(avx512_common)) { - softmax_kernel.reset(new jit_uni_softmax_kernel_f32()); + softmax_kernel.reset(new jit_uni_softmax_kernel_f32(jcp)); block_size = 16; } else if (mayiuse(avx2)) { - softmax_kernel.reset(new jit_uni_softmax_kernel_f32()); + softmax_kernel.reset(new jit_uni_softmax_kernel_f32(jcp)); block_size = 8; } else if (mayiuse(sse42)) { - softmax_kernel.reset(new jit_uni_softmax_kernel_f32()); + softmax_kernel.reset(new jit_uni_softmax_kernel_f32(jcp)); block_size = 4; } } -void SoftmaxGeneric::execute(const float *src_data, float *dst_data, int B, int C, int H, int W) { +template +void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, int B, int C, int H, int W) { for (int b = 0; b < B; b++) { int tail_start = 0; if (softmax_kernel) { @@ -185,7 +238,8 @@ void SoftmaxGeneric::execute(const float *src_data, float *dst_data, int B, int arg.src = src_data + b * C * H * W 
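// Editor's aside: a plain scalar reference of what the JIT kernel above computes for one
// spatial position -- softmax over the C channels, where consecutive channels are `stride`
// elements apart (H*W for planar layouts). Illustrative only, FP32 in and out.
#include <algorithm>
#include <cmath>
#include <cstddef>

static void softmax_over_channels(const float* src, float* dst, int C, std::ptrdiff_t stride) {
    float max_v = src[0];
    for (int c = 1; c < C; ++c) max_v = std::max(max_v, src[c * stride]);
    float sum = 0.f;
    for (int c = 0; c < C; ++c) {
        const float e = std::exp(src[c * stride] - max_v);
        dst[c * stride] = e;  // second pass of the kernel stores exp(x - max)
        sum += e;
    }
    for (int c = 0; c < C; ++c) dst[c * stride] /= sum;  // third pass divides by the sum
}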
+ ib * block_size; arg.dst = dst_data + b * C * H * W + ib * block_size; - arg.stride = static_cast((size_t)(H) * W * sizeof(float)); + arg.src_stride = static_cast((size_t)(H) * W * sizeof(in_data_t)); + arg.dst_stride = static_cast((size_t)(H) * W * sizeof(out_data_t)); arg.work_amount = static_cast(C); (*softmax_kernel)(&arg); @@ -214,3 +268,31 @@ void SoftmaxGeneric::execute(const float *src_data, float *dst_data, int B, int }); } } + +void SoftmaxGeneric::execute(const uint8_t *src_data, uint8_t *dst_data, int B, int C, int H, int W) { + if (Precision::FP32 == input_prec) { + auto float_src_data = reinterpret_cast(src_data); + if (Precision::FP32 == output_prec) { + auto float_dst_data = reinterpret_cast(dst_data); + calculate(float_src_data, float_dst_data, B, C, H, W); + } else if (Precision::BF16 == output_prec) { + auto bf16_dst_data = reinterpret_cast(dst_data); + calculate(float_src_data, bf16_dst_data, B, C, H, W); + } else { + THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name(); + } + } else if (Precision::BF16 == input_prec) { + auto bf16_src_data = reinterpret_cast(src_data); + if (Precision::FP32 == output_prec) { + auto float_dst_data = reinterpret_cast(dst_data); + calculate(bf16_src_data, float_dst_data, B, C, H, W); + } else if (Precision::BF16 == output_prec) { + auto bf16_dst_data = reinterpret_cast(dst_data); + calculate(bf16_dst_data, bf16_dst_data, B, C, H, W); + } else { + THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name(); + } + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); + } +} diff --git a/inference-engine/src/mkldnn_plugin/nodes/common/softmax.h b/inference-engine/src/mkldnn_plugin/nodes/common/softmax.h index 2849439c370a70..53046ed406ecfc 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/common/softmax.h +++ b/inference-engine/src/mkldnn_plugin/nodes/common/softmax.h @@ -6,6 +6,7 @@ #include #include +#include #include "defs.h" #include "ie_parallel.hpp" @@ -37,12 +38,16 @@ void softmax_many_batches(const float *src_data, float *dst_data, int B, int C, class SoftmaxGeneric { public: - SoftmaxGeneric(); + SoftmaxGeneric(InferenceEngine::Precision inpPrc, InferenceEngine::Precision outPrc); - void execute(const float *src_data, float *dst_data, int B, int C, int H, int W); + void execute(const uint8_t *src_data, uint8_t *dst_data, int B, int C, int H, int W); +private: + template + void calculate(const in_data_t* src_data, out_data_t* dst_data, int B, int C, int H, int W); private: int block_size; + InferenceEngine::Precision input_prec, output_prec; std::shared_ptr softmax_kernel; }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/convert.cpp b/inference-engine/src/mkldnn_plugin/nodes/convert.cpp index eed226db4b211e..9e2cf81c6d4b8e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/convert.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/convert.cpp @@ -4,10 +4,8 @@ #include "base.hpp" -#include #include #include -#include "ie_parallel.hpp" #include "ie_precision.hpp" #include "common/cpu_convert.h" diff --git a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy.cpp index 717af9f9e3ff17..87e688684a1d3f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy.cpp @@ -20,8 +20,8 @@ class CTCGreedyDecoderImpl: public ExtLayerBase { THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; std::vector inps; - 
inps.resize(layer->insData.size(), DataConfigurator(ConfLayout::PLN)); - addConfig(layer, inps, {DataConfigurator(ConfLayout::PLN)}); + inps.resize(layer->insData.size(), DataConfigurator(ConfLayout::PLN, Precision::FP32)); + addConfig(layer, inps, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp index a1954db4378dcb..6ac058e22c977f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp @@ -27,13 +27,10 @@ class CTCLossImpl : public ExtLayerBase { auto logitsData = layer->insData[0].lock(); if (logitsData == nullptr) THROW_IE_EXCEPTION << _logPrefix << " has nullable logits data"; - auto logitsPrecision = logitsData->getTensorDesc().getPrecision(); - if (logitsPrecision == Precision::BF16) - logitsPrecision = Precision::FP32; LayerConfig config; config.inConfs.resize(layer->insData.size()); - config.inConfs[0].desc = TensorDesc(logitsPrecision, + config.inConfs[0].desc = TensorDesc(Precision::FP32, logitsData->getTensorDesc().getDims(), TensorDesc::getLayoutByDims(logitsData->getTensorDesc().getDims())); auto intPrecision = Precision::I32; @@ -48,7 +45,7 @@ class CTCLossImpl : public ExtLayerBase { DataConfig outConfig; auto& outDims = layer->outData[0]->getTensorDesc().getDims(); - outConfig.desc = TensorDesc(logitsPrecision, + outConfig.desc = TensorDesc(Precision::FP32, outDims, TensorDesc::getLayoutByDims(outDims)); config.outConfs.push_back(outConfig); diff --git a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp b/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp index 140a56e5416eef..e96cf5ee32eaa4 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp @@ -112,8 +112,8 @@ class DetectionOutputImpl: public ExtLayerBase { _num_priors_actual = InferenceEngine::make_shared_blob({Precision::I32, num_priors_actual_size, C}); _num_priors_actual->allocate(); - std::vector in_data_conf(layer->insData.size(), DataConfigurator(ConfLayout::PLN)); - addConfig(layer, in_data_conf, {DataConfigurator(ConfLayout::PLN)}); + std::vector in_data_conf(layer->insData.size(), DataConfigurator(ConfLayout::PLN, Precision::FP32)); + addConfig(layer, in_data_conf, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/embedding_bag_sum.cpp b/inference-engine/src/mkldnn_plugin/nodes/embedding_bag_sum.cpp index 58f907f5835d76..dace4c5195c72f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/embedding_bag_sum.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/embedding_bag_sum.cpp @@ -4,7 +4,6 @@ #include "embedding_bag_sum.hpp" #include "ie_parallel.hpp" -#include "jit_generator.hpp" #include "list.hpp" #include diff --git a/inference-engine/src/mkldnn_plugin/nodes/fill.cpp b/inference-engine/src/mkldnn_plugin/nodes/fill.cpp index e08897184a1701..e3831b83ef1e36 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/fill.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/fill.cpp @@ -28,9 +28,6 @@ class FillImpl: public ExtLayerBase { if (fill_dims.size() > 1) THROW_IE_EXCEPTION << layer->name << " Fill dimensions vector should be 1 dimension"; - if 
(layer->insData[FILL_DIMS].lock()->getTensorDesc().getPrecision() != Precision::I32) - THROW_IE_EXCEPTION << layer->name << " Fill dimensions vector should be I32!"; - SizeVector value_dims = layer->insData[FILL_VALUE].lock()->getTensorDesc().getDims(); if (value_dims.size() > 1) THROW_IE_EXCEPTION << layer->name << " Value scalar should have 1 dimension"; @@ -39,12 +36,12 @@ class FillImpl: public ExtLayerBase { layer->outData[0]->getTensorDesc().getPrecision() == Precision::I32) && !(layer->insData[FILL_VALUE].lock()->getTensorDesc().getPrecision() == Precision::FP32 && layer->outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) { - THROW_IE_EXCEPTION << layer->name << - " 'Value' input scalars and output tensor should have same precision and only FP32 and I32 are supported!"; + addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::FP32) }, + { DataConfigurator(ConfLayout::PLN, Precision::FP32) }); + } else { + addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN) }, + { DataConfigurator(ConfLayout::PLN) }); } - - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/gather.cpp b/inference-engine/src/mkldnn_plugin/nodes/gather.cpp index 4cc1e6fb10ca1c..24af01f77546c3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/gather.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/gather.cpp @@ -27,7 +27,7 @@ class GatherImpl: public ExtLayerBase { Precision inIdxPrecision = layer->insData[GATHER_INDEXES].lock()->getTensorDesc().getPrecision(); if (inIdxPrecision != Precision::FP32 && inIdxPrecision != Precision::I32 && inIdxPrecision != Precision::FP16) - THROW_IE_EXCEPTION << layer->name << " Incorrect input precision. Only FP32, FP16 or I32 are supported!"; + inIdxPrecision = Precision::I32; axis = layer->GetParamAsInt("axis"); @@ -52,7 +52,7 @@ class GatherImpl: public ExtLayerBase { LayerConfig config; DataConfig dataConfigIdx, dataConfigDct; - Precision dataPrecision = layer->outData[0]->getTensorDesc().getPrecision(); + Precision dataPrecision = layer->insData[GATHER_DICTIONARY].lock()->getTensorDesc().getPrecision(); dataConfigDct.desc = TensorDesc(dataPrecision, dictionary_dims, layer->insData[GATHER_DICTIONARY].lock()->getTensorDesc().getLayoutByDims(dictionary_dims)); config.inConfs.push_back(dataConfigDct); diff --git a/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp b/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp index 5e420b22ddd23a..7a0b527c18af16 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp @@ -30,9 +30,8 @@ class GatherTreeImpl: public ExtLayerBase { THROW_IE_EXCEPTION << layer->name << " Incorrect number of output edges."; precision = layer->insData[GATHER_TREE_STEP_IDX].lock()->getTensorDesc().getPrecision(); - if (precision != Precision::FP32 && precision != Precision::I32) - THROW_IE_EXCEPTION << layer->name << " Incorrect data tensor precision. 
Only I32 or FP32 are supported."; + precision = Precision::FP32; if (layer->insData[GATHER_TREE_PARENT_IDX].lock()->getTensorDesc().getPrecision() != precision || layer->insData[GATHER_TREE_MAX_SEQ_LEN].lock()->getTensorDesc().getPrecision() != precision || @@ -49,9 +48,9 @@ class GatherTreeImpl: public ExtLayerBase { if (layer->insData[GATHER_TREE_END_TOKEN].lock()->getTensorDesc().getDims().size() != 1) THROW_IE_EXCEPTION << layer->name << " end_token should be 1 dimension"; - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), - DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); + addConfig(layer, { DataConfigurator(ConfLayout::PLN, precision), DataConfigurator(ConfLayout::PLN, precision), + DataConfigurator(ConfLayout::PLN, precision), DataConfigurator(ConfLayout::PLN, precision) }, + { DataConfigurator(ConfLayout::PLN, precision) }); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/grn.cpp b/inference-engine/src/mkldnn_plugin/nodes/grn.cpp index b5e4e214965ade..d412ab38554653 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/grn.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/grn.cpp @@ -22,7 +22,7 @@ class GRNImpl: public ExtLayerBase { bias = layer->GetParamAsFloat("bias"); - addConfig(layer, {{ConfLayout::PLN, false, 0}}, {{ConfLayout::PLN, false, 0}}); + addConfig(layer, {{ConfLayout::PLN, false, 0, Precision::FP32}}, {{ConfLayout::PLN, false, 0, Precision::FP32}}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/interp.cpp b/inference-engine/src/mkldnn_plugin/nodes/interp.cpp index 873575b8be4b96..6e2186899c3c33 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/interp.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/interp.cpp @@ -157,21 +157,13 @@ class InterpImpl: public ExtLayerBase { if (inData->getTensorDesc().getDims().size() != 4) THROW_IE_EXCEPTION << "Interp supports only 4d blobs!"; - auto src_precision = inData->getTensorDesc().getPrecision(); - if (src_precision != Precision::FP32 && src_precision != Precision::U8 && src_precision != Precision::BF16) - THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only U8 or FP32 or BF16 are supported!"; - - auto dst_precision = layer->outData[0]->getTensorDesc().getPrecision(); - if (dst_precision != Precision::FP32 && dst_precision != Precision::BF16) - THROW_IE_EXCEPTION << layer->name << " Incorrect output data tensor precision. 
Only FP32 or BF16 are supported!"; - // We don't read other parameters since they are needed only for dst reshape in caffe pad_beg = layer->GetParamAsInt("pad_beg"); pad_end = layer->GetParamAsInt("pad_end"); align_corners = layer->GetParamAsBool("align_corners", true); ConfLayout blk_layout; - if (src_precision == Precision::U8) { + if (inData->getTensorDesc().getPrecision() == Precision::U8) { LayerConfig config; DataConfig dataConfigDct; dataConfigDct.desc = TensorDesc(Precision::U8, inData->getTensorDesc().getDims(), Layout::NCHW); @@ -197,15 +189,15 @@ class InterpImpl: public ExtLayerBase { if (mayiuse(avx512_common)) { blk_layout = ConfLayout::BLK16; interp_kernel.reset(new jit_uni_interp_kernel_f32()); - addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) }); + addConfig(layer, { DataConfigurator(blk_layout, Precision::FP32) }, { DataConfigurator(blk_layout, Precision::FP32) }); } else if (mayiuse(avx2)) { blk_layout = ConfLayout::BLK8; interp_kernel.reset(new jit_uni_interp_kernel_f32()); - addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) }); + addConfig(layer, { DataConfigurator(blk_layout, Precision::FP32) }, { DataConfigurator(blk_layout, Precision::FP32) }); } else { blk_layout = ConfLayout::BLK8; interp_kernel.reset(new jit_uni_interp_kernel_f32()); - addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) }); + addConfig(layer, { DataConfigurator(blk_layout, Precision::FP32) }, { DataConfigurator(blk_layout, Precision::FP32) }); } } } catch (InferenceEngine::details::InferenceEngineException &ex) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp b/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp index 3c3a32e4862ce6..d95309afc4797f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp @@ -51,7 +51,7 @@ class LogSoftmaxImpl: public ExtLayerBase { for (size_t i = (axis + 1); i < dims.size(); i++) reduced_axis_stride *= dims[i]; - addConfig(layer, { { ConfLayout::PLN, false, 0 } }, { { ConfLayout::PLN, false, 0 } }); + addConfig(layer, { { ConfLayout::PLN, false, 0, Precision::FP32 } }, { { ConfLayout::PLN, false, 0, Precision::FP32 } }); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/math.cpp b/inference-engine/src/mkldnn_plugin/nodes/math.cpp index 5a63ffe7128bb9..26d5939b98631f 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/math.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/math.cpp @@ -86,7 +86,7 @@ class MathImpl: public ExtLayerBase { else THROW_IE_EXCEPTION << layer->name << " Incorrect Math layer type!"; - addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); + addConfig(layer, {DataConfigurator(ConfLayout::PLN, false, 0, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, false, 0, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp index 3b90a458ddb3af..d6dde5692f1516 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp @@ -58,13 +58,12 @@ void MKLDNNCropNode::initSupportedPrimitiveDescriptors() { return; 
InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); precision = getCnnLayer()->outData[0]->getPrecision(); - if (precision != InferenceEngine::Precision::FP32) - precision = InferenceEngine::Precision::FP32; auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); + if (inputDataType != outputDataType) { + outputDataType = inputDataType; // Crop doesn't convert precisions, only moves data + } auto& inDims = getParentEdgeAt(0)->getDims(); if (inDims.ndims() != 2 && inDims.ndims() != 4 && inDims.ndims() != 5) { @@ -125,19 +124,19 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) { if (!MKLDNNMemory::IsPlainFormat(parentMem.GetFormat())) { m_block_size = parentMem.GetDescriptor().data.layout_desc.blocking.block_dims[1]; } - int m_inner_dim = dims[dims.size() - 1] * m_block_size; + const int m_inner_dim = dims[dims.size() - 1] * m_block_size; const memory &dst_d = getChildEdgeAt(0)->getMemory().GetPrimitive(); - int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims; + const int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims; // TODO: Rewrite it in general case. For every tensor // and rank, without using letter N,C,D,H,W - int OFFSET_N = (dst_ndims > 0) ? offsets[0] : 0; - int OFFSET_C = (dst_ndims > 1) ? offsets[1] : 0; - int OFFSET_D = (dst_ndims > 4) ? offsets[offsets.size() - 3] : 0; - int OFFSET_H = (dst_ndims > 2) ? offsets[offsets.size() - 2] : 0; - int OFFSET_W = (dst_ndims > 3) ? offsets[offsets.size() - 1] : 0; + const int OFFSET_N = (dst_ndims > 0) ? offsets[0] : 0; + const int OFFSET_C = (dst_ndims > 1) ? offsets[1] : 0; + const int OFFSET_D = (dst_ndims > 4) ? offsets[offsets.size() - 3] : 0; + const int OFFSET_H = (dst_ndims > 2) ? offsets[offsets.size() - 2] : 0; + const int OFFSET_W = (dst_ndims > 3) ? offsets[offsets.size() - 1] : 0; // TODO: Check applicability of dyn_batch_lim in early steps. // crop of batch dimension doesn't support dyn batch. @@ -155,42 +154,16 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) { const int IH = (src_ndims > 2) ? src_dims[src_dims.size() - 2] : 1; const int IW = (src_ndims > 3) ? 
src_dims[src_dims.size() - 1] : 1; - const auto *src_data = reinterpret_cast(parentMem.GetData()) + - parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding; - float *dst_data = reinterpret_cast(getChildEdgeAt(0)->getMemory().GetData()) + - getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; + const uint8_t itemSize = MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(parentMem.GetDataType())); + + const auto *src_data = reinterpret_cast(parentMem.GetData()) + + itemSize * parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding; + auto *dst_data = reinterpret_cast(getChildEdgeAt(0)->getMemory().GetData()) + + itemSize * getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding; -#ifdef _WIN32 - if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) { - for (int n = 0; n < ON; ++n) { - cpu_memcpy(&dst_data[n*OC], &src_data[(n+OFFSET_N)*IC + OFFSET_C], OC * sizeof(float)); - } - } else { - for (int n = 0; n < ON; ++n) { - for (int c = 0; c < OC; c += m_block_size) { - for (int d = 0; d < OD; ++d) { - for (int h = 0; h < OH; ++h) { - int dst_ind = - n*OC*OD*OH*OW + c*OD*OH*OW + d*OH*OW*m_block_size + - h*OW*m_block_size; - - int src_ind = - (n+OFFSET_N)*IC*ID*IH*IW + - (c+OFFSET_C)*ID*IH*IW + - (d+OFFSET_D)*IH*IW*m_block_size + - (h+OFFSET_H)*IW*m_block_size + - OFFSET_W*m_block_size; - - cpu_memcpy(dst_data + dst_ind, src_data + src_ind, m_inner_dim * sizeof(float)); - } - } - } - } - } -#else if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) { parallel_for(ON, [&](int n) { - cpu_memcpy(&dst_data[n*OC], &src_data[(n+OFFSET_N)*IC + OFFSET_C], OC * sizeof(float)); + cpu_memcpy(dst_data + itemSize * n * OC, src_data + itemSize *((n+OFFSET_N)*IC + OFFSET_C), OC * itemSize); }); } else { parallel_for2d(ON, (OC / m_block_size), [&](int n, int c) { @@ -201,7 +174,7 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) { ((d+OFFSET_D)*IH*IW + OFFSET_H*IW + OFFSET_W)*m_block_size; for (int h = 0; h < OH; ++h) { - cpu_memcpy(dst_data + dst_ind, src_data + src_ind, m_inner_dim * sizeof(float)); + cpu_memcpy(dst_data + itemSize * dst_ind, src_data + itemSize * src_ind, m_inner_dim * itemSize); src_ind += IW * m_block_size; dst_ind += OW * m_block_size; @@ -209,7 +182,6 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) { } }); } -#endif } bool MKLDNNCropNode::created() const { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp index 625a5b276541fe..2181c3f47167f6 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp @@ -12,6 +12,7 @@ #include #include #include +#include "utils/bfloat16.hpp" #include #include "ie_parallel.hpp" #include @@ -31,6 +32,15 @@ using namespace Xbyak; #define GET_OFF(field) offsetof(jit_mvn_call_args, field) +// some utility functions +static inline bool isFloatCompatible(Precision prc) { + return Precision::FP32 == prc || Precision::BF16 == prc; +} + +static inline bool isFloatCompatible(memory::data_type type) { + return memory::f32 == type || memory::bf16 == type; +} + // normalize_variance = false : src->mean // normalize_variance = true : src+mean->variance:sqr(x-mean) template @@ -88,13 +98,13 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k load_vector(vmm_val, ptr[reg_src], jcp_.src_dt); if (jcp_.normalize_variance) { - if (jcp_.src_dt 
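// Editor's aside: the Crop change above switches from float* arithmetic to byte addressing so
// that one copy loop serves FP32, BF16 and integer tensors alike. The pattern, reduced to a
// minimal sketch (itemSize is the element size in bytes; all names are illustrative):
#include <cstddef>
#include <cstdint>
#include <cstring>

static void copy_rows(const uint8_t* src, uint8_t* dst,
                      size_t rows, size_t row_elems,
                      size_t src_stride_elems, size_t dst_stride_elems,
                      size_t itemSize) {
    for (size_t r = 0; r < rows; ++r) {
        std::memcpy(dst + itemSize * r * dst_stride_elems,
                    src + itemSize * r * src_stride_elems,
                    itemSize * row_elems);  // length in bytes, independent of precision
    }
}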
!= memory::f32) + if (!isFloatCompatible(jcp_.src_dt)) uni_vcvtdq2ps(vmm_val, vmm_val); uni_vsubps(vmm_val, vmm_val, vmm_mean); uni_vfmadd231ps(vmm_variance, vmm_val, vmm_val); } else { - if (jcp_.src_dt != memory::f32) + if (!isFloatCompatible(jcp_.src_dt)) uni_vpaddd(vmm_sum, vmm_sum, vmm_val); else uni_vaddps(vmm_sum, vmm_sum, vmm_val); @@ -138,7 +148,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k uni_vmovups(ptr[reg_variance], vmm_variance); } else { - if (jcp_.src_dt != memory::f32) + if (!isFloatCompatible(jcp_.src_dt)) uni_vcvtdq2ps(vmm_sum, vmm_sum); if (!jcp_.planar_layout && !jcp_.across_channels) { @@ -199,6 +209,10 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k case memory::u8: uni_vpmovzxbd(vmm_src, op); break; + case memory::bf16: + uni_vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; default: assert(!"unknown dst_dt"); } @@ -348,11 +362,15 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator case memory::u8: uni_vpmovzxbd(vmm_src, op); break; + case memory::bf16: + uni_vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; default: assert(!"unknown dst_dt"); } - if (src_dt != memory::f32) + if (!isFloatCompatible(src_dt)) uni_vcvtdq2ps(vmm_src, vmm_src); } @@ -362,6 +380,9 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator if (dst_dt == memory::f32) { uni_vmovups(op, vmm_dst); + } else if (dst_dt == memory::bf16) { + vcvtneps2bf16(ymm_dst, vmm_dst); + uni_vmovups(op, ymm_dst); } else if (dst_dt == memory::u8) { uni_vcvtps2dq(vmm_dst, vmm_dst); if (isa == cpu::avx512_common) { @@ -413,7 +434,7 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator depthwise_inj_idx++; } else if (post_op.is_quantization()) { bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; - bool do_rounding = do_dequantization || dst_dt == memory::f32 || i != p.len_ - 1; + bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1; int s_idx = vmm_val.getIdx(); quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off); @@ -475,8 +496,17 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() { if (getParentEdgeAt(0)->getDims().ndims() < 4 || getParentEdgeAt(0)->getDims().ndims() > 5 || across_channels != 0 || normalize_variance != 1) { - inputPrecision = Precision::FP32; - outputPrecision = Precision::FP32; + if (!isFloatCompatible(inputPrecision)) { + inputPrecision = Precision::FP32; + } + if (!isFloatCompatible(outputPrecision)) { + outputPrecision = Precision::FP32; + } + } + + if (!mayiuse(avx512_core_bf16)) { + if (outputPrecision == Precision::BF16) + outputPrecision = Precision::FP32; } auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision); @@ -498,39 +528,50 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() { config.inConfs[0].inPlace = -1; config.outConfs[0].inPlace = canBeInplace ? 
0 : -1; - auto pushDesc = [&](memory::format format) { + auto pushDesc = [&](memory::format format, impl_desc_type impl_type) { config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format); config.outConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), outputDataType, format); - supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, format}); + supportedPrimitiveDescriptors.push_back({config, impl_type, format}); }; + impl_desc_type impl_type; + if (mayiuse(cpu::avx512_common)) { + impl_type = impl_desc_type::jit_avx512; + } else if (mayiuse(cpu::avx2)) { + impl_type = impl_desc_type::jit_avx2; + } else if (mayiuse(cpu::sse42)) { + impl_type = impl_desc_type::jit_sse42; + } else { + impl_type = impl_desc_type::ref; + } + if (across_channels == 0 && normalize_variance == 1) { if (getParentEdgeAt(0)->getDims().ndims() == 4) { - pushDesc(memory::nhwc); + pushDesc(memory::nhwc, impl_type); } else if (getParentEdgeAt(0)->getDims().ndims() == 5) { - pushDesc(memory::ndhwc); + pushDesc(memory::ndhwc, impl_type); } } - if (inputPrecision == Precision::FP32 && outputPrecision == Precision::FP32) { - if (getParentEdgeAt(0)->getDims().ndims() == 4) { - if (mayiuse(cpu::avx512_common)) { - pushDesc(memory::nChw16c); - } else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) { - pushDesc(memory::nChw8c); + if (isFloatCompatible(inputPrecision) && isFloatCompatible(outputPrecision)) { + if (impl_desc_type::jit_avx512 == impl_type) { + if (getParentEdgeAt(0)->getDims().ndims() == 4) { + pushDesc(memory::nChw16c, impl_type); + } else if (getParentEdgeAt(0)->getDims().ndims() == 5) { + pushDesc(memory::nCdhw16c, impl_type); } - } else if (getParentEdgeAt(0)->getDims().ndims() == 5) { - if (mayiuse(cpu::avx512_common)) { - pushDesc(memory::nCdhw16c); - } else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) { - pushDesc(memory::nCdhw8c); + } else if (impl_desc_type::jit_avx2 == impl_type || impl_desc_type::jit_sse42 == impl_type) { + if (getParentEdgeAt(0)->getDims().ndims() == 4) { + pushDesc(memory::nChw8c, impl_type); + } else if (getParentEdgeAt(0)->getDims().ndims() == 5) { + pushDesc(memory::nCdhw8c, impl_type); } } if (fusedWith.empty()) { if (canBeInplace) config.inConfs[0].inPlace = 0; - pushDesc(MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims())); + pushDesc(MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()), impl_type); } } } @@ -614,11 +655,32 @@ void MKLDNNMVNNode::execute(mkldnn::stream strm) { Layout layout = getParentEdgeAt(0)->getDesc().getLayout(); - auto src_data = reinterpret_cast(srcMemPtr->GetData()); - auto dst_data = reinterpret_cast(dstMemPtr->GetData()); - if (layout == C || layout == NC || layout == CHW || layout == NCHW || layout == NCDHW) { - mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + if (input_prec == Precision::FP32) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + if (output_prec == Precision::FP32) { + auto dst_data = reinterpret_cast(dstMemPtr->GetData()); + mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else if (output_prec == Precision::BF16) { + auto dst_data = reinterpret_cast(dstMemPtr->GetData()); + mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else { + THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name(); + } + } else if (input_prec == Precision::BF16) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + if (output_prec == Precision::FP32) { + auto dst_data = 
reinterpret_cast(dstMemPtr->GetData()); + mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else if (output_prec == Precision::BF16) { + auto dst_data = reinterpret_cast(dstMemPtr->GetData()); + mvn_pln(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else { + THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name(); + } + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); + } } else { if (output_prec == Precision::U8) { auto dst_data = reinterpret_cast(dstMemPtr->GetData()); @@ -631,6 +693,11 @@ void MKLDNNMVNNode::execute(mkldnn::stream strm) { } else if (input_prec == Precision::FP32) { auto src_data = reinterpret_cast(srcMemPtr->GetData()); mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else if (input_prec == Precision::BF16) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } } else if (output_prec == Precision::I8) { auto dst_data = reinterpret_cast(dstMemPtr->GetData()); @@ -643,6 +710,11 @@ void MKLDNNMVNNode::execute(mkldnn::stream strm) { } else if (input_prec == Precision::FP32) { auto src_data = reinterpret_cast(srcMemPtr->GetData()); mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else if (input_prec == Precision::BF16) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } } else if (output_prec == Precision::FP32) { auto dst_data = reinterpret_cast(dstMemPtr->GetData()); @@ -655,7 +727,31 @@ void MKLDNNMVNNode::execute(mkldnn::stream strm) { } else if (input_prec == Precision::FP32) { auto src_data = reinterpret_cast(srcMemPtr->GetData()); mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else if (input_prec == Precision::BF16) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } + } else if (output_prec == Precision::BF16) { + auto dst_data = reinterpret_cast(dstMemPtr->GetData()); + if (input_prec == Precision::U8) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else if (input_prec == Precision::I8) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else if (input_prec == Precision::FP32) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else if (input_prec == Precision::BF16) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + mvn_blk(src_data, dst_data, getParentEdgeAt(0)->getDesc().getDims()); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); + } + } else { + THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name(); } } } @@ -673,7 +769,8 @@ std::tuple MKLDNNMVNNode::get5dShapes(co return shapes; } -void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVector& dims) { +template +void MKLDNNMVNNode::mvn_pln(const in_data_t* src_data, out_data_t* dst_data, 
const SizeVector& dims) { size_t blk_size = 1; // blk size in vmm if (mayiuse(cpu::avx512_common)) { blk_size = 16; @@ -705,7 +802,7 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe auto arg = jit_mvn_call_args(); arg.src = src_data + cc; arg.sum = static_cast(&mean_internal); - arg.src_stride = static_cast(blk_size * sizeof(float)); + arg.src_stride = static_cast(blk_size * sizeof(in_data_t)); arg.work_amount = static_cast(C2 / blk_size); (*mvn_mean_kernel)(&arg); for (size_t tail = tail_across_channels; tail < C2; tail++) { @@ -737,7 +834,7 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe arg.src = src_data + cc; arg.mean = static_cast(&mean); arg.variance = static_cast(&variance_internal); - arg.src_stride = static_cast(blk_size * sizeof(float)); + arg.src_stride = static_cast(blk_size * sizeof(in_data_t)); arg.work_amount = static_cast(C2 / blk_size); (*mvn_variance_kernel)(&arg); @@ -766,8 +863,8 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe arg.dst = dst_data + cc; arg.mean = static_cast(&mean); arg.variance = static_cast(&variance); - arg.src_stride = static_cast(blk_size * sizeof(float)); - arg.dst_stride = static_cast(blk_size * sizeof(float)); + arg.src_stride = static_cast(blk_size * sizeof(in_data_t)); + arg.dst_stride = static_cast(blk_size * sizeof(out_data_t)); arg.work_amount = static_cast(C2 / blk_size); (*mvn_kernel)(&arg); @@ -792,8 +889,8 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe arg.src = src_data + cc; arg.dst = dst_data + cc; arg.mean = static_cast(&mean); - arg.src_stride = static_cast(blk_size * sizeof(float)); - arg.dst_stride = static_cast(blk_size * sizeof(float)); + arg.src_stride = static_cast(blk_size * sizeof(in_data_t)); + arg.dst_stride = static_cast(blk_size * sizeof(out_data_t)); arg.work_amount = static_cast(C2 / blk_size); (*mvn_kernel)(&arg); @@ -823,8 +920,8 @@ void MKLDNNMVNNode::mvn_pln(const float* src_data, float* dst_data, const SizeVe arg.src = src_data + cc; arg.dst = dst_data + cc; arg.sum = static_cast(&mean); - arg.src_stride = static_cast(blk_size * sizeof(float)); - arg.dst_stride = static_cast(blk_size * sizeof(float)); + arg.src_stride = static_cast(blk_size * sizeof(in_data_t)); + arg.dst_stride = static_cast(blk_size * sizeof(out_data_t)); arg.work_amount = static_cast(C2 / blk_size); (*mvn_mean_kernel)(&arg); @@ -1227,7 +1324,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con } else if (post_op.is_quantization()) { bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; - bool do_rounding = do_dequantization || output_prec == Precision::FP32 || + bool do_rounding = do_dequantization || isFloatCompatible(output_prec) || i != p.len_ - 1; auto quant = post_op.quantization; @@ -1251,7 +1348,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con } } } - if (output_prec == Precision::FP32) { + if (isFloatCompatible(output_prec)) { dst_data[ch + w * src_stride] = dst_value; } else if (output_prec == Precision::U8) { dst_data[ch + w * src_stride] = (dst_value >= 0) ? 
lroundf(dst_value) : 0; @@ -1300,7 +1397,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con size_t ch = cd + h * C0; for (size_t w = 0lu; w < W; w++) { float dst_value = src_data[ch + w * src_stride] - mean_buffer_ptr[c]; - if (output_prec == Precision::FP32) { + if (isFloatCompatible(output_prec)) { dst_data[ch + w * src_stride] = dst_value; } else if (output_prec == Precision::U8) { dst_data[ch + w * src_stride] = (dst_value >= 0) ? lroundf(dst_value) : 0; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.h index 3919b94817b8ef..97203d9b22e513 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.h @@ -81,7 +81,8 @@ class MKLDNNMVNNode : public MKLDNNNode { } private: - void mvn_pln(const float* src_data, float* dst_data, const InferenceEngine::SizeVector& dims); + template + void mvn_pln(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims); template void mvn_blk(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp index 114579e3fb80e7..6ce21a3911723c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_normalize_node.cpp @@ -5,6 +5,7 @@ #include "mkldnn_quantize_node.h" #include "mkldnn_eltwise_node.h" #include +#include "utils/bfloat16.hpp" #include #include "ie_parallel.hpp" #include "jit_uni_eltwise.hpp" @@ -24,6 +25,10 @@ using namespace Xbyak; #define GET_OFF(field) offsetof(jit_normalize_call_args, field) +static inline bool isFloatCompatible(memory::data_type type) { + return memory::f32 == type || memory::bf16 == type; +} + template struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_normalize_modulo_kernel_f32) @@ -119,6 +124,10 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker case memory::s32: uni_vmovups(vmm_src, op); break; + case memory::bf16: + uni_vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; case memory::s8: uni_vpmovsxbd(vmm_src, op); break; @@ -128,8 +137,7 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker default: assert(!"unknown dst_dt"); } - - if (src_dt != memory::f32) + if (!isFloatCompatible(src_dt)) uni_vcvtdq2ps(vmm_src, vmm_src); } }; @@ -239,7 +247,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji Xbyak::Label tail_loop_label; Xbyak::Label tail_loop_end_label; - int step = vlen / sizeof(float); + int step = jcp_.src_dt == memory::bf16 ? 16 : (vlen / sizeof(float)); L(main_loop_label); { cmp(reg_work_amount, step); @@ -322,7 +330,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji Xbyak::Label tail_loop_label; Xbyak::Label tail_loop_end_label; - int step = vlen / sizeof(float); + int step = jcp_.src_dt == memory::bf16 ? 
16 : (vlen / sizeof(float)); L(main_loop_label); { cmp(reg_work_amount, step); @@ -520,6 +528,10 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji case memory::s32: uni_vmovups(vmm_src, op); break; + case memory::bf16: + uni_vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; case memory::s8: uni_vpmovsxbd(vmm_src, op); break; @@ -529,8 +541,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji default: assert(!"unknown dst_dt"); } - - if (src_dt != memory::f32) + if (!isFloatCompatible(src_dt)) uni_vcvtdq2ps(vmm_src, vmm_src); } @@ -540,6 +551,10 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji case memory::s32: movss(xmm_src, op); break; + case memory::bf16: + pinsrw(xmm_src, op, 0x0); + uni_vpslld(xmm_src, xmm_src, 16); + break; case memory::s8: movsx(reg_tmp_32, op); movq(xmm_src, reg_tmp_64); @@ -552,7 +567,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji assert(!"unknown dst_dt"); } - if (src_dt != data_type::f32) { + if (!isFloatCompatible(src_dt)) { uni_vcvtdq2ps(xmm_src, xmm_src); } } @@ -563,6 +578,9 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji if (dst_dt == memory::f32) { uni_vmovups(op, vmm_dst); + } else if (dst_dt == memory::bf16) { + vcvtneps2bf16(ymm_dst, vmm_dst); + vmovdqu16(op, ymm_dst); } else if (dst_dt == memory::u8) { uni_vcvtps2dq(vmm_dst, vmm_dst); if (isa == cpu::avx512_common) { @@ -596,7 +614,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji } inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) { - if (dst_dt != data_type::f32) { + if (!isFloatCompatible(dst_dt)) { uni_vcvtps2dq(xmm_dst, xmm_dst); } @@ -605,6 +623,10 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji case memory::s32: movss(op, xmm_dst); break; + case memory::bf16: + uni_vpsrld(xmm_dst, xmm_dst, 16); + pextrw(op, xmm_dst, 0x0); + break; case memory::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); @@ -653,7 +675,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji || quantization_injectors[quantization_inj_idx] == nullptr) assert(!"Invalid quantization injectors."); bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; - bool do_rounding = do_dequantization || dst_dt == memory::f32 || i != p.len_ - 1; + bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1; int s_idx = vmm_val.getIdx(); @@ -747,9 +769,7 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() { setPostOps(attr, true); Precision inputPrecision = getCnnLayer()->insData[0].lock()->getPrecision(); - inputPrecision = inputPrecision == Precision::BF16 ? Precision(Precision::FP32) : inputPrecision; Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision(); - outputPrecision = outputPrecision == Precision::BF16 ? 
Precision(Precision::FP32) : outputPrecision; if (!fusedWith.empty()) { auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer(); @@ -758,6 +778,13 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() { } } + if (inputPrecision == Precision::BF16 || outputPrecision == Precision::BF16) { + if (!mayiuse(avx512_core_bf16)) + inputPrecision = outputPrecision = Precision::FP32; + else + inputPrecision = outputPrecision = Precision::BF16; + } + auto isOneOf = [&](InferenceEngine::Precision precision, std::vector precisions) { for (auto p : precisions) { if (precision == p) { @@ -766,10 +793,10 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() { } return false; }; - if (!isOneOf(inputPrecision, {Precision::FP32, Precision::I8, Precision::U8})) { + if (!isOneOf(inputPrecision, {Precision::FP32, Precision::BF16, Precision::I8, Precision::U8})) { THROW_IE_EXCEPTION << "Unsupported input precision. " << getName(); } - if (!isOneOf(outputPrecision, {Precision::FP32, Precision::I8, Precision::U8})) { + if (!isOneOf(outputPrecision, {Precision::FP32, Precision::BF16, Precision::I8, Precision::U8})) { THROW_IE_EXCEPTION << "Unsupported output precision. " << getName(); } if (!isOneOf(weights_prec, {Precision::FP32, Precision::BF16})) { @@ -918,6 +945,8 @@ void MKLDNNNormalizeNode::execute(mkldnn::stream strm) { } else if (input_prec == Precision::FP32) { auto src_data = reinterpret_cast(src_ptr); normalize_function(src_data, dst_data, dims); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } } else if (output_prec == Precision::I8) { auto dst_data = reinterpret_cast(dst_ptr); @@ -930,6 +959,8 @@ void MKLDNNNormalizeNode::execute(mkldnn::stream strm) { } else if (input_prec == Precision::FP32) { auto src_data = reinterpret_cast(src_ptr); normalize_function(src_data, dst_data, dims); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } } else if (output_prec == Precision::FP32) { auto dst_data = reinterpret_cast(dst_ptr); @@ -942,7 +973,15 @@ void MKLDNNNormalizeNode::execute(mkldnn::stream strm) { } else if (input_prec == Precision::FP32) { auto src_data = reinterpret_cast(src_ptr); normalize_function(src_data, dst_data, dims); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } + } else if (output_prec == Precision::BF16) { + auto dst_data = reinterpret_cast(dst_ptr); + auto src_data = reinterpret_cast(src_ptr); + normalize_function(src_data, dst_data, dims); + } else { + THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name(); } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp index 6c597e309e06ed..30cc84ae586a5b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp @@ -91,13 +91,7 @@ void MKLDNNPoolingNode::getSupportedDescriptors() { MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc}; MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? 
memory::format::ndhwc : memory::format::nhwc}; createDescriptor({ in_candidate }, { out_candidate }); - } else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && (inputDataType == memory::bf16 || outputDataType == memory::bf16)) { - MKLDNNMemoryDesc in_candidate{ parentDims, memory::bf16, parentDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c}; - MKLDNNMemoryDesc out_candidate{ childDims, memory::bf16, parentDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c}; - createDescriptor({ in_candidate }, { out_candidate }); } else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && parentDims[1] == 1) { - inputDataType = memory::f32; - outputDataType = memory::f32; // WA. We should force planar layout since it provides better performance MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw}; MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw}; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp index d43b347e7e69d2..81c11c330b955e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reduce_node.cpp @@ -12,6 +12,7 @@ #include #include #include +#include "utils/bfloat16.hpp" #include "ie_parallel.hpp" #include @@ -64,6 +65,11 @@ using namespace Xbyak; #define GET_PTR_NCD_BASE_PTR_N_BLK const uint8_t *in_ptr_ncd = in_ptr_n + src_data_size * (icb * ID + id) * IH * IW * blk_size; \ uint8_t *out_ptr_ncd = out_ptr_n + dst_data_size * (ocb * OD + od) * OH * OW * blk_size; +// some utility functions +static inline bool isFloatCompatible(memory::data_type type) { + return memory::f32 == type || memory::bf16 == type; +} + template struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_kernel_f32) @@ -278,13 +284,13 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene uni_vpxor(vmm_dst, vmm_dst, vmm_dst); break; case Reduce::Max: - if (jcp_.dst_dt == memory::f32) + if (isFloatCompatible(jcp_.dst_dt)) uni_vmovups(vmm_dst, table_val(2)); else uni_vmovups(vmm_dst, table_val(4)); break; case Reduce::Min: - if (jcp_.dst_dt == memory::f32) + if (isFloatCompatible(jcp_.dst_dt)) uni_vmovups(vmm_dst, table_val(3)); else uni_vmovups(vmm_dst, table_val(5)); @@ -540,6 +546,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene case memory::s32: uni_vmovups(vmm_src, op); break; + case memory::bf16: + uni_vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; case memory::s8: uni_vpmovsxbd(vmm_src, op); break; @@ -550,7 +560,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene assert(!"unknown src_dt"); } - if (src_dt != memory::f32) + if (!isFloatCompatible(src_dt)) uni_vcvtdq2ps(vmm_src, vmm_src); } @@ -560,6 +570,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene case memory::s32: movss(xmm_src, op); break; + case memory::bf16: + pinsrw(xmm_src, op, 0x0); + uni_vpslld(xmm_src, xmm_src, 16); + break; case memory::s8: movsx(reg_tmp_32, op); movq(xmm_src, reg_tmp_64); @@ -572,7 +586,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene assert(!"unknown src_dt"); } - if (src_dt != data_type::f32) { 
+ if (!isFloatCompatible(src_dt)) { uni_vcvtdq2ps(xmm_src, xmm_src); } } @@ -581,7 +595,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene Xmm xmm_dst = Xmm(vmm_dst.getIdx()); Ymm ymm_dst = Ymm(vmm_dst.getIdx()); - if (dst_dt != memory::f32) { + if (!isFloatCompatible(dst_dt)) { uni_vcvtps2dq(vmm_dst, vmm_dst); } @@ -590,6 +604,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene case memory::s32: uni_vmovups(op, vmm_dst); break; + case memory::bf16: + vcvtneps2bf16(ymm_dst, vmm_dst); + uni_vmovups(op, ymm_dst); + break; case memory::s8: if (isa == avx512_common) { vmaxps(vmm_dst, vmm_zero, vmm_dst); @@ -625,7 +643,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene } inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) { - if (dst_dt != memory::f32) { + if (!isFloatCompatible(dst_dt)) { uni_vcvtps2dq(xmm_dst, xmm_dst); } @@ -634,6 +652,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene case memory::s32: movss(op, xmm_dst); break; + case memory::bf16: + uni_vpsrld(xmm_dst, xmm_dst, 16); + pextrw(op, xmm_dst, 0x0); + break; case memory::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); @@ -680,9 +702,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... switch (dst_dt) { case memory::f32: - movss(xmm_aux3, ptr[reg_dst]); + case memory::bf16: + load_scalar(xmm_aux3, ptr[reg_dst], dst_dt); horiz_ps(xmm_dst, xmm_aux3); - movss(ptr[reg_dst], xmm_dst); + store_scalar(ptr[reg_dst], xmm_dst, dst_dt); break; case memory::s32: movss(xmm_aux3, ptr[reg_dst]); @@ -981,6 +1004,10 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi case memory::s32: uni_vmovups(vmm_src, op); break; + case memory::bf16: + uni_vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; case memory::s8: uni_vpmovsxbd(vmm_src, op); break; @@ -991,7 +1018,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi assert(!"unknown src_dt"); } - if (src_dt != memory::f32) + if (!isFloatCompatible(src_dt)) uni_vcvtdq2ps(vmm_src, vmm_src); } @@ -1001,6 +1028,10 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi case memory::s32: movss(xmm_src, op); break; + case memory::bf16: + pinsrw(xmm_src, op, 0x0); + uni_vpslld(xmm_src, xmm_src, 16); + break; case memory::s8: movsx(reg_tmp_32, op); movq(xmm_src, reg_tmp_64); @@ -1013,7 +1044,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi assert(!"unknown src_dt"); } - if (src_dt != data_type::f32) { + if (!isFloatCompatible(src_dt)) { uni_vcvtdq2ps(xmm_src, xmm_src); } } @@ -1022,7 +1053,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi Xmm xmm_dst = Xmm(vmm_dst.getIdx()); Ymm ymm_dst = Ymm(vmm_dst.getIdx()); - if (dst_dt != memory::f32) { + if (!isFloatCompatible(dst_dt)) { uni_vcvtps2dq(vmm_dst, vmm_dst); } @@ -1031,6 +1062,10 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi case memory::s32: uni_vmovups(op, vmm_dst); break; + case memory::bf16: + vcvtneps2bf16(ymm_dst, vmm_dst); + uni_vmovups(op, ymm_dst); + break; case memory::s8: if (isa == avx512_common) { vmaxps(vmm_dst, vmm_zero, vmm_dst); @@ -1066,7 +1101,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, 
publi } inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) { - if (dst_dt != memory::f32) { + if (!isFloatCompatible(dst_dt)) { uni_vcvtps2dq(xmm_dst, xmm_dst); } @@ -1075,6 +1110,10 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi case memory::s32: movss(op, xmm_dst); break; + case memory::bf16: + uni_vpsrld(xmm_dst, xmm_dst, 16); + pextrw(op, xmm_dst, 0x0); + break; case memory::s8: uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); @@ -1123,6 +1162,10 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi case memory::f32: movss(ptr[reg_dst], xmm_dst); break; + case memory::bf16: + uni_vpsrld(xmm_dst, xmm_dst, 16); + pextrw(ptr[reg_dst], xmm_dst, 0x0); + break; case memory::s32: uni_vcvtps2dq(xmm_dst, xmm_dst); movss(ptr[reg_dst], xmm_dst); @@ -1173,9 +1216,10 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),... switch (dst_dt) { case memory::f32: - movss(xmm_aux3, ptr[reg_dst]); + case memory::bf16: + load_scalar(xmm_aux3, ptr[reg_dst], dst_dt); horiz_ps(xmm_dst, xmm_aux3); - movss(ptr[reg_dst], xmm_dst); + store_scalar(ptr[reg_dst], xmm_dst, dst_dt); break; case memory::s32: movss(xmm_aux3, ptr[reg_dst]); @@ -1292,11 +1336,33 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; + static const Precision supportedPrecisions[] = { + Precision::FP32, + Precision::BF16, + Precision::I32, + Precision::I8, + Precision::U8 + }; + Precision inputPrecision = getCnnLayer()->insData[REDUCE_DATA].lock()->getPrecision(); Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision(); - if (inputPrecision == Precision::BF16) inputPrecision = Precision::FP32; - if (outputPrecision == Precision::BF16) outputPrecision = Precision::FP32; + jit_mode = (mayiuse(cpu::sse42)) && getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= 5 && + std::find(std::begin(supportedPrecisions), std::end(supportedPrecisions), inputPrecision) != std::end(supportedPrecisions) && + std::find(std::begin(supportedPrecisions), std::end(supportedPrecisions), outputPrecision) != std::end(supportedPrecisions); + + if (jit_mode) { + // Since in jit mode we use the output memory as an intermediate accumulator for certain reduce modes, we can't use BF16 output precision due to + // the possible accuracy loss. Therefore, for such modes, we will change the output precision to FP32.
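The accuracy concern in the comment above follows from bfloat16 keeping only 8 bits of mantissa: once a running sum grows, each new addend can fall below one ulp of the accumulator and is rounded away. A minimal standalone C++ sketch of that effect (not part of the patch; it emulates bf16 by truncating the low 16 bits of an FP32 value, whereas the vector stores in this patch use vcvtneps2bf16, i.e. round-to-nearest-even, but the stalling behaviour is the same):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Emulate a bf16 round trip by keeping only the sign, exponent and top 7 mantissa bits.
    static float to_bf16(float x) {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        bits &= 0xFFFF0000u;
        std::memcpy(&x, &bits, sizeof(bits));
        return x;
    }

    int main() {
        float acc_fp32 = 0.f;
        float acc_bf16 = 0.f;
        for (int i = 0; i < 1000; ++i) {
            acc_fp32 += 1.f;
            acc_bf16 = to_bf16(acc_bf16 + 1.f);  // accumulator re-rounded to bf16 every step
        }
        // Prints 1000.0 for FP32 and 256.0 for the bf16 accumulator: past 256, adding 1.0f
        // no longer changes a value whose ulp is already 2.0 at bf16 precision.
        std::printf("fp32: %.1f  bf16: %.1f\n", acc_fp32, acc_bf16);
        return 0;
    }

This is why sum-like reduce modes keep an FP32 destination even on BF16-capable hardware, while And/Or/Max/Min, which never accumulate, can write BF16 directly.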
+ if (Precision::BF16 == outputPrecision) { + if (!mayiuse(avx512_core_bf16)) { + outputPrecision = Precision::FP32; + } else if (reduceMode != Reduce::And && reduceMode != Reduce::Or && + reduceMode != Reduce::Max && reduceMode != Reduce::Min) { + outputPrecision = Precision::FP32; + } + } + } auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision); auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision); @@ -1317,37 +1383,42 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() { config.inConfs[REDUCE_INDEXES].inPlace = -1; config.outConfs[0].inPlace = -1; - auto pushDesc = [&](memory::format inFormat, memory::format outFormat, memory::data_type inDataType, memory::data_type outDataType) { + auto pushDesc = [&](memory::format inFormat, memory::format outFormat, memory::data_type inDataType, + memory::data_type outDataType, impl_desc_type impl_type) { config.inConfs[REDUCE_DATA].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_DATA)->getDims(), inDataType, inFormat); config.inConfs[REDUCE_INDEXES].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_INDEXES)->getDims(), memory::s32, memory::x); config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outDataType, outFormat); - supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, outFormat}); + supportedPrimitiveDescriptors.push_back({config, impl_type, outFormat}); }; - jit_mode = (mayiuse(cpu::sse42)) && getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= 5 && - (inputPrecision == Precision::FP32 || inputPrecision == Precision::I32 || inputPrecision == Precision::U8 || inputPrecision == Precision::I8) && - (outputPrecision == Precision::FP32 || outputPrecision == Precision::I32 || outputPrecision == Precision::U8 || outputPrecision == Precision::I8); if (jit_mode) { + impl_desc_type impl_type = impl_desc_type::jit_sse42; + if (mayiuse(cpu::avx512_common)) { + impl_type = impl_desc_type::jit_avx512; + } else if (mayiuse(cpu::avx2)) { + impl_type = impl_desc_type::jit_avx2; + } + pushDesc(MKLDNNMemory::GetPlainFormat(memory::dims(getParentEdgeAt(REDUCE_DATA)->getDims().ndims())), - MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), inputDataType, outputDataType); + MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), inputDataType, outputDataType, impl_type); if (keep_dims) { if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 4 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) { if (mayiuse(cpu::avx512_common)) { - pushDesc(memory::nChw16c, memory::nChw16c, inputDataType, outputDataType); + pushDesc(memory::nChw16c, memory::nChw16c, inputDataType, outputDataType, impl_type); } else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) { - pushDesc(memory::nChw8c, memory::nChw8c, inputDataType, outputDataType); + pushDesc(memory::nChw8c, memory::nChw8c, inputDataType, outputDataType, impl_type); } } else if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 5 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) { if (mayiuse(cpu::avx512_common)) { - pushDesc(memory::nCdhw16c, memory::nCdhw16c, inputDataType, outputDataType); + pushDesc(memory::nCdhw16c, memory::nCdhw16c, inputDataType, outputDataType, impl_type); } else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) { - pushDesc(memory::nCdhw8c, memory::nCdhw8c, inputDataType, outputDataType); + pushDesc(memory::nCdhw8c, memory::nCdhw8c, inputDataType, outputDataType, impl_type); } } } } else { 
pushDesc(MKLDNNMemory::GetPlainFormat(memory::dims(getParentEdgeAt(REDUCE_DATA)->getDims().ndims())), - MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), memory::f32, memory::f32); + MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), memory::f32, memory::f32, impl_desc_type::ref); } } @@ -1714,6 +1785,9 @@ inline void MKLDNNReduceNode::init_dst_data(uint8_t *out_ptr, size_t dst_size) { } else if (output_prec == Precision::I32) { auto out_p = reinterpret_cast(out_ptr); parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); + } else if (output_prec == Precision::BF16) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); } else if (output_prec == Precision::U8) { auto out_p = reinterpret_cast(out_ptr); parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = static_cast(1); }); @@ -1729,6 +1803,9 @@ inline void MKLDNNReduceNode::init_dst_data(uint8_t *out_ptr, size_t dst_size) { } else if (output_prec == Precision::I32) { auto out_p = reinterpret_cast(out_ptr); parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::min(); }); + } else if (output_prec == Precision::BF16) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::min(); }); } else if (output_prec == Precision::U8) { auto out_p = reinterpret_cast(out_ptr); parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::min(); }); @@ -1744,6 +1821,9 @@ inline void MKLDNNReduceNode::init_dst_data(uint8_t *out_ptr, size_t dst_size) { } else if (output_prec == Precision::I32) { auto out_p = reinterpret_cast(out_ptr); parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); + } else if (output_prec == Precision::BF16) { + auto out_p = reinterpret_cast(out_ptr); + parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); } else if (output_prec == Precision::U8) { auto out_p = reinterpret_cast(out_ptr); parallel_for(dst_size / dst_data_size, [&](size_t i) { out_p[i] = std::numeric_limits::max(); }); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.cpp index 035b4526548783..7ae7ce809c87d8 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.cpp @@ -12,6 +12,7 @@ #include #include #include +#include "utils/bfloat16.hpp" #include #include "ie_parallel.hpp" #include @@ -33,6 +34,14 @@ using namespace Xbyak; #define GET_OFF(field) offsetof(jit_resample_call_args, field) +static inline bool isFloatCompatible(Precision prc) { + return Precision::FP32 == prc || Precision::BF16 == prc; +} + +static inline bool isFloatCompatible(memory::data_type type) { + return memory::f32 == type || memory::bf16 == type; +} + template struct jit_uni_resample_nearest_kernel_f32 : public jit_uni_resample_nearest_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_resample_nearest_kernel_f32) @@ -73,7 +82,7 @@ struct jit_uni_resample_nearest_kernel_f32 : public jit_uni_resample_nearest_ker if (isa == cpu::avx512_common) uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - int blk_size = vlen / sizeof(float); + int blk_size = jcp_.src_dt == memory::bf16 ? 
16 : (vlen / sizeof(float)); if (isa == cpu::sse42) blk_size *= 2; @@ -197,11 +206,15 @@ struct jit_uni_resample_nearest_kernel_f32 : public jit_uni_resample_nearest_ker case memory::u8: uni_vpmovzxbd(vmm_src, op); break; + case memory::bf16: + uni_vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; default: assert(!"unknown dst_dt"); } - if (src_dt != memory::f32) + if (!isFloatCompatible(src_dt)) uni_vcvtdq2ps(vmm_src, vmm_src); } @@ -211,6 +224,9 @@ struct jit_uni_resample_nearest_kernel_f32 : public jit_uni_resample_nearest_ker if (dst_dt == memory::f32) { uni_vmovups(op, vmm_dst); + } else if (dst_dt == memory::bf16) { + vcvtneps2bf16(ymm_dst, vmm_dst); + vmovdqu16(op, ymm_dst); } else if (dst_dt == memory::u8) { uni_vcvtps2dq(vmm_dst, vmm_dst); if (isa == cpu::avx512_common) { @@ -262,8 +278,7 @@ struct jit_uni_resample_nearest_kernel_f32 : public jit_uni_resample_nearest_ker depthwise_inj_idx++; } else if (post_op.is_quantization()) { bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; - bool do_rounding = do_dequantization || dst_dt == memory::f32 || i != p.len_ - 1; - + bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1; int s_idx = vmm_val.getIdx(); quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off); @@ -320,12 +335,11 @@ void MKLDNNResampleNode::initSupportedPrimitiveDescriptors() { } } - if (inputPrecision == Precision::BF16) { - inputPrecision = Precision::FP32; - } - - if (outputPrecision == Precision::BF16) { - outputPrecision = Precision::FP32; + if (inputPrecision == Precision::BF16 || outputPrecision == Precision::BF16) { + if (!mayiuse(avx512_core_bf16)) + inputPrecision = outputPrecision = Precision::FP32; + else + inputPrecision = outputPrecision = Precision::BF16; } auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision); @@ -358,7 +372,7 @@ void MKLDNNResampleNode::initSupportedPrimitiveDescriptors() { pushDesc(memory::ndhwc); } - if (inputPrecision == Precision::FP32 && outputPrecision == Precision::FP32) { + if (isFloatCompatible(inputPrecision) && isFloatCompatible(outputPrecision)) { if (getParentEdgeAt(0)->getDims().ndims() == 4) { if (mayiuse(cpu::avx512_common)) { pushDesc(memory::nChw16c); @@ -456,9 +470,6 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) { Layout layout = getParentEdgeAt(0)->getDesc().getLayout(); - const auto src_data = reinterpret_cast(srcMemPtr->GetData()); - auto dst_data = reinterpret_cast(dstMemPtr->GetData()); - SizeVector src_dim = getParentEdgeAt(0)->getDesc().getDims(); SizeVector dst_dim = getChildEdgeAt(0)->getDesc().getDims(); @@ -479,7 +490,17 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) { if (type == "caffe.ResampleParameter.NEAREST") { if (layout == NCHW || layout == NCDHW) { - NearestNeighbor_PLN(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW); + if (output_prec == Precision::FP32) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + auto dst_data = reinterpret_cast(dstMemPtr->GetData()); + NearestNeighbor_PLN(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW); + } else if (output_prec == Precision::BF16) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + auto dst_data = reinterpret_cast(dstMemPtr->GetData()); + NearestNeighbor_PLN(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW); + } else { + THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name(); + } } else { if (output_prec == 
Precision::U8) { auto dst_data = reinterpret_cast(dstMemPtr->GetData()); @@ -492,6 +513,8 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) { } else if (input_prec == Precision::FP32) { auto src_data = reinterpret_cast(srcMemPtr->GetData()); NearestNeighbor_BLK(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } } else if (output_prec == Precision::I8) { auto dst_data = reinterpret_cast(dstMemPtr->GetData()); @@ -504,6 +527,8 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) { } else if (input_prec == Precision::FP32) { auto src_data = reinterpret_cast(srcMemPtr->GetData()); NearestNeighbor_BLK(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } } else if (output_prec == Precision::FP32) { auto dst_data = reinterpret_cast(dstMemPtr->GetData()); @@ -516,7 +541,15 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) { } else if (input_prec == Precision::FP32) { auto src_data = reinterpret_cast(srcMemPtr->GetData()); NearestNeighbor_BLK(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } + } else if (output_prec == Precision::BF16) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + auto dst_data = reinterpret_cast(dstMemPtr->GetData()); + NearestNeighbor_BLK(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW); + } else { + THROW_IE_EXCEPTION << "Unsupported output precision: " << output_prec.name(); } } } else if (type == "caffe.ResampleParameter.LINEAR") { @@ -535,12 +568,22 @@ void MKLDNNResampleNode::execute(mkldnn::stream strm) { auto src_data = reinterpret_cast(srcMemPtr->GetData()); auto dst_data = reinterpret_cast(dstMemPtr->GetData()); LinearInterpolation(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, isDownsample && antialias); + } else if (input_prec == Precision::BF16) { + auto src_data = reinterpret_cast(srcMemPtr->GetData()); + auto dst_data = reinterpret_cast(dstMemPtr->GetData()); + LinearInterpolation(src_data, dst_data, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, + isDownsample && antialias); + } else { + THROW_IE_EXCEPTION << "Unsupported input precision: " << input_prec.name(); } + } else { + THROW_IE_EXCEPTION << "Unsupported resample parameter type: " << type; } } // f32 and no fused, f32->input is f32, no fuse->output is f32 -void MKLDNNResampleNode::NearestNeighbor_PLN(const float *in_ptr_, float *out_ptr_, int B, int C, int ID, int IH, int IW, +template +void MKLDNNResampleNode::NearestNeighbor_PLN(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW, float fx, float fy, float fz, int OD, int OH, int OW) { std::vector index_buffer(OD * OH * OW); for (int oz = 0; oz < OD; oz++) { @@ -560,8 +603,8 @@ void MKLDNNResampleNode::NearestNeighbor_PLN(const float *in_ptr_, float *out_pt } if (resample_nearest_kernel) { parallel_for2d(B, C, [&](size_t b, size_t c) { - const float *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c; - float *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c; + const in_data_t *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c; + out_data_t *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c; // for OW*OH*OD auto arg = jit_resample_call_args(); @@ -580,8 +623,8 @@ void
MKLDNNResampleNode::NearestNeighbor_PLN(const float *in_ptr_, float *out_pt }); } else { parallel_for2d(B, C, [&](size_t b, size_t c) { - const float *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c; - float *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c; + const in_data_t *in_ptr = in_ptr_ + IW * IH * ID * C * b + IW * IH * ID * c; + out_data_t *out_ptr = out_ptr_ + OW * OH * OD * C * b + OW * OH * OD * c; for (int i_dst = 0; i_dst < OW * OH * OD; i_dst++) { out_ptr[i_dst] = in_ptr[index_buffer[i_dst]]; @@ -646,7 +689,7 @@ void MKLDNNResampleNode::NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_ for (int c = tail; c < C; c++) { float dst_value = static_cast(in_ptr_dhw[c]); apply_post_ops_scalar(dst_value, c); - if (output_prec == Precision::FP32) { + if (isFloatCompatible(output_prec)) { out_ptr_dhw[c] = dst_value; } else if (output_prec == Precision::U8) { out_ptr_dhw[c] = (dst_value >= 0) ? lroundf(dst_value) : 0; @@ -671,7 +714,7 @@ void MKLDNNResampleNode::NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_ for (int c = 0; c < C; c++) { float dst_value = static_cast(in_ptr_dhw[c]); apply_post_ops_scalar(dst_value, c); - if (output_prec == Precision::FP32) { + if (isFloatCompatible(output_prec)) { out_ptr_dhw[c] = dst_value; } else if (output_prec == Precision::U8) { out_ptr_dhw[c] = (dst_value >= 0) ? lroundf(dst_value) : 0; @@ -723,7 +766,7 @@ void MKLDNNResampleNode::NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_ for (int blk = 0; blk < blk_size; blk++) { float dst_value = static_cast(in_ptr_cbdhw[blk]); apply_post_ops_scalar(dst_value, cb * blk_size + blk); - if (output_prec == Precision::FP32) { + if (isFloatCompatible(output_prec)) { out_ptr_cbdhw[blk] = dst_value; } else if (output_prec == Precision::U8) { out_ptr_cbdhw[blk] = (dst_value >= 0) ? lroundf(dst_value) : 0; @@ -749,8 +792,8 @@ void MKLDNNResampleNode::LinearInterpolation(const in_data_t *in_ptr_, out_data_ float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias) { if (IW == OW && IH == OH && ID == OD) { size_t size = B * C * ID * IH * IW; - if (input_prec == Precision::FP32) { - size *= sizeof(float); + if (isFloatCompatible(input_prec)) { + size *= sizeof(in_data_t); } cpu_memcpy(out_ptr_, in_ptr_, size); return; @@ -816,7 +859,7 @@ void MKLDNNResampleNode::LinearInterpolation(const in_data_t *in_ptr_, out_data_ out_ptr_ncdh[ox] = 0; } else { float dst_value = sum / wsum; - if (output_prec == Precision::FP32) { + if (isFloatCompatible(output_prec)) { out_ptr_ncdh[ox] = dst_value; } else if (output_prec == Precision::U8) { out_ptr_ncdh[ox] = (dst_value >= 0) ? 
lroundf(dst_value) : 0; @@ -846,7 +889,7 @@ inline void MKLDNNResampleNode::apply_post_ops_scalar(float &dst_value, int inde } else if (post_op.is_quantization()) { bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; - bool do_rounding = do_dequantization || output_prec == Precision::FP32 || + bool do_rounding = do_dequantization || isFloatCompatible(output_prec) || i != p.len_ - 1; auto quant = post_op.quantization; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.h index 15603f158ab371..47137a0dfefee4 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_resample_node.h @@ -78,7 +78,8 @@ class MKLDNNResampleNode : public MKLDNNNode { } private: - void NearestNeighbor_PLN(const float *in_ptr_, float *out_ptr_, int B, int C, int ID, int IH, int IW, + template + void NearestNeighbor_PLN(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW, float fx, float fy, float fz, int OD, int OH, int OW); template void NearestNeighbor_BLK(const in_data_t *in_ptr_, out_data_t *out_ptr_, int B, int C, int ID, int IH, int IW, diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp index fe34f812c623ed..b83159f28657eb 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_scatter_update_node.cpp @@ -14,21 +14,11 @@ #include #include "ie_parallel.hpp" #include - -#include "jit_generator.hpp" -#include "jit_uni_eltwise.hpp" -#include "jit_uni_depthwise.hpp" -#include "jit_uni_quantization.hpp" #include "common/cpu_memcpy.h" using namespace mkldnn; using namespace MKLDNNPlugin; using namespace InferenceEngine; -using namespace mkldnn::impl; -using namespace mkldnn::impl::cpu; -using namespace mkldnn::impl::utils; -using namespace Xbyak; - MKLDNNScatterUpdateNode::MKLDNNScatterUpdateNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(layer, eng, cache), dataSize(0lu), indicesSize(0lu), axisSize(0lu), diff --git a/inference-engine/src/mkldnn_plugin/nodes/one_hot.cpp b/inference-engine/src/mkldnn_plugin/nodes/one_hot.cpp index 56d43ea7872a3c..e470a48ce7f41e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/one_hot.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/one_hot.cpp @@ -36,6 +36,8 @@ class OneHotImpl: public ExtLayerBase { // check a precision of the input tensor input_precision = layer->insData[0].lock()->getTensorDesc().getPrecision(); + if (input_precision == Precision::BF16) + input_precision = Precision::FP32; if (input_precision != Precision::I32 && input_precision != Precision::FP32) { THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for the input. 
Only I32 and FP32 are supported!"; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/powerfile.cpp b/inference-engine/src/mkldnn_plugin/nodes/powerfile.cpp index c0a0cbf9ab0fe4..6aa503ae018317 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/powerfile.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/powerfile.cpp @@ -27,7 +27,7 @@ class PowerFileImpl: public ExtLayerBase { shift_.push_back(1); shift_.push_back(0); - addConfig(layer, {DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)}); + addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/priorbox.cpp b/inference-engine/src/mkldnn_plugin/nodes/priorbox.cpp index d372c76074670e..74c5d2b5461c10 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/priorbox.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/priorbox.cpp @@ -116,7 +116,7 @@ class PriorBoxImpl: public ExtLayerBase { THROW_IE_EXCEPTION << "Wrong number of variance values. Not less than 1 and more than 4 variance values."; } - addConfig(layer, {{ConfLayout::ANY, true}, {ConfLayout::ANY, true}}, {{ConfLayout::PLN, true}}); + addConfig(layer, {{ConfLayout::ANY, true}, {ConfLayout::ANY, true}}, {{ConfLayout::PLN, true, -1, Precision::FP32}}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/priorbox_clustered.cpp b/inference-engine/src/mkldnn_plugin/nodes/priorbox_clustered.cpp index 954f7d6fed6628..1fcd1df6c395a0 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/priorbox_clustered.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/priorbox_clustered.cpp @@ -32,7 +32,7 @@ class PriorBoxClusteredImpl: public ExtLayerBase { step_w_ = layer->GetParamAsFloat("step_w", 0); offset_ = layer->GetParamAsFloat("offset"); - addConfig(layer, {{ConfLayout::PLN, true}, {ConfLayout::PLN, true}}, {{ConfLayout::PLN, true}}); + addConfig(layer, {{ConfLayout::PLN, true}, {ConfLayout::PLN, true}}, {{ConfLayout::PLN, true, -1, Precision::FP32}}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/priorgridgenerator_onnx.cpp b/inference-engine/src/mkldnn_plugin/nodes/priorgridgenerator_onnx.cpp index c783797ea26413..c98e7475f2364e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/priorgridgenerator_onnx.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/priorgridgenerator_onnx.cpp @@ -45,8 +45,8 @@ class ExperimentalDetectronPriorGridGeneratorImpl: public ExtLayerBase { stride_w_ = layer->GetParamAsFloat("stride_x", 0); addConfig(layer, - {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::ANY), DataConfigurator(ConfLayout::ANY)}, - {DataConfigurator(ConfLayout::PLN)}); + {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::ANY), DataConfigurator(ConfLayout::ANY)}, + {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/proposal.cpp b/inference-engine/src/mkldnn_plugin/nodes/proposal.cpp index 2b1dd1b599c04e..ac9f12ba77b3a5 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/proposal.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/proposal.cpp @@ -119,11 +119,12 @@ class 
ProposalImpl : public ExtLayerBase { store_prob = layer->outData.size() == 2; if (store_prob) { - addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, - {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}); + addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::FP32)}, + {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } else { - addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, - {DataConfigurator(ConfLayout::PLN)}); + addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } } catch (const InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/proposal_onnx.cpp b/inference-engine/src/mkldnn_plugin/nodes/proposal_onnx.cpp index e6370b16a5a173..12e2dd61499ee3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/proposal_onnx.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/proposal_onnx.cpp @@ -296,9 +296,9 @@ class ONNXCustomProposalImpl : public ExtLayerBase { roi_indices_.resize(post_nms_topn_); addConfig(layer, - {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), - DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, - {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}); + {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)}, + {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/psroi.cpp b/inference-engine/src/mkldnn_plugin/nodes/psroi.cpp index f42061338b56e4..7b03df16e83cd2 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/psroi.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/psroi.cpp @@ -212,8 +212,8 @@ class PSROIPoolingImpl: public ExtLayerBase { int part_w = w * part_size_ / pooled_width_; int class_id = c / channels_each_class; float trans_x = no_trans_ ? 0 : - bottom_trans[(((n * num_classes + class_id) * 2) * part_size_ + part_h) - * part_size_ + part_w] * trans_std_; + bottom_trans[(((n * num_classes + class_id) * 2) * part_size_ + part_h) + * part_size_ + part_w] * trans_std_; float trans_y = no_trans_ ? 
0 : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size_ + part_h) * part_size_ + part_w] * trans_std_; diff --git a/inference-engine/src/mkldnn_plugin/nodes/range.cpp b/inference-engine/src/mkldnn_plugin/nodes/range.cpp index 3f6c2ecfb41ce0..693f768c83d643 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/range.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/range.cpp @@ -48,13 +48,12 @@ class RangeImpl: public ExtLayerBase { layer->insData[RANGE_LIMIT].lock()->getTensorDesc().getPrecision() == Precision::FP32 && layer->insData[RANGE_DELTA].lock()->getTensorDesc().getPrecision() == Precision::FP32 && layer->outData[0]->getTensorDesc().getPrecision() == Precision::FP32)) { - THROW_IE_EXCEPTION << layer->name << - " 'Start', 'Limit', 'Delta' input scalars and output tensor should have same precision" << - "and only FP32 and I32 are supported!"; + addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::FP32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32) }); + } else { + addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, + { DataConfigurator(ConfLayout::PLN) }); } - - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/region_yolo.cpp b/inference-engine/src/mkldnn_plugin/nodes/region_yolo.cpp index 9bf522a4c60069..c81a36c97399fa 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/region_yolo.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/region_yolo.cpp @@ -5,13 +5,19 @@ #include "base.hpp" #include "common/defs.h" #include "common/softmax.h" +#include "common/cpu_convert.h" #include #include #include #include +#include +#include "utils/bfloat16.hpp" +#include "common/cpu_memcpy.h" #include "jit_generator.hpp" #include "jit_uni_eltwise.hpp" +using namespace MKLDNNPlugin; +using namespace mkldnn; using namespace mkldnn::impl::cpu; using namespace mkldnn::impl::utils; @@ -22,11 +28,18 @@ namespace Cpu { #define GET_OFF(field) offsetof(jit_args_logistic, field) struct jit_args_logistic { - const float* src; - const float* dst; + const void* src; + void* dst; size_t work_amount; }; +struct jit_logistic_config_params { + InferenceEngine::Precision src_dt; + InferenceEngine::Precision dst_dt; + unsigned src_data_size; + unsigned dst_data_size; +}; + struct jit_uni_logistic_kernel { void (*ker_)(const jit_args_logistic *); @@ -40,7 +53,7 @@ template struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_logistic_kernel_f32) - jit_uni_logistic_kernel_f32() : jit_uni_logistic_kernel(), jit_generator() { + jit_uni_logistic_kernel_f32(jit_logistic_config_params jcp) : jit_uni_logistic_kernel(), jit_generator() { exp_injector.reset(new jit_uni_eltwise_injector_f32(this, alg_kind::eltwise_exp, 0.f, 0.f)); this->preamble(); @@ -59,12 +72,12 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_ cmp(reg_work_amount, step); jl(tail_loop_label, T_NEAR); - uni_vmovups(vmm_src, ptr[reg_src]); + load_vector(vmm_src, ptr[reg_src], jcp.src_dt); compute_kernel(); - uni_vmovups(ptr[reg_dst], vmm_src); + store_vector(ptr[reg_dst], 
vmm_src, jcp.dst_dt); - add(reg_src, step * sizeof(float)); - add(reg_dst, step * sizeof(float)); + add(reg_src, step * jcp.src_data_size); + add(reg_dst, step * jcp.dst_data_size); sub(reg_work_amount, step); jmp(main_loop_label, T_NEAR); @@ -75,12 +88,12 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_ cmp(reg_work_amount, step); jl(exit_label, T_NEAR); - movss(xmm_src, ptr[reg_src]); + load_scalar(xmm_src, ptr[reg_src], jcp.src_dt); compute_kernel(); - movss(ptr[reg_dst], xmm_src); + store_scalar(ptr[reg_dst], xmm_src, jcp.dst_dt); - add(reg_src, step * sizeof(float)); - add(reg_dst, step * sizeof(float)); + add(reg_src, step * jcp.src_data_size); + add(reg_dst, step * jcp.dst_data_size); sub(reg_work_amount, step); jmp(tail_loop_label, T_NEAR); @@ -164,6 +177,61 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_ int mask_sign = 0x80000000; // 0 // mask to extract sign int float_1 = 0x3f800000; // 1 // 1.0f } vals_for_logistic_activate; + + inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, InferenceEngine::Precision src_dt) { + switch (src_dt) { + case InferenceEngine::Precision::FP32: + uni_vmovups(vmm_src, op); + break; + case InferenceEngine::Precision::BF16: + vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; + default: + assert(!"unknown src_dt"); + } + } + inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, InferenceEngine::Precision dst_dt) { + Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx()); + + switch (dst_dt) { + case InferenceEngine::Precision::FP32: + uni_vmovups(op, vmm_dst); + break; + case InferenceEngine::Precision::BF16: + vcvtneps2bf16(ymm_dst, vmm_dst); + uni_vmovups(op, ymm_dst); + break; + default: + assert(!"unknown dst_dt"); + } + } + inline void load_scalar(Xbyak::Xmm xmm_src, const Xbyak::Address &op, InferenceEngine::Precision src_dt) { + switch (src_dt) { + case InferenceEngine::Precision::FP32: + movss(xmm_src, op); + break; + case InferenceEngine::Precision::BF16: + pinsrw(xmm_src, op, 0x0); + uni_vpslld(xmm_src, xmm_src, 16); + break; + default: + assert(!"unknown src_dt"); + } + } + inline void store_scalar(const Xbyak::Address &op, Xbyak::Xmm xmm_dst, InferenceEngine::Precision dst_dt) { + switch (dst_dt) { + case InferenceEngine::Precision::FP32: + movss(op, xmm_dst); + break; + case InferenceEngine::Precision::BF16: + uni_vpsrld(xmm_dst, xmm_dst, 16); + pextrw(op, xmm_dst, 0x0); + break; + default: + assert(!"unknown dst_dt"); + } + } }; class RegionYoloImpl: public ExtLayerBase { @@ -173,27 +241,48 @@ class RegionYoloImpl: public ExtLayerBase { if (layer->insData.size() != 1 || layer->outData.empty()) THROW_IE_EXCEPTION << "Incorrect number of input/output edges!"; + input_prec = layer->insData.front().lock()->getPrecision(); + output_prec = layer->outData.front()->getPrecision(); + + if (input_prec != Precision::FP32 && input_prec != Precision::BF16) { + input_prec = Precision::FP32; + } + + if (output_prec != Precision::FP32 && output_prec != Precision::BF16) { + output_prec = Precision::FP32; + } + + if (Precision::BF16 == output_prec) { + if (!mayiuse(avx512_core_bf16)) { + output_prec = Precision::FP32; + } + } + classes = layer->GetParamAsInt("classes"); coords = layer->GetParamAsInt("coords"); num = layer->GetParamAsInt("num"); do_softmax = layer->GetParamAsBool("do_softmax", true); mask = layer->GetParamAsInts("mask", {}); + jit_logistic_config_params jcp; + jcp.src_dt = jcp.dst_dt = output_prec; + jcp.src_data_size = 
jcp.dst_data_size = output_prec.size(); + block_size = 1; if (mayiuse(avx512_common)) { - logistic_kernel.reset(new jit_uni_logistic_kernel_f32()); + logistic_kernel.reset(new jit_uni_logistic_kernel_f32(jcp)); block_size = 16; } else if (mayiuse(avx2)) { - logistic_kernel.reset(new jit_uni_logistic_kernel_f32()); + logistic_kernel.reset(new jit_uni_logistic_kernel_f32(jcp)); block_size = 8; } else if (mayiuse(sse42)) { - logistic_kernel.reset(new jit_uni_logistic_kernel_f32()); + logistic_kernel.reset(new jit_uni_logistic_kernel_f32(jcp)); block_size = 4; } - softmax_kernel.reset(new SoftmaxGeneric()); + softmax_kernel = std::make_shared(input_prec, output_prec); - addConfig(layer, {DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)}); + addConfig(layer, {DataConfigurator(ConfLayout::PLN, input_prec)}, {DataConfigurator(ConfLayout::PLN, output_prec)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } @@ -201,19 +290,12 @@ class RegionYoloImpl: public ExtLayerBase { StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - const auto *src_data = inputs[0]->cbuffer().as(); - auto *dst_data = outputs[0]->buffer().as(); - - int mask_size = mask.size(); + size_t mask_size = mask.size(); - int IW = (inputs[0]->getTensorDesc().getDims().size() > 3) ? inputs[0]->getTensorDesc().getDims()[3] : 1; - int IH = (inputs[0]->getTensorDesc().getDims().size() > 2) ? inputs[0]->getTensorDesc().getDims()[2] : 1; - int IC = (inputs[0]->getTensorDesc().getDims().size() > 1) ? inputs[0]->getTensorDesc().getDims()[1] : 1; - int B = (inputs[0]->getTensorDesc().getDims().size() > 0) ? inputs[0]->getTensorDesc().getDims()[0] : 1; - - parallel_for(B * IC * IH * IW, [&](int i) { - dst_data[i] = src_data[i]; - }); + size_t IW = (inputs[0]->getTensorDesc().getDims().size() > 3) ? inputs[0]->getTensorDesc().getDims()[3] : 1; + size_t IH = (inputs[0]->getTensorDesc().getDims().size() > 2) ? inputs[0]->getTensorDesc().getDims()[2] : 1; + size_t IC = (inputs[0]->getTensorDesc().getDims().size() > 1) ? inputs[0]->getTensorDesc().getDims()[1] : 1; + size_t B = (inputs[0]->getTensorDesc().getDims().size() > 0) ? 
inputs[0]->getTensorDesc().getDims()[0] : 1; int end_index = 0; int num_ = 0; @@ -226,26 +308,41 @@ class RegionYoloImpl: public ExtLayerBase { end_index = IW * IH * (classes + 1); num_ = mask_size; } - int inputs_size = IH * IW * num_ * (classes + coords + 1); - int total_size = 2 * IH * IW; + size_t inputs_size = IH * IW * num_ * (classes + coords + 1); + size_t total_size = 2 * IH * IW; + + const auto *src_data = inputs[0]->cbuffer().as(); + auto *dst_data = outputs[0]->buffer().as(); + + try { + cpu_convert(src_data, dst_data, inputs[0]->getTensorDesc().getPrecision(), outputs[0]->getTensorDesc().getPrecision(), B * IC * IH * IW); - for (int b = 0; b < B; b++) { - for (int n = 0; n < num_; n++) { - int index = b * inputs_size + n * IW * IH * (classes + coords + 1); - calculate_logistic(index, total_size, dst_data); + for (int b = 0; b < B; b++) { + for (int n = 0; n < num_; n++) { + size_t index = b * inputs_size + n * IW * IH * (classes + coords + 1); + calculate_logistic(index, total_size, dst_data); - index = b * inputs_size + IW * IH * (n * (classes + coords + 1) + coords); - calculate_logistic(index, end_index, dst_data); + index = b * inputs_size + IW * IH * (n * (classes + coords + 1) + coords); + calculate_logistic(index, end_index, dst_data); + } } - } - if (do_softmax) { - int index = IW * IH * (coords + 1); - int batch_offset = inputs_size / num; - for (int b = 0; b < B * num; b++) - softmax_kernel->execute(src_data + index + b * batch_offset, dst_data + index + b * batch_offset, 1, classes, IH, IW); + if (do_softmax) { + int index = IW * IH * (coords + 1); + int batch_offset = inputs_size / num; + for (int b = 0; b < B * num; b++) { + softmax_kernel->execute(src_data + input_prec.size() * (index + b * batch_offset), + dst_data + output_prec.size() * (index + b * batch_offset), 1, classes, IH, IW); + } + } + } + catch (const std::exception& excp) { + snprintf(resp->msg, sizeof(resp->msg), "%s", excp.what()); + return GENERAL_ERROR; + } + catch(...) 
{ + return GENERAL_ERROR; } - return OK; } @@ -255,6 +352,7 @@ class RegionYoloImpl: public ExtLayerBase { int num; float do_softmax; std::vector mask; + Precision input_prec, output_prec; int block_size; std::shared_ptr logistic_kernel; @@ -281,7 +379,9 @@ class RegionYoloImpl: public ExtLayerBase { return src; } - inline void calculate_logistic(int start_index, int count, float* dst_data) { + + inline void calculate_logistic(size_t start_index, int count, uint8_t * dst_data) { + auto dst_data_size = output_prec.size(); if (logistic_kernel) { int blocks_num = div_up(count, block_size); parallel_for(blocks_num, [&](int ib) { @@ -289,15 +389,24 @@ class RegionYoloImpl: public ExtLayerBase { int work_amount = std::min(count - idx, block_size); auto arg = jit_args_logistic(); - arg.src = dst_data + start_index + idx; - arg.dst = dst_data + start_index + idx; + arg.src = arg.dst = dst_data + dst_data_size * (start_index + idx); arg.work_amount = static_cast(work_amount); (*logistic_kernel)(&arg); }); } else { - for (int i = 0; i < count; i++) { - dst_data[i + start_index] = logistic_scalar(dst_data[i + start_index]); + if (Precision::FP32 == output_prec) { + auto float_dst_data = reinterpret_cast(dst_data); + for (int i = 0; i < count; i++) { + float_dst_data[i + start_index] = logistic_scalar(float_dst_data[i + start_index]); + } + } else if (Precision::BF16 == output_prec) { + auto bf16_dst_data = reinterpret_cast(dst_data); + for (int i = 0; i < count; i++) { + bf16_dst_data[i + start_index] = logistic_scalar(bf16_dst_data[i + start_index]); + } + } else { + THROW_IE_EXCEPTION << "Unsupported precision configuration outPrc=" << output_prec.name(); } } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/reorg_yolo.cpp b/inference-engine/src/mkldnn_plugin/nodes/reorg_yolo.cpp index 0b74fbd4395317..750b3634015315 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/reorg_yolo.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/reorg_yolo.cpp @@ -18,7 +18,7 @@ class ReorgYoloImpl: public ExtLayerBase { stride = layer->GetParamAsInt("stride"); - addConfig(layer, {DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)}); + addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/reverse_sequence.cpp b/inference-engine/src/mkldnn_plugin/nodes/reverse_sequence.cpp index a76a0d4ce3dcbe..bcb8d90c28c36e 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/reverse_sequence.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/reverse_sequence.cpp @@ -23,10 +23,12 @@ class ReverseSequenceImpl: public ExtLayerBase { THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; src_dims = layer->insData[REVERSESEQUENCE_DATA].lock()->getTensorDesc().getDims(); + + Precision lengthsPrecision = layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getPrecision(); + if (lengthsPrecision != Precision::I32 && lengthsPrecision != Precision::FP32) + lengthsPrecision = Precision::I32; + SizeVector seq_lengths_dims = layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getDims(); - if (layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getPrecision() != Precision::I32 && - layer->insData[REVERSESEQUENCE_LENGTHS].lock()->getTensorDesc().getPrecision() != Precision::FP32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'seq_lengths' 
input precision. Only FP32 and I32 are supported!"; if (seq_lengths_dims.size() > 1) THROW_IE_EXCEPTION << layer->name << " Seq_lengths vector should be 1 dimension"; @@ -60,7 +62,7 @@ class ReverseSequenceImpl: public ExtLayerBase { work_amount_dst = srcStrides[0] * src_dims[0]; addConfig(layer, - { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN) }, + { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, lengthsPrecision) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32) }); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp b/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp index 2c82c6e7a7face..f95de39c184ec0 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp @@ -328,8 +328,8 @@ class ExperimentalDetectronROIFeatureExtractorImpl: public ExtLayerBase { pooled_height_ = output_dim_; pooled_width_ = output_dim_; - std::vector inputs_layouts(layer->insData.size(), DataConfigurator(ConfLayout::PLN)); - std::vector outputs_layouts(layer->outData.size(), DataConfigurator(ConfLayout::PLN)); + std::vector inputs_layouts(layer->insData.size(), DataConfigurator(ConfLayout::PLN, Precision::FP32)); + std::vector outputs_layouts(layer->outData.size(), DataConfigurator(ConfLayout::PLN, Precision::FP32)); addConfig(layer, inputs_layouts, outputs_layouts); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/select.cpp b/inference-engine/src/mkldnn_plugin/nodes/select.cpp index 3813986f4e134f..5e84e9fa8f7fb7 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/select.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/select.cpp @@ -31,8 +31,12 @@ class SelectImpl: public ExtLayerBase { broadcast = layer->GetParamAsString("auto_broadcast", "numpy"); - if (layer->insData[THEN].lock()->getTensorDesc().getPrecision() != layer->insData[ELSE].lock()->getTensorDesc().getPrecision()) - THROW_IE_EXCEPTION << "Select layer with name '" << layer->name << "' has different precisions on 'Then' and 'Else' inputs"; + auto inputPrecision = layer->insData[THEN].lock()->getTensorDesc().getPrecision(); + if (inputPrecision == Precision::BF16 || layer->insData[ELSE].lock()->getTensorDesc().getPrecision() == Precision::BF16) { + inputPrecision = Precision::BF16; + } else if (layer->insData[THEN].lock()->getTensorDesc().getPrecision() != layer->insData[ELSE].lock()->getTensorDesc().getPrecision()) { + THROW_IE_EXCEPTION << "Select layer with name '" << layer->name << "' has different precisions on 'Then' and 'Else' inputs "; + } const auto& conditionPrecision = layer->insData[CONDITION].lock()->getTensorDesc().getPrecision(); if (conditionPrecision != Precision::BOOL && conditionPrecision != Precision::I32 && conditionPrecision != Precision::U8) @@ -100,7 +104,7 @@ class SelectImpl: public ExtLayerBase { inConfig.inPlace = -1; inConfig.constant = false; - Precision inPrecision = layer->insData[i].lock()->getTensorDesc().getPrecision(); + Precision inPrecision = i == CONDITION ? 
conditionPrecision : inputPrecision; const SizeVector& inDims = layer->insData[i].lock()->getTensorDesc().getDims(); inConfig.desc = TensorDesc(inPrecision, inDims, InferenceEngine::TensorDesc::getLayoutByDims(inDims)); @@ -110,9 +114,8 @@ class SelectImpl: public ExtLayerBase { DataConfig outConfig; outConfig.inPlace = -1; outConfig.constant = false; - Precision outPrecision = layer->insData[1].lock()->getTensorDesc().getPrecision(); const SizeVector& outDims = layer->outData[0]->getTensorDesc().getDims(); - outConfig.desc = TensorDesc(outPrecision, outDims, InferenceEngine::TensorDesc::getLayoutByDims(outDims)); + outConfig.desc = TensorDesc(inputPrecision, outDims, InferenceEngine::TensorDesc::getLayoutByDims(outDims)); config.outConfs.push_back(outConfig); config.dynBatchSupport = false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/simplernms.cpp b/inference-engine/src/mkldnn_plugin/nodes/simplernms.cpp index 2bc2f8c506ace2..80997266cdb443 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/simplernms.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/simplernms.cpp @@ -225,8 +225,8 @@ class SimplerNMSImpl : public ExtLayerBase { layer->insData[0].lock()->getTensorDesc().getDims().size() != 4) THROW_IE_EXCEPTION << "Unsupported dimensions!"; - addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, - {DataConfigurator(ConfLayout::PLN)}); + addConfig(layer, {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::FP32)}, {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/space_to_batch.cpp b/inference-engine/src/mkldnn_plugin/nodes/space_to_batch.cpp index d84da1ac400bf0..fc9b08aa05de4b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/space_to_batch.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/space_to_batch.cpp @@ -54,15 +54,11 @@ class SpaceToBatchImpl: public ExtLayerBase { auto inData = spaceToBatchLayer->insData[i].lock(); if (inData == nullptr) THROW_IE_EXCEPTION << "'" << spaceToBatchLayer->name << "' layer has nullable input data"; - config.inConfs[i].desc = TensorDesc(inData->getTensorDesc().getPrecision(), - inData->getTensorDesc().getDims(), - inData->getTensorDesc().getLayout()); + config.inConfs[i].desc = TensorDesc(precision, inData->getTensorDesc().getDims(), inData->getTensorDesc().getLayout()); } DataConfig outConfig; - outConfig.desc = TensorDesc(layer->outData[0]->getTensorDesc().getPrecision(), - out_dims, - layer->outData[0]->getTensorDesc().getLayout()); + outConfig.desc = TensorDesc(precision, out_dims, layer->outData[0]->getTensorDesc().getLayout()); config.outConfs.push_back(outConfig); config.dynBatchSupport = false; confs.push_back(config); diff --git a/inference-engine/src/mkldnn_plugin/nodes/sparse_fill_empty_rows.cpp b/inference-engine/src/mkldnn_plugin/nodes/sparse_fill_empty_rows.cpp index 6f559832f2a260..a73e58a353fc6c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/sparse_fill_empty_rows.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/sparse_fill_empty_rows.cpp @@ -25,11 +25,6 @@ class SparseFillEmptyRowsImpl : public ExtLayerBase { THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!"; } - Precision input_indices_precision = 
layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getPrecision(); - if (input_indices_precision != Precision::FP32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect input precision. Only FP32 is supported!"; - } - // check dimensions of input tensors SizeVector input_indices_dims = layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getDims(); if (input_indices_dims.size() != 2 || input_indices_dims[1] != 2) { @@ -75,8 +70,10 @@ class SparseFillEmptyRowsImpl : public ExtLayerBase { // TODO: check that dense shape value is set addConfig(layer, - {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, - {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}); + {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)}, + {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/sparse_segment_reduce.cpp b/inference-engine/src/mkldnn_plugin/nodes/sparse_segment_reduce.cpp index c145709b0fd83d..2bea7fde1fb442 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/sparse_segment_reduce.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/sparse_segment_reduce.cpp @@ -38,20 +38,6 @@ class SparseSegmentReduceImpl : public ExtLayerBase { else THROW_IE_EXCEPTION << layer->name << " Incorrect SparseSegmentReduce layer type!"; - // check a precision of input tensors - Precision input_data_precision = layer->insData[INPUT_DATA_PORT].lock()->getTensorDesc().getPrecision(); - if (input_data_precision != Precision::FP32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect precision of the input data. Only FP32 is supported!"; - } - Precision input_indices_precision = layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getPrecision(); - if (input_indices_precision != Precision::FP32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect precision of the input indices. Only FP32 is supported!"; - } - Precision input_segment_ids_precision = layer->insData[INPUT_SEGMENT_IDS_PORT].lock()->getTensorDesc().getPrecision(); - if (input_segment_ids_precision != Precision::FP32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect precision of segment IDs. Only FP32 is supported!"; - } - // check shapes of the second and third input tensors input_indices_dims = layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getDims(); if (input_indices_dims.size() != 1) { @@ -65,12 +51,6 @@ class SparseSegmentReduceImpl : public ExtLayerBase { THROW_IE_EXCEPTION << layer->name << " Shapes for input indices and segment IDs must match."; } - // check a precision of output tensor - Precision output_precision = layer->insData[OUTPUT_PORT].lock()->getTensorDesc().getPrecision(); - if (output_precision != Precision::FP32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect precision of output data. 
Only FP32 is supported!"; - } - // check shapes of output tensor input_data_dims = layer->insData[INPUT_DATA_PORT].lock()->getTensorDesc().getDims(); output_dims = layer->outData[OUTPUT_PORT]->getTensorDesc().getDims(); @@ -88,8 +68,8 @@ class SparseSegmentReduceImpl : public ExtLayerBase { // confugure layouts of input and output ports addConfig(layer, - { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); + { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::FP32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32) }); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/sparse_to_dense.cpp b/inference-engine/src/mkldnn_plugin/nodes/sparse_to_dense.cpp index abc2c1dec29ab8..526248c580e2c4 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/sparse_to_dense.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/sparse_to_dense.cpp @@ -28,26 +28,6 @@ class SparseToDenseImpl : public ExtLayerBase { with_default_value = true; } - // check precisions for input tensors - Precision input_indices_precision = layer->insData[INPUT_INDICES_PORT].lock()->getTensorDesc().getPrecision(); - if (input_indices_precision != Precision::I32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for input indices. Only I32 is supported!"; - } - Precision input_dense_shape_precision = layer->insData[INPUT_DENSE_SHAPE_PORT].lock()->getTensorDesc().getPrecision(); - if (input_dense_shape_precision != Precision::I32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for input dense shape. Only I32 is supported!"; - } - Precision input_values_precision = layer->insData[INPUT_VALUES_PORT].lock()->getTensorDesc().getPrecision(); - if (input_values_precision != Precision::I32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for input values. Only I32 is supported!"; - } - if (with_default_value) { - Precision input_default_value_precision = layer->insData[INPUT_DEFAULT_VALUE_PORT].lock()->getTensorDesc().getPrecision(); - if (input_default_value_precision != Precision::I32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect input precision for input default value. 
Only I32 is supported!"; - } - } - // check dimensions of input tensors SizeVector input_dense_shape_dims = layer->insData[INPUT_DENSE_SHAPE_PORT].lock()->getTensorDesc().getDims(); if (input_dense_shape_dims.size() != 1 || input_dense_shape_dims[0] < 1) { @@ -73,14 +53,14 @@ class SparseToDenseImpl : public ExtLayerBase { // TODO: check that dense shape value is set if (with_default_value) { addConfig(layer, - { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), - DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); + { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32), + DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32) }, + { DataConfigurator(ConfLayout::PLN, Precision::I32) }); } else { addConfig(layer, - { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), - DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); + { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32), + DataConfigurator(ConfLayout::PLN, Precision::I32) }, + { DataConfigurator(ConfLayout::PLN, Precision::I32) }); } } catch (InferenceEngine::details::InferenceEngineException &ex) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/sparse_weighted_reduce.cpp b/inference-engine/src/mkldnn_plugin/nodes/sparse_weighted_reduce.cpp index 2ed9b2266060ce..6023476ebf0046 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/sparse_weighted_reduce.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/sparse_weighted_reduce.cpp @@ -127,14 +127,15 @@ class ExperimentalSparseWeightedReduceImpl : public ExtLayerBase { // TODO: check that dense shape value is set if (with_weights) { addConfig(layer, - { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), - DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); + { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32), + DataConfigurator(ConfLayout::PLN, Precision::I32), + DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::I32), + DataConfigurator(ConfLayout::PLN, Precision::FP32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32) }); } else { addConfig(layer, - { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), - DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); + { DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32), + DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::I32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32) }); } } catch (InferenceEngine::details::InferenceEngineException &ex) { diff --git a/inference-engine/src/mkldnn_plugin/nodes/strided_slice.cpp b/inference-engine/src/mkldnn_plugin/nodes/strided_slice.cpp index 5e375b462d1804..f8d4af798d57ef 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/strided_slice.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/strided_slice.cpp @@ -35,8 +35,6 @@ class StridedSliceImpl: public ExtLayerBase { begin_dims = {}; if (layer->insData.size() > 1) 
{ begin_dims = layer->insData[STRIDEDSLICE_BEGIN].lock()->getTensorDesc().getDims(); - if (layer->insData[STRIDEDSLICE_BEGIN].lock()->getTensorDesc().getPrecision() != Precision::I32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'begin' input precision. Only I32 is supported!"; if (begin_dims.size() > 1) THROW_IE_EXCEPTION << layer->name << " Begin vector should be 1 dimension"; bounds_size = begin_dims[0]; @@ -44,8 +42,6 @@ class StridedSliceImpl: public ExtLayerBase { if (layer->insData.size() > 2) { end_dims = layer->insData[STRIDEDSLICE_END].lock()->getTensorDesc().getDims(); - if (layer->insData[STRIDEDSLICE_END].lock()->getTensorDesc().getPrecision() != Precision::I32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'end' input precision. Only I32 is supported!"; if (end_dims.size() > 1) THROW_IE_EXCEPTION << layer->name << " End vector should be 1 dimension"; if (begin_dims[0] != end_dims[0]) @@ -54,8 +50,6 @@ class StridedSliceImpl: public ExtLayerBase { if (layer->insData.size() > 3) { stride_dims = layer->insData[STRIDEDSLICE_STRIDE].lock()->getTensorDesc().getDims(); - if (layer->insData[STRIDEDSLICE_STRIDE].lock()->getTensorDesc().getPrecision() != Precision::I32) - THROW_IE_EXCEPTION << layer->name << " Incorrect 'strides' input precision. Only I32 is supported!"; if (stride_dims.size() > 1) THROW_IE_EXCEPTION << layer->name << " End vector should be 1 dimension"; if (begin_dims[0] != stride_dims[0]) @@ -134,16 +128,19 @@ class StridedSliceImpl: public ExtLayerBase { srcStrides = layer->insData[STRIDEDSLICE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides(); dstStrides = layer->outData[0]->getTensorDesc().getBlockingDesc().getStrides(); + Precision dataPrecision = layer->insData[STRIDEDSLICE_DATA].lock()->getTensorDesc().getPrecision(); if (layer->insData.size() == 1) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); + addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision) }, { DataConfigurator(ConfLayout::PLN, dataPrecision) }); } else if (layer->insData.size() == 2) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); + addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision), DataConfigurator(ConfLayout::PLN, Precision::I32) }, + { DataConfigurator(ConfLayout::PLN, dataPrecision) }); } else if (layer->insData.size() == 3) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); + addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision), DataConfigurator(ConfLayout::PLN, Precision::I32), + DataConfigurator(ConfLayout::PLN, Precision::I32) }, { DataConfigurator(ConfLayout::PLN, dataPrecision) }); } else { - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), - DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) }); + addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision), DataConfigurator(ConfLayout::PLN, Precision::I32), + DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32) }, + { DataConfigurator(ConfLayout::PLN, dataPrecision) }); } } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); @@ -151,8 +148,6 @@ class StridedSliceImpl: public ExtLayerBase { } StatusCode execute(std::vector& inputs, 
std::vector& outputs, ResponseDesc *resp) noexcept override { - const float *src_data = inputs[STRIDEDSLICE_DATA]->cbuffer().as() + - inputs[STRIDEDSLICE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding(); int *begin = nullptr, *end = nullptr, *stride = nullptr; if (begin_dims.size()) begin = inputs[STRIDEDSLICE_BEGIN]->cbuffer().as() + inputs[STRIDEDSLICE_BEGIN]->getTensorDesc().getBlockingDesc().getOffsetPadding(); @@ -160,17 +155,12 @@ class StridedSliceImpl: public ExtLayerBase { end = inputs[STRIDEDSLICE_END]->cbuffer().as() + inputs[STRIDEDSLICE_END]->getTensorDesc().getBlockingDesc().getOffsetPadding(); if (stride_dims.size()) stride = inputs[STRIDEDSLICE_STRIDE]->cbuffer().as() + inputs[STRIDEDSLICE_STRIDE]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* dst_data = outputs[0]->cbuffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); InferenceEngine::SizeVector src_dims = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getDims(); InferenceEngine::SizeVector srcStrides = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getBlockingDesc().getStrides(); InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims(); InferenceEngine::SizeVector dstStrides = outputs[0]->getTensorDesc().getBlockingDesc().getStrides(); - auto dst_size = outputs[0]->byteSize(); - memset(dst_data, 0, dst_size); - size_t i, j, k, bj, ej, sj; InferenceEngine::SizeVector our_dims; InferenceEngine::SizeVector out_dims; @@ -231,13 +221,49 @@ class StridedSliceImpl: public ExtLayerBase { return PARAMETER_MISMATCH; } + const size_t inputsPrecSize = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().size(); if (static_cast(src_dims.size()) == max_dims && shrink_axis == 0 && - stride_dms[stride_dms.size()-1] == 1 && stride_dms.size() > 1) - strided_slice_vp(src_data, dst_data); - else if (static_cast(src_dims.size()) == max_dims && shrink_axis == 0) - strided_slice_p(src_data, dst_data); - else - strided_slice(src_data, dst_data, our_dims); + stride_dms[stride_dms.size()-1] == 1 && stride_dms.size() > 1) { + if (inputsPrecSize != outputs[0]->getTensorDesc().getPrecision().size()) { + if (resp) { + std::string errorMsg = "StridedSlice layer doesn't support 'Data' input precision: " + + std::string(inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().name()); + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return GENERAL_ERROR; + } + strided_slice_vp(inputs[STRIDEDSLICE_DATA], outputs[0]); + } else if (static_cast(src_dims.size()) == max_dims && shrink_axis == 0) { + switch (inputsPrecSize) { + case 1: { strided_slice_p(inputs[STRIDEDSLICE_DATA], outputs[0]); break; } + case 2: { strided_slice_p(inputs[STRIDEDSLICE_DATA], outputs[0]); break; } + case 4: { strided_slice_p(inputs[STRIDEDSLICE_DATA], outputs[0]); break; } + case 8: { strided_slice_p(inputs[STRIDEDSLICE_DATA], outputs[0]); break; } + default: { + if (resp) { + std::string errorMsg = "StridedSlice layer doesn't support 'Data' input precision: " + + std::string(inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().name()); + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return GENERAL_ERROR; + } + } + } else { + switch (inputsPrecSize) { + case 1: { strided_slice(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; } + case 2: { strided_slice(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; } + case 4: { strided_slice(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; } + case 8: { strided_slice(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; } 
+ default: { + if (resp) { + std::string errorMsg = "StridedSlice layer doesn't support 'Data' input precision: " + + std::string(inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().name()); + errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); + } + return GENERAL_ERROR; + } + } + } return OK; } @@ -248,9 +274,11 @@ class StridedSliceImpl: public ExtLayerBase { const size_t STRIDEDSLICE_END = 2; const size_t STRIDEDSLICE_STRIDE = 3; - void strided_slice(const float *src_data, float* dst_data, std::vector &dims); - void strided_slice_vp(const float *src_data, float* dst_data); - void strided_slice_p(const float *src_data, float* dst_data); + template + void strided_slice(Blob::Ptr&, Blob::Ptr& dst_data, std::vector &dims); + void strided_slice_vp(Blob::Ptr&, Blob::Ptr& dst_data); + template + void strided_slice_p(Blob::Ptr&, Blob::Ptr& dst_data); SizeVector begin_dims; SizeVector end_dims; @@ -275,7 +303,13 @@ class StridedSliceImpl: public ExtLayerBase { int ellipsis_pos1, ellipsis_pos2; }; -void StridedSliceImpl::strided_slice(const float *src_data, float* dst_data, std::vector &dims) { +template +void StridedSliceImpl::strided_slice(Blob::Ptr& input, Blob::Ptr& output, std::vector &dims) { + auto* src_data = input->cbuffer().as() + input->getTensorDesc().getBlockingDesc().getOffsetPadding(); + auto* dst_data = output->buffer().as() + output->getTensorDesc().getBlockingDesc().getOffsetPadding(); + auto dst_size = output->byteSize(); + memset(dst_data, 0, dst_size); + size_t work_amount_dst = dstStrides[0] * dst_dims[0]; parallel_nt(0, [&](const int ithr, const int nthr) { int j; @@ -306,10 +340,16 @@ void StridedSliceImpl::strided_slice(const float *src_data, float* dst_data, std }); } -void StridedSliceImpl::strided_slice_vp(const float *src_data, float* dst_data) { +void StridedSliceImpl::strided_slice_vp(Blob::Ptr& input, Blob::Ptr& output) { + size_t dataSize = input->getTensorDesc().getPrecision().size(); + const uint8_t* src_data = input->cbuffer().as() + input->getTensorDesc().getBlockingDesc().getOffsetPadding() * dataSize; + uint8_t* dst_data = output->buffer().as() + output->getTensorDesc().getBlockingDesc().getOffsetPadding() * dataSize; + auto dst_size = output->byteSize(); + memset(dst_data, 0, dst_size); + // Vectorized copy size_t dims_size_1 = dst_dims.size() - 1; - size_t dataLength = dst_dims[dims_size_1]; + size_t len = dst_dims[dims_size_1] * dataSize; size_t work_amount_dst = dstStrides[0] * dst_dims[0] / dst_dims[dims_size_1]; parallel_nt(0, [&](const int ithr, const int nthr) { @@ -323,8 +363,8 @@ void StridedSliceImpl::strided_slice_vp(const float *src_data, float* dst_data) i /= dst_dims[j]; } - for (size_t iwork = start, dst_idx = start * dataLength, i = 1; iwork < end; ++iwork, dst_idx += dataLength) { - cpu_memcpy(&dst_data[dst_idx], &src_data[src_idx], sizeof(float) * dataLength); + for (size_t iwork = start, dst_idx = start * len, i = 1; iwork < end; ++iwork, dst_idx += len) { + cpu_memcpy(&dst_data[dst_idx], &src_data[src_idx * dataSize], len); for (int j = dims_size_1 - 1; j >= 0; j--) { counters[j]++; if (counters[j] < dst_dims[j]) { @@ -342,7 +382,13 @@ void StridedSliceImpl::strided_slice_vp(const float *src_data, float* dst_data) }); } -void StridedSliceImpl::strided_slice_p(const float *src_data, float* dst_data) { +template +void StridedSliceImpl::strided_slice_p(Blob::Ptr& input, Blob::Ptr& output) { + auto* src_data = input->cbuffer().as() + input->getTensorDesc().getBlockingDesc().getOffsetPadding(); + auto* dst_data = output->buffer().as() + 
output->getTensorDesc().getBlockingDesc().getOffsetPadding(); + auto dst_size = output->byteSize(); + memset(dst_data, 0, dst_size); + size_t dims_size = dst_dims.size(); size_t work_amount_dst = dstStrides[0] * dst_dims[0]; diff --git a/inference-engine/src/mkldnn_plugin/nodes/topk.cpp b/inference-engine/src/mkldnn_plugin/nodes/topk.cpp index f2f715ec68734d..09ab13796b9e87 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/topk.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/topk.cpp @@ -30,14 +30,6 @@ class TopKImpl: public ExtLayerBase { if (layer->outData.size() != 1 && layer->outData.size() != 2) THROW_IE_EXCEPTION << layer->name << " Incorrect number of output edges!"; - // DataConfigurator::addConfig will automatically change BF16 datatype to FP32 - // it can be changed back by explicit modification like confs.back().outConfs[i].desc.setPrecision(Precision::BF16); - // if current layer supports BF16 naturally. usually they are not and nothing special is not required - if ((layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32 && - layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::BF16) || - layer->insData[TOPK_K].lock()->getTensorDesc().getPrecision() != Precision::I32) - THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect input data/index values precision."; - if (layer->insData[TOPK_K].lock()->getTensorDesc().getDims().size() > 1) THROW_IE_EXCEPTION << layer->name << " TopKImpl - Index vector should be 1 dimension"; @@ -47,10 +39,6 @@ class TopKImpl: public ExtLayerBase { THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect input/output tensor dimension sizes"; if (layer->outData.size() == 2) { - if (layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::FP32 && - layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::BF16) - THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect output data tensor precision. Floating point datatypes are supported!"; - SizeVector dst_idx_dims = layer->outData[TOPK_INDEX]->getTensorDesc().getDims(); if (dst_dims.size() != dst_idx_dims.size()) THROW_IE_EXCEPTION << layer->name << " Incorrect output tensor dimension sizes"; @@ -102,11 +90,11 @@ class TopKImpl: public ExtLayerBase { before_num = count(src_dims, 0, axis); if (layer->outData.size() == 1) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, + addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::I32) }, { DataConfigurator(ConfLayout::PLN) }); } else { - addConfig(layer, { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }); + addConfig(layer, { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::I32) }, + { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN) }); // TODO: WA... While ICNNNetwork has no clear rule to fill tensor precision // it use precision of parent layer. 
So each output tensor Data object has diff --git a/inference-engine/src/mkldnn_plugin/nodes/topkrois_onnx.cpp b/inference-engine/src/mkldnn_plugin/nodes/topkrois_onnx.cpp index 9fe2c7cb461cdb..195e3ecfff4d82 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/topkrois_onnx.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/topkrois_onnx.cpp @@ -39,8 +39,8 @@ class ExperimentalDetectronTopKROIsImpl: public ExtLayerBase { max_rois_num_ = layer->GetParamAsInt("max_rois", 0); addConfig(layer, - {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, - {DataConfigurator(ConfLayout::PLN)}); + {DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32)}, + {DataConfigurator(ConfLayout::PLN, Precision::FP32)}); } catch (InferenceEngine::details::InferenceEngineException &ex) { errorMsg = ex.what(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/unique.cpp b/inference-engine/src/mkldnn_plugin/nodes/unique.cpp index 950a8fd2eb99f7..f544789041f615 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/unique.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/unique.cpp @@ -61,20 +61,12 @@ class UniqueImpl : public ExtLayerBase { // check dimensions of output tensors and its precisions size_t cur_output_port = 0; SizeVector output_uniques_dims = layer->outData[cur_output_port]->getTensorDesc().getDims(); - Precision output_uniques_precision = layer->outData[cur_output_port]->getTensorDesc().getPrecision(); - if (output_uniques_precision != Precision::FP32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect precision for output tensor of unique elements. Only FP32 is supported!"; - } if (output_uniques_dims.size() != 1 || output_uniques_dims[0] != num_elements) { THROW_IE_EXCEPTION << layer->name << " Incorrect dimensions for output tensor of unique elements."; } if (return_inverse) { cur_output_port++; SizeVector output_indices_dims = layer->outData[cur_output_port]->getTensorDesc().getDims(); - Precision output_indices_precision = layer->outData[cur_output_port]->getTensorDesc().getPrecision(); - if (output_indices_precision != Precision::FP32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect precision for output tensor of indices. Only FP32 is supported!"; - } if (output_indices_dims.size() != 1 || output_indices_dims[0] != num_elements) { THROW_IE_EXCEPTION << layer->name << " Incorrect dimensions for output tensor of indices."; } @@ -82,10 +74,6 @@ class UniqueImpl : public ExtLayerBase { if (return_counts) { cur_output_port++; SizeVector output_counts_dims = layer->outData[cur_output_port]->getTensorDesc().getDims(); - Precision output_counts_precision = layer->outData[cur_output_port]->getTensorDesc().getPrecision(); - if (output_counts_precision != Precision::FP32) { - THROW_IE_EXCEPTION << layer->name << " Incorrect precision for output tensor of counts. 
Only FP32 is supported!"; - } if (output_counts_dims.size() != 1 || output_counts_dims[0] != num_elements) { THROW_IE_EXCEPTION << layer->name << " Incorrect dimensions for output tensor of counts."; } @@ -94,16 +82,16 @@ class UniqueImpl : public ExtLayerBase { // add a layer configuration if (layer->outData.size() == 1) { addConfig(layer, - { DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN) }); + { DataConfigurator(ConfLayout::PLN, Precision::FP32) }, + { DataConfigurator(ConfLayout::PLN, Precision::FP32) }); } else if (layer->outData.size() == 2) { addConfig(layer, - { DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }); + { DataConfigurator(ConfLayout::PLN, Precision::FP32) }, + { DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32) }); } else if (layer->outData.size() == 3) { addConfig(layer, - { DataConfigurator(ConfLayout::PLN) }, - { DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN) }); + { DataConfigurator(ConfLayout::PLN, Precision::FP32) }, { DataConfigurator(ConfLayout::PLN, Precision::FP32), + DataConfigurator(ConfLayout::PLN, Precision::FP32), DataConfigurator(ConfLayout::PLN, Precision::FP32) }); } } catch (InferenceEngine::details::InferenceEngineException &ex) { diff --git a/inference-engine/src/mkldnn_plugin/utils/bfloat16.hpp b/inference-engine/src/mkldnn_plugin/utils/bfloat16.hpp new file mode 100644 index 00000000000000..35fac1fa682462 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/utils/bfloat16.hpp @@ -0,0 +1,141 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +/** + * The bfloat16_t class can be used as an arithmetic type. All arithmetic operations goes through conversion to the float data type. + */ + + +#define BFLOAT16_ROUND_MODE_TRUNCATE + +namespace MKLDNNPlugin { +class bfloat16_t { +public: + constexpr bfloat16_t() + : m_value{0} + { + } + bfloat16_t(float value) noexcept + : m_value{ +#if defined BFLOAT16_ROUND_MODE_TO_NEAREST + round_to_nearest(value) +#elif defined BFLOAT16_ROUND_MODE_TO_NEAREST_EVEN + round_to_nearest_even(value) +#elif defined BFLOAT16_ROUND_MODE_TRUNCATE + truncate(value) +#else +#error \ + "ROUNDING_MODE must be one of BFLOAT16_ROUND_MODE_TO_NEAREST, BFLOAT16_ROUND_MODE_TO_NEAREST_EVEN, or BFLOAT16_ROUND_MODE_TRUNCATE" +#endif + } + { + } + + operator float() const { + return F32{uint32_t(m_value) << 16}.vfloat; + } + static constexpr bfloat16_t from_bits(uint16_t bits) { return bfloat16_t(bits, true); } + uint16_t to_bits() const { return m_value; } + + static inline uint16_t round_to_nearest_even(float x) { + return static_cast((F32(x).vint + ((F32(x).vint & 0x00010000) >> 1)) >> 16); + } + + static inline uint16_t round_to_nearest(float x) { + return static_cast((F32(x).vint + 0x8000) >> 16); + } + + static inline uint16_t truncate(float x) { return static_cast((F32(x).vint) >> 16); } + +private: + constexpr bfloat16_t(uint16_t x, bool) + : m_value{x} + { + } + union alignas(16) F32 { + F32(float val) + : vfloat{val} { + } + + F32(uint32_t val) + : vint{val} { + } + float vfloat; + uint32_t vint; + }; + uint16_t m_value; +}; +} // namespace MKLDNNPlugin + +/** + * std::numeric_limits overloaded for better compatibility with template metaprogramming. + * For example, to make the following template work: + * template + * void someFunction() { + * ... 
+ * T maxValue = std::numeric_limits::max(); + * ... + * } + */ + +namespace std { +template <> +class numeric_limits { +public: + static constexpr bool is_specialized = true; + static constexpr MKLDNNPlugin::bfloat16_t min() noexcept { + return MKLDNNPlugin::bfloat16_t::from_bits(0x007F); + } + static constexpr MKLDNNPlugin::bfloat16_t max() noexcept { + return MKLDNNPlugin::bfloat16_t::from_bits(0x7F7F); + } + static constexpr MKLDNNPlugin::bfloat16_t lowest() noexcept { + return MKLDNNPlugin::bfloat16_t::from_bits(0xFF7F); + } + static constexpr int digits = 7; + static constexpr int digits10 = 2; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr int radix = 2; + static constexpr MKLDNNPlugin::bfloat16_t epsilon() noexcept { + return MKLDNNPlugin::bfloat16_t::from_bits(0x3C00); + } + static constexpr MKLDNNPlugin::bfloat16_t round_error() noexcept { + return MKLDNNPlugin::bfloat16_t::from_bits(0x3F00); + } + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr float_denorm_style has_denorm = denorm_absent; + static constexpr bool has_denorm_loss = false; + static constexpr MKLDNNPlugin::bfloat16_t infinity() noexcept { + return MKLDNNPlugin::bfloat16_t::from_bits(0x7F80); + } + static constexpr MKLDNNPlugin::bfloat16_t quiet_NaN() noexcept { + return MKLDNNPlugin::bfloat16_t::from_bits(0x7FC0); + } + static constexpr MKLDNNPlugin::bfloat16_t signaling_NaN() noexcept { + return MKLDNNPlugin::bfloat16_t::from_bits(0x7FC0); + } + static constexpr MKLDNNPlugin::bfloat16_t denorm_min() noexcept { + return MKLDNNPlugin::bfloat16_t::from_bits(0); + } + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = false; + static constexpr bool is_modulo = false; + static constexpr bool traps = false; + static constexpr bool tinyness_before = false; + static constexpr float_round_style round_style = round_to_nearest; +}; +} // namespace std diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/bf16_network_restoring.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/bf16_network_restoring.cpp index 5d1defd2932aae..8cc114c4594676 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/bf16_network_restoring.cpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/bf16_network_restoring.cpp @@ -30,7 +30,7 @@ class BF16NetworkRestore1 : public BasicBF16Test { std::shared_ptr createGraph(InferenceEngine::Precision netPrecision) override { // + Power1(FP32) // | - // + AvgPooling1(FP32) + // + AvgPooling1(BF16) // | // + Convolution1(BF16) // | @@ -45,7 +45,7 @@ class BF16NetworkRestore1 : public BasicBF16Test { // | / // ReLU3 (Fused to Conv2) / // | / - // MaxPooling1 (FP32) / + // MaxPooling1 (BF16) / // \ / // Eltwise // | @@ -180,7 +180,7 @@ class BF16NetworkRestore1 : public BasicBF16Test { // filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in // performance counters expectedPrecisions["Power1"] = "FP32"; - expectedPrecisions["AvgPooling1"] = "FP32"; + expectedPrecisions["AvgPooling1"] = "BF16"; expectedPrecisions["Convolution1"] = "BF16"; expectedPrecisions["ReLU1"] = "ndef"; 
expectedPrecisions["Convolution2"] = "BF16"; @@ -189,7 +189,7 @@ class BF16NetworkRestore1 : public BasicBF16Test { expectedPrecisions["Norm1"] = "FP32"; expectedPrecisions["Eltwise1"] = "ndef"; expectedPrecisions["ReLU3"] = "ndef"; - expectedPrecisions["maxPooling1"] = "FP32"; + expectedPrecisions["maxPooling1"] = "BF16"; expectedPrecisions["Eltwise2"] = "FP32"; } }; diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/concat_in_place.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/concat_in_place.cpp index d77fb09f044efc..cc74eb684edb3b 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/concat_in_place.cpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/concat_in_place.cpp @@ -131,7 +131,7 @@ class Concat_in_place : public BasicBF16Test { expectedPrecisions["ADD_1"] = "FP32"; expectedPrecisions["CONV_1"] = "BF16"; expectedPrecisions["CONV_2"] = "BF16"; - expectedPrecisions["CONC_1_TEST"] = "FP32"; + expectedPrecisions["CONC_1_TEST"] = "BF16"; expectedPrecisions["RELU_1"] = "FP32"; } }; diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_relu_pool_conv_relu_pool.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_relu_pool_conv_relu_pool.cpp index 4dde0eaf889756..20131cb1720f4f 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_relu_pool_conv_relu_pool.cpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_relu_pool_conv_relu_pool.cpp @@ -32,7 +32,7 @@ class ConvReLUPoolConvReLUPool : public BasicBF16Test { // | // ReLU1 (Fused) // | - // Pooling1 (FP32) + // Pooling1 (BF16) // | // Convolution2 (BF16) // | @@ -164,7 +164,7 @@ class ConvReLUPoolConvReLUPool : public BasicBF16Test { // performance counters expectedPrecisions["Convolution_1"] = "FP32"; expectedPrecisions["ReLU_1"] = "ndef"; - expectedPrecisions["AvgPool_1"] = "FP32"; + expectedPrecisions["AvgPool_1"] = "BF16"; expectedPrecisions["Convolution_2"] = "BF16"; expectedPrecisions["ReLU_2"] = "ndef"; expectedPrecisions["MaxPool_2"] = "BF16"; diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp index 03185914a47577..2f29cb0a6c1ea3 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/gather_x2_add_mul_relu_concat_matmul.cpp @@ -37,7 +37,7 @@ class Gather_x2_add_mul_relu_concat_matmul : public BasicBF16Test { // \ / / // Mul(FP32) ReLU(FP32) // \ / -// Concat(FP32) Const +// Concat(BF16) Const // \ / // Matmul(BF16) @@ -116,7 +116,7 @@ class Gather_x2_add_mul_relu_concat_matmul : public BasicBF16Test { fnPtr = createGraph(netPrecision); // STAGE2: set up safe threshold <= 5% from maximum value of output tensor - threshold = 170.02f; // Max in fp32 network by output: 3887.11 + threshold = 177.f; // Max in fp32 network by output: 3887.11 // STAGE3: // filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in @@ -125,7 +125,7 @@ class Gather_x2_add_mul_relu_concat_matmul : public BasicBF16Test { expectedPrecisions["Mul_1"] = "FP32"; expectedPrecisions["Add_1"] = "FP32"; expectedPrecisions["Relu_1"] = "FP32"; - expectedPrecisions["Conc_1"] = "FP32"; + expectedPrecisions["Conc_1"] = "BF16"; expectedPrecisions["Matmul_1"] = "BF16"; } }; diff --git 
a/inference-engine/tests/functional/plugin/cpu/bfloat16/mobilenet_ssd_with_branching.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/mobilenet_ssd_with_branching.cpp index 4855ca390151d2..aca7bd6eec27c4 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/mobilenet_ssd_with_branching.cpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/mobilenet_ssd_with_branching.cpp @@ -24,7 +24,7 @@ class MobileNet_ssd_with_branching : public BasicBF16Test { // | // Conv1 (FP32) // | \ - // Conv2 (FP32 so far while we have not greedy mode. This must be fixed. Such pattern shouild have Conv2 in BF16) + // Conv2 (BF16) \ // | | // relu(fused) | // | Normalize (not LRN) @@ -145,18 +145,18 @@ class MobileNet_ssd_with_branching : public BasicBF16Test { fnPtr = createGraph(netPrecision); // STAGE1: - threshold = 0.8f; // max value in latest tensor is 87.67 + threshold = 0.85f; // max value in latest tensor is 87.67 // STAGE2: // filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in // performance counters expectedPrecisions["ADD_1"] = "FP32"; expectedPrecisions["CONV_1"] = "BF16"; - expectedPrecisions["CONV_2"] = "FP32"; + expectedPrecisions["CONV_2"] = "BF16"; expectedPrecisions["RELU_2"] = "ndef"; expectedPrecisions["DW_CONV"] = "BF16"; expectedPrecisions["RELU_DW"] = "ndef"; expectedPrecisions["NORM_1"] = "FP32"; - expectedPrecisions["CONC_1"] = "FP32"; + expectedPrecisions["CONC_1"] = "BF16"; } }; diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_relu.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_relu.cpp index cff8ce820f8d4f..d1bfeb0de6f999 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_relu.cpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_relu.cpp @@ -93,7 +93,7 @@ class ScaleshiftConvRelu : public BasicBF16Test { fnPtr = createGraph(netPrecision); // STAGE1: - threshold = 5e-2; + threshold = 7e-2; // STAGE2: // filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in // performance counters diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_concat_relu.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_concat_relu.cpp index f8e5ae16c5da63..b94f24111d2abc 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_concat_relu.cpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_concat_relu.cpp @@ -117,7 +117,7 @@ class ScaleshiftConv_x2_ConcatRelu : public BasicBF16Test { expectedPrecisions["ADD_1"] = "FP32"; expectedPrecisions["CONV_1"] = "BF16"; expectedPrecisions["CONV_2"] = "BF16"; - expectedPrecisions["CONC_1"] = "FP32"; + expectedPrecisions["CONC_1"] = "BF16"; expectedPrecisions["RELU_1"] = "FP32"; } }; diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x3_conv_eltwise_relu.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x3_conv_eltwise_relu.cpp index 35cace53067492..a3a45a3e09c6d6 100644 --- a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x3_conv_eltwise_relu.cpp +++ b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x3_conv_eltwise_relu.cpp @@ -142,7 +142,7 @@ class Scaleshift_x3_ConvEltwiseRelu : public BasicBF16Test { fnPtr = createGraph(netPrecision); // STAGE1: - threshold = 2e-1; + threshold = 5e-1; 
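The relaxed thresholds in these bf16 tests follow from the narrow bfloat16 mantissa: only 7 fraction bits are kept, so a single truncating fp32-to-bf16 conversion may carry a relative error of up to 2^-7 (about 0.8%), and chains of convolution and eltwise primitives accumulate it further. A minimal standalone sketch, assuming the utils/bfloat16.hpp header added by this patch is on the include path (the header name and the 3887.11 sample value come from this patch; everything else is illustrative):

// Illustrative only: worst-case error of one truncating fp32 -> bf16 round trip.
#include <cmath>
#include <cstdio>
#include "utils/bfloat16.hpp"   // bfloat16_t from this patch (BFLOAT16_ROUND_MODE_TRUNCATE)

int main() {
    const float x = 3887.11f;                  // max fp32 output quoted in the gather_x2_add_mul_relu_concat_matmul test
    const MKLDNNPlugin::bfloat16_t bx = x;     // truncation drops the low 16 bits of the fp32 bit pattern
    const float back = static_cast<float>(bx);
    const float abs_err = std::fabs(back - x);
    std::printf("bf16 round trip: %f -> %f, abs err %f, rel err %f\n", x, back, abs_err, abs_err / x);
    // The relative error stays below 2^-7 per conversion; multi-layer bf16 graphs accumulate
    // such errors, which is why these thresholds sit at a few percent of the peak output value.
    return 0;
}
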
// STAGE2: // filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/plugin_config.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/plugin_config.cpp index 53e2dd7baa34de..4ad085c318fa70 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/plugin_config.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/plugin_config.cpp @@ -5,4 +5,9 @@ #include "functional_test_utils/plugin_config.hpp" void PreparePluginConfiguration(LayerTestsUtils::LayerTestsCommon* test) { + // Within the test scope we don't need any implicit bf16 optimisations, so let's run the network as is. + auto& configuration = test->GetConfiguration(); + if (!configuration.count(InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16)) { + configuration.insert({InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16, InferenceEngine::PluginConfigParams::NO}); + } } diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp index cb79f7c7f1555c..2ba662994afc30 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp @@ -5,10 +5,11 @@ #include #include +#include #include "functional_test_utils/skip_tests_config.hpp" std::vector disabledTestPatterns() { - return { + std::vector retVector{ // TODO: Issue 26264 R"(.*(MaxPool|AvgPool).*S\(1\.2\).*Rounding=ceil.*)", // TODO: Issue 31841 @@ -58,4 +59,12 @@ std::vector disabledTestPatterns() { // TODO: Issue 43417 sporadic issue, looks like an issue in test, reproducible only on Windows platform R"(.*decomposition1_batch=5_hidden_size=10_input_size=30_.*tanh.relu.*_clip=0_linear_before_reset=1.*_targetDevice=CPU_.*)", }; + + if (!InferenceEngine::with_cpu_x86_bfloat16()) { + // on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives, + // tests are useless on such platforms + retVector.emplace_back(R"(.*BF16.*)"); + } + + return retVector; } diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp new file mode 100644 index 00000000000000..975f790a4fa2db --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp @@ -0,0 +1,143 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; +using namespace ngraph::helpers; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + LayerTestsDefinitions::activationParams, + CPUSpecificParams> + ActivationLayerCPUTestParamSet; + +class ActivationLayerCPUTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + ActivationTypes activationType; + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + LayerTestsDefinitions::activationParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = obj.param; + + std::ostringstream result; + result << 
LayerTestsDefinitions::ActivationLayerTest::getTestCaseName(testing::TestParamInfo( + basicParamsSet, 0)); + + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } + InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override { + return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), 15, 0, 32768); + } + +protected: + void SetUp() override { + LayerTestsDefinitions::activationParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = this->GetParam(); + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + + InferenceEngine::Precision netPrecision; + std::pair, std::vector> shapes; + std::pair> activationDecl; + std::tie(activationDecl, netPrecision, inPrc, outPrc, inLayout, outLayout, shapes, targetDevice) = basicParamsSet; + selectedType = getPrimitiveType() + "_" + inPrc.name(); + + activationType = activationDecl.first; + auto constantsValue = activationDecl.second; + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {shapes.first}); + auto activation = ngraph::builder::makeActivation(params[0], ngPrc, activationType, shapes.second, constantsValue); + activation->get_rt_info() = getCPUInfo(); + function = std::make_shared(ngraph::NodeVector{activation}, params, "Activation"); + } +}; + +TEST_P(ActivationLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckCPUImpl(executableNetwork, "Eltwise"); +} + + +namespace { +// list only types supported by eltwise +const std::map>> activationTypes = { + {Sqrt, {{}}}, + {Sigmoid, {{}}}, + {Tanh, {{}}}, + {Relu, {{}}}, + {Gelu, {{}}}, + {Exp, {{}}}, + {Clamp, {{-2.0f, 2.0f}}}, + {Elu, {{0.1f}}}, + {Swish, {{0.1f}}}, + {HSwish, {{}}}, + {Mish, {{}}}, + {PReLu, {{-0.01f}}} +}; + +std::vector cpuParams_4D = { + CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}), + CPUSpecificParams({nhwc}, {nhwc}, {}, {}), + CPUSpecificParams({nchw}, {nchw}, {}, {}) +}; + +std::map, std::vector>> basic4D = { + {{2, 4, 4, 1}, {{}}}, + {{2, 17, 5, 4}, {{}}}, +}; + +std::vector bf16InpOutPrc = {Precision::BF16, Precision::FP32}; + +const auto basicCases4D = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)), + ::testing::Values(Precision::BF16), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::ValuesIn(CommonTestUtils::combineParams(basic4D)), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)) +); + +INSTANTIATE_TEST_CASE_P(smoke_Activation4D_Eltwise_CPU_BF16, ActivationLayerCPUTest, basicCases4D, ActivationLayerCPUTest::getTestCaseName); + +std::vector cpuParams_5D = { + CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}), + CPUSpecificParams({ndhwc}, {ndhwc}, {}, {}), + CPUSpecificParams({ncdhw}, {ncdhw}, {}, {}) +}; + +std::map, std::vector>> basic5D = { + {{2, 4, 3, 4, 1}, {{}}}, + {{2, 17, 7, 5, 4}, {{}}}, +}; + +const auto basicCases5D = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(CommonTestUtils::combineParams(activationTypes)), + ::testing::Values(Precision::BF16), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + 
::testing::ValuesIn(CommonTestUtils::combineParams(basic5D)), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)) +); + +INSTANTIATE_TEST_CASE_P(smoke_Activation5D_Eltwise_CPU_BF16, ActivationLayerCPUTest, basicCases5D, ActivationLayerCPUTest::getTestCaseName); +} // namespace +} // namespace CPULayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/convert.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/convert.cpp new file mode 100644 index 00000000000000..89159652129173 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/convert.cpp @@ -0,0 +1,57 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +using namespace LayerTestsDefinitions; +using namespace InferenceEngine; + +namespace CPULayerTestsDefinitions { + +class ConvertCPULayerTest : public ConvertLayerTest {}; + +TEST_P(ConvertCPULayerTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + ConvertParamsTuple params = GetParam(); + inPrc = std::get<1>(params); + outPrc = std::get<2>(params); + + Run(); +} + +namespace { +const std::vector> inShape = {{1, 2, 3, 4}}; + +// List of precisions natively supported by mkldnn. +const std::vector precisions = { + Precision::U8, + Precision::I8, + Precision::I16, + Precision::I32, + Precision::FP32, + Precision::BF16 +}; + +INSTANTIATE_TEST_CASE_P(smoke_ConvertLayerTest_From_BF16, ConvertCPULayerTest, + ::testing::Combine( + ::testing::Values(inShape), + ::testing::Values(Precision::BF16), + ::testing::ValuesIn(precisions), + ::testing::Values(Layout::ANY), + ::testing::Values(Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ConvertLayerTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_ConvertLayerTest_To_BF16, ConvertCPULayerTest, + ::testing::Combine( + ::testing::Values(inShape), + ::testing::ValuesIn(precisions), + ::testing::Values(Precision::BF16), + ::testing::Values(Layout::ANY), + ::testing::Values(Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ConvertLayerTest::getTestCaseName); +} // namespace +} // namespace CPULayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/crop.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/crop.cpp new file mode 100644 index 00000000000000..920ca6fba150c8 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/crop.cpp @@ -0,0 +1,176 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" + +// Given that the ngraph opset does not contain crop operation, we use the StridedSlice operation instead, since it is mapped to the Crop node if certain +// conditions are met. 
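As a hedged illustration of the comment above, the sketch below builds the kind of StridedSlice sub-graph the test exercises directly from opset1 (the test itself goes through ngraph::builder::makeStridedSlice); unit strides and empty new-axis, shrink-axis and ellipsis masks are assumed to be the conditions under which the CPU plugin maps StridedSlice to its Crop node.

#include <memory>
#include <vector>
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

// Builds a 32x32 -> [0:32, 20:30] slice with unit strides, assumed to be eligible
// for the StridedSlice -> Crop mapping in the CPU plugin.
std::shared_ptr<ngraph::Function> makeCropLikeSlice() {
    auto data = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{32, 32});
    auto begin = ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{0, 20});
    auto end = ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{32, 30});
    auto strides = ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{2}, std::vector<int64_t>{1, 1});
    auto slice = std::make_shared<ngraph::opset1::StridedSlice>(data, begin, end, strides,
                                                                std::vector<int64_t>{0, 0},   // begin_mask
                                                                std::vector<int64_t>{0, 0});  // end_mask
    return std::make_shared<ngraph::Function>(ngraph::NodeVector{slice}, ngraph::ParameterVector{data});
}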
+ +using namespace InferenceEngine; +using namespace CPUTestUtils; +using namespace LayerTestsDefinitions; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + StridedSliceSpecificParams, + InferenceEngine::Precision, // Net precision + std::string, // Device name + std::map, // Additional network configuration + CPUSpecificParams> CropLayerCPUTestParamSet; + +class CropLayerCPUTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + StridedSliceSpecificParams params; + InferenceEngine::Precision netPrc; + std::string targetName; + std::map additionalConfig; + CPUSpecificParams cpuParams; + std::tie(params, netPrc, targetName, additionalConfig, cpuParams) = obj.param; + + std::ostringstream result; + result << "inShape=" << CommonTestUtils::vec2str(params.inputShape) << "_"; + result << "netPRC=" << netPrc.name() << "_"; + result << "begin=" << CommonTestUtils::vec2str(params.begin) << "_"; + result << "end=" << CommonTestUtils::vec2str(params.end) << "_"; + result << "stride=" << CommonTestUtils::vec2str(params.strides) << "_"; + result << "begin_m=" << CommonTestUtils::vec2str(params.beginMask) << "_"; + result << "end_m=" << CommonTestUtils::vec2str(params.endMask) << "_"; + if (!params.newAxisMask.empty()) { + result << "new_axis_m=" << (params.newAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.newAxisMask)) << "_"; + } + if (!params.shrinkAxisMask.empty()) { + result << "shrink_m=" << (params.shrinkAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.shrinkAxisMask)) << "_"; + } + if (!params.ellipsisAxisMask.empty()) { + result << "ellipsis_m=" << (params.ellipsisAxisMask.empty() ? 
"def" : CommonTestUtils::vec2str(params.ellipsisAxisMask)) << "_"; + } + result << "trgDev=" << targetName; + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } +protected: + void SetUp() override { + StridedSliceSpecificParams ssParams; + InferenceEngine::Precision netPrecision; + std::map additionalConfig; + CPUSpecificParams cpuParams; + std::tie(ssParams, netPrecision, targetDevice, additionalConfig, cpuParams) = this->GetParam(); + inPrc = outPrc = netPrecision; // because crop does not convert Precisions, but only moves the data + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + configuration.insert(additionalConfig.begin(), additionalConfig.end()); + + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {ssParams.inputShape}); + auto paramOuts = ngraph::helpers::convert2OutputVector( + ngraph::helpers::castOps2Nodes(params)); + auto ss = ngraph::builder::makeStridedSlice(paramOuts[0], ssParams.begin, ssParams.end, ssParams.strides, ngPrc, ssParams.beginMask, + ssParams.endMask, ssParams.newAxisMask, ssParams.shrinkAxisMask, ssParams.ellipsisAxisMask); + + selectedType = std::string("unknown_") + inPrc.name(); + + ss->get_rt_info() = getCPUInfo(); + + ngraph::ResultVector results{std::make_shared(ss)}; + function = std::make_shared(results, params, "StridedSlice"); + } +}; + +TEST_P(CropLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckCPUImpl(executableNetwork, "Crop"); +} + +namespace { +const std::map additional_config; + +const std::vector netPrc = {Precision::BF16, Precision::FP32}; + +const std::vector testCasesPlain2D = {StridedSliceSpecificParams{ { 32, 32 }, { 0, 20 }, { 32, 30 }, { 1, 1 }, + { 0, 0 }, { 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 32, 20 }, { 2, 10 }, { 32, 20 }, { 1, 1 }, + { 0, 0 }, { 0, 0 }, { }, { }, { } } }; + +const auto CropParamsPlain2D = ::testing::Combine( + ::testing::ValuesIn(testCasesPlain2D), + ::testing::ValuesIn(netPrc), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::Values(emptyCPUSpec)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Plain_2D, CropLayerCPUTest, CropParamsPlain2D, CropLayerCPUTest::getTestCaseName); + +const std::vector testCasesPlain4D = { + StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 2, 5, 4 }, { 1, 4, 28, 27 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 0, 20, 20 }, { 1, 5, 25, 25 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 0, 0, 20 }, { 1, 5, 32, 30 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 32, 20 }, { 0, 0, 2, 10 }, { 1, 5, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } } +}; + +std::vector cpuParams_4D = { + CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}), + CPUSpecificParams({nchw}, {nchw}, {}, {}) +}; + +const auto CropParamsPlain4D = ::testing::Combine( + ::testing::ValuesIn(testCasesPlain4D), + ::testing::ValuesIn(netPrc), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::Values(cpuParams_4D.at(1))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Plain_4D, CropLayerCPUTest, CropParamsPlain4D, CropLayerCPUTest::getTestCaseName); + +const std::vector testCasesBlocked4D = { + 
StridedSliceSpecificParams{ { 1, 16, 32, 32 }, { 0, 0, 20, 20 }, { 1, 16, 25, 25 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 32, 32, 32 }, { 0, 0, 0, 20 }, { 1, 16, 32, 30 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, +}; + +const auto CropParamsBlocked4D = ::testing::Combine( + ::testing::ValuesIn(testCasesBlocked4D), + ::testing::ValuesIn(netPrc), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::Values(filterCPUSpecificParams(cpuParams_4D).front())); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Blocked_4D, CropLayerCPUTest, CropParamsBlocked4D, CropLayerCPUTest::getTestCaseName); + +const std::vector testCasesPlain4DynBatch = { + StridedSliceSpecificParams{ { 10, 5, 32, 32 }, { 0, 2, 5, 4 }, { 1, 4, 28, 27 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 10, 5, 32, 32 }, { 0, 0, 20, 20 }, { 1, 5, 25, 25 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 10, 5, 32, 32 }, { 0, 0, 0, 20 }, { 1, 5, 32, 30 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 10, 5, 32, 20 }, { 0, 0, 2, 10 }, { 1, 5, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } } +}; + +std::map additional_config_dyn_batch = {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}, + {PluginConfigParams::KEY_DYN_BATCH_ENABLED, PluginConfigParams::YES}}; + +const auto CropParamsPlain4DynBatch = ::testing::Combine( + ::testing::ValuesIn(testCasesPlain4DynBatch), + ::testing::ValuesIn(netPrc), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config_dyn_batch), + ::testing::Values(cpuParams_4D.at(1))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Blocked_4DynBatch, CropLayerCPUTest, CropParamsPlain4DynBatch, CropLayerCPUTest::getTestCaseName); +} // namespace +} // namespace CPULayerTestsDefinitions + diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp index 7d371b4b1a15ef..b968545b7dfb25 100644 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp @@ -47,17 +47,7 @@ class EltwiseLayerCPUTest : public testing::WithParamInterface inputShape1, inputShape2; if (inputShapes.size() == 1) { @@ -90,12 +80,7 @@ class EltwiseLayerCPUTest : public testing::WithParamInterface data(ngraph::shape_size(shape_input_secondary)); - data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape_input_secondary)); - for (float &i : data) { - if (i == 0) { - i = 1; - } - } + data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape_input_secondary), 10, 2); secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data); } else { secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary); @@ -105,7 +90,7 @@ class EltwiseLayerCPUTest : public testing::WithParamInterfaceget_rt_info() = CPUTestsBase::setCPUInfo(inFmts, outFmts, priority); + eltwise->get_rt_info() = getCPUInfo(); function = std::make_shared(eltwise, input, "Eltwise"); } }; @@ -114,7 +99,7 @@ TEST_P(EltwiseLayerCPUTest, CompareWithRefs) { SKIP_IF_CURRENT_TEST_IS_DISABLED() Run(); - CheckCPUImpl(executableNetwork, "Eltwise", 
inFmts, outFmts, selectedType); + CheckCPUImpl(executableNetwork, "Eltwise"); } namespace { @@ -128,7 +113,7 @@ std::vector opTypes = { CommonTestUtils::OpType::VECTOR, }; -std::vector eltwiseOpTypes = { +std::vector eltwiseOpTypesBinInp = { ngraph::helpers::EltwiseTypes::ADD, ngraph::helpers::EltwiseTypes::MULTIPLY, // TODO: Disabled because memory formats filter is not propogated through ngraph transformations @@ -138,27 +123,15 @@ std::vector eltwiseOpTypes = { ngraph::helpers::EltwiseTypes::SQUARED_DIFF, }; -std::map additional_config = {}; +std::vector eltwiseOpTypesDiffInp = { // Different number of input nodes depending on optimizations + ngraph::helpers::EltwiseTypes::POWER, + // ngraph::helpers::EltwiseTypes::MOD // Does not execute because of transformations +}; -std::vector filterCPUSpecificParams(std::vector& paramsVector) { - auto adjustBlockedFormatByIsa = [](std::vector& formats) { - for (int i = 0; i < formats.size(); i++) { - if (formats[i] == nChw16c) - formats[i] = nChw8c; - if (formats[i] == nCdhw16c) - formats[i] = nCdhw8c; - } - }; +std::map additional_config; - if (!with_cpu_x86_avx512f()) { - for (auto& param : paramsVector) { - adjustBlockedFormatByIsa(std::get<0>(param)); - adjustBlockedFormatByIsa(std::get<1>(param)); - } - } +std::vector bf16InpOutPrc = {Precision::BF16, Precision::FP32}; - return paramsVector; -} std::vector>> inShapes_4D = { {{2, 4, 4, 1}}, @@ -176,19 +149,50 @@ std::vector cpuParams_4D = { const auto params_4D_FP32 = ::testing::Combine( ::testing::Combine( ::testing::ValuesIn(inShapes_4D), - ::testing::ValuesIn(eltwiseOpTypes), + ::testing::ValuesIn(eltwiseOpTypesBinInp), ::testing::ValuesIn(secondaryInputTypes), ::testing::ValuesIn(opTypes), ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(InferenceEngine::Layout::ANY), ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config)), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32, EltwiseLayerCPUTest, params_4D_FP32, EltwiseLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_FP32_MemOrder, EltwiseLayerCPUTest, params_4D_FP32, EltwiseLayerCPUTest::getTestCaseName); + +const auto params_4D_BF16 = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::BF16), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_BF16_MemOrder, EltwiseLayerCPUTest, params_4D_BF16, EltwiseLayerCPUTest::getTestCaseName); + +const auto params_4D_BF16_emptyCPUSpec = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_4D), + ::testing::ValuesIn(eltwiseOpTypesDiffInp), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::BF16), + ::testing::ValuesIn(bf16InpOutPrc), + 
::testing::ValuesIn(bf16InpOutPrc), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::Values(emptyCPUSpec)); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_BF16, EltwiseLayerCPUTest, params_4D_BF16_emptyCPUSpec, EltwiseLayerCPUTest::getTestCaseName); std::vector>> inShapes_5D = { {{2, 4, 3, 4, 1}}, @@ -206,19 +210,50 @@ std::vector cpuParams_5D = { const auto params_5D_FP32 = ::testing::Combine( ::testing::Combine( ::testing::ValuesIn(inShapes_5D), - ::testing::ValuesIn(eltwiseOpTypes), + ::testing::ValuesIn(eltwiseOpTypesBinInp), ::testing::ValuesIn(secondaryInputTypes), ::testing::ValuesIn(opTypes), ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(InferenceEngine::Layout::ANY), ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config)), ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))); -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32, EltwiseLayerCPUTest, params_5D_FP32, EltwiseLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_FP32_MemOrder, EltwiseLayerCPUTest, params_5D_FP32, EltwiseLayerCPUTest::getTestCaseName); + +const auto params_5D_BF16 = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_5D), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::BF16), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_BF16_MemOrder, EltwiseLayerCPUTest, params_5D_BF16, EltwiseLayerCPUTest::getTestCaseName); + +const auto params_5D_BF16_emptyCPUSpec = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inShapes_5D), + ::testing::ValuesIn(eltwiseOpTypesDiffInp), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::Values(InferenceEngine::Precision::BF16), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::Values(emptyCPUSpec)); +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_BF16, EltwiseLayerCPUTest, params_5D_BF16_emptyCPUSpec, EltwiseLayerCPUTest::getTestCaseName); std::vector>> inShapes_4D_Blocked_Planar = { {{2, 17, 31, 3}, {2, 1, 31, 3}}, @@ -232,12 +267,12 @@ std::vector cpuParams_4D_Blocked_Planar = { const auto params_4D_FP32_Blocked_Planar = ::testing::Combine( ::testing::Combine( ::testing::ValuesIn(inShapes_4D_Blocked_Planar), - ::testing::ValuesIn(eltwiseOpTypes), + ::testing::ValuesIn(eltwiseOpTypesBinInp), ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), ::testing::ValuesIn(opTypes), ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + 
::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(InferenceEngine::Layout::ANY), ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config)), @@ -258,12 +293,12 @@ std::vector cpuParams_4D_Planar_Blocked = { const auto params_4D_FP32_Planar_Blocked = ::testing::Combine( ::testing::Combine( ::testing::ValuesIn(inShapes_4D_Planar_Blocked), - ::testing::ValuesIn(eltwiseOpTypes), + ::testing::ValuesIn(eltwiseOpTypesBinInp), ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), ::testing::ValuesIn(opTypes), ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(InferenceEngine::Layout::ANY), ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config)), @@ -284,12 +319,12 @@ std::vector cpuParams_5D_Blocked_Planar = { const auto params_5D_FP32_Blocked_Planar = ::testing::Combine( ::testing::Combine( ::testing::ValuesIn(inShapes_5D_Blocked_Planar), - ::testing::ValuesIn(eltwiseOpTypes), + ::testing::ValuesIn(eltwiseOpTypesBinInp), ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), ::testing::ValuesIn(opTypes), ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(InferenceEngine::Layout::ANY), ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config)), @@ -310,12 +345,12 @@ std::vector cpuParams_5D_Planar_Blocked = { const auto params_5D_FP32_Planar_Blocked = ::testing::Combine( ::testing::Combine( ::testing::ValuesIn(inShapes_5D_Planar_Blocked), - ::testing::ValuesIn(eltwiseOpTypes), + ::testing::ValuesIn(eltwiseOpTypesBinInp), ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), ::testing::ValuesIn(opTypes), ::testing::Values(InferenceEngine::Precision::FP32), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(InferenceEngine::Layout::ANY), ::testing::Values(CommonTestUtils::DEVICE_CPU), ::testing::Values(additional_config)), diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/group_convolution.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/group_convolution.cpp index a5b9f09fbc8851..784ced4c22649b 100644 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/group_convolution.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/group_convolution.cpp @@ -57,7 +57,7 @@ class GroupConvolutionLayerCPUTest : public testing::WithParamInterface( ngraph::builder::makeGroupConvolution(paramOuts[0], ngPrc, kernel, stride, padBegin, padEnd, dilation, padType, convOutChannels, numGroups)); - groupConv->get_rt_info() = setCPUInfo(inFmts, outFmts, priority); + groupConv->get_rt_info() = getCPUInfo(); ngraph::ResultVector results{std::make_shared(groupConv)}; function = std::make_shared(results, params, "groupConvolution"); } @@ -67,7 +67,7 @@ 
TEST_P(GroupConvolutionLayerCPUTest, CompareWithRefs) { SKIP_IF_CURRENT_TEST_IS_DISABLED() Run(); - CheckCPUImpl(executableNetwork, "Convolution", inFmts, outFmts, selectedType); + CheckCPUImpl(executableNetwork, "Convolution"); } namespace { diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/interpolate.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/interpolate.cpp index 3c3891c899bc4a..9d153429994aba 100644 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/interpolate.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/interpolate.cpp @@ -78,21 +78,17 @@ class InterpolateLayerCPUTest : public testing::WithParamInterfaceget_rt_info() = CPUTestsBase::setCPUInfo(inFmts, outFmts, priority); + interpolate->get_rt_info() = getCPUInfo(); const ngraph::ResultVector results{std::make_shared(interpolate)}; function = std::make_shared(results, params, "interpolate"); } - - std::vector inFmts, outFmts; - std::vector priority; - std::string selectedType; }; TEST_P(InterpolateLayerCPUTest, CompareWithRefs) { SKIP_IF_CURRENT_TEST_IS_DISABLED() Run(); - CheckCPUImpl(executableNetwork, "Interpolate", inFmts, outFmts, selectedType); + CheckCPUImpl(executableNetwork, "Interpolate"); } namespace { diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/logical.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/logical.cpp new file mode 100644 index 00000000000000..a968df20a184ef --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/logical.cpp @@ -0,0 +1,158 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; +using namespace ngraph::helpers; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + LayerTestsDefinitions::LogicalTestParams, + CPUSpecificParams> +LogicalLayerCPUTestParamSet; + +class LogicalLayerCPUTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + LayerTestsDefinitions::LogicalTestParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = obj.param; + + std::ostringstream result; + result << LayerTestsDefinitions::LogicalLayerTest::getTestCaseName(testing::TestParamInfo( + basicParamsSet, 0)); + + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } + +protected: + void SetUp() override { + LayerTestsDefinitions::LogicalTestParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = this->GetParam(); + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + + LayerTestsDefinitions::LogicalParams::InputShapesTuple inputShapes; + ngraph::helpers::LogicalTypes logicalOpType; + ngraph::helpers::InputLayerType secondInputType; + InferenceEngine::Precision netPrecision; + std::string targetName; + std::map additional_config; + std::tie(inputShapes, logicalOpType, secondInputType, netPrecision, inPrc, outPrc, + inLayout, outLayout, targetDevice, additional_config) = basicParamsSet; + + selectedType = getPrimitiveType() + "_" + inPrc.name(); + + auto ngInputsPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(Precision::BOOL); // Because ngraph supports only boolean input for logical 
ops + configuration.insert(additional_config.begin(), additional_config.end()); + + auto inputs = ngraph::builder::makeParams(ngInputsPrc, {inputShapes.first}); + + std::shared_ptr logicalNode; + if (logicalOpType != ngraph::helpers::LogicalTypes::LOGICAL_NOT) { + auto secondInput = ngraph::builder::makeInputLayer(ngInputsPrc, secondInputType, inputShapes.second); + if (secondInputType == ngraph::helpers::InputLayerType::PARAMETER) { + inputs.push_back(std::dynamic_pointer_cast(secondInput)); + } + logicalNode = ngraph::builder::makeLogical(inputs[0], secondInput, logicalOpType); + } else { + logicalNode = ngraph::builder::makeLogical(inputs[0], ngraph::Output(), logicalOpType); + } + + logicalNode->get_rt_info() = getCPUInfo(); + + function = std::make_shared(logicalNode, inputs, "Logical"); + } +}; + +TEST_P(LogicalLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckCPUImpl(executableNetwork, "Eltwise"); +} + +namespace { + +std::map, std::vector>> inputShapes = { + {{1}, {{1}, {17}, {1, 1}, {2, 18}, {1, 1, 2}, {2, 2, 3}, {1, 1, 2, 3}}}, + {{5}, {{1}, {1, 1}, {2, 5}, {1, 1, 1}, {2, 2, 5}}}, + {{2, 200}, {{1}, {200}, {1, 200}, {2, 200}, {2, 2, 200}}}, + {{1, 3, 20}, {{20}, {2, 1, 1}}}, + {{2, 17, 3, 4}, {{4}, {1, 3, 4}, {2, 1, 3, 4}}}, + {{2, 1, 1, 3, 1}, {{1}, {1, 3, 4}, {2, 1, 3, 4}, {1, 1, 1, 1, 1}}}, +}; + +std::map, std::vector>> inputShapesNot = { + {{1}, {}}, + {{5}, {}}, + {{2, 200}, {}}, + {{1, 3, 20}, {}}, + {{2, 17, 3, 4}, {}}, + {{2, 1, 1, 3, 1}, {}}, +}; + +std::vector inputsPrecisions = { + InferenceEngine::Precision::BOOL, +}; + +std::vector logicalOpTypes = { + ngraph::helpers::LogicalTypes::LOGICAL_AND, + ngraph::helpers::LogicalTypes::LOGICAL_OR, + ngraph::helpers::LogicalTypes::LOGICAL_XOR, +}; + +std::vector secondInputTypes = { + ngraph::helpers::InputLayerType::CONSTANT, + ngraph::helpers::InputLayerType::PARAMETER, +}; + +std::map additional_config; + +std::vector bf16InpOutPrc = {Precision::BF16, Precision::FP32}; + +const auto LogicalTestParams = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(LayerTestsDefinitions::LogicalLayerTest::combineShapes(inputShapes)), + ::testing::ValuesIn(logicalOpTypes), + ::testing::ValuesIn(secondInputTypes), + ::testing::Values(Precision::BF16), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::Values(Layout::ANY), + ::testing::Values(Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::Values(emptyCPUSpec)); + +const auto LogicalTestParamsNot = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(LayerTestsDefinitions::LogicalLayerTest::combineShapes(inputShapesNot)), + ::testing::Values(ngraph::helpers::LogicalTypes::LOGICAL_NOT), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::Values(Precision::BF16), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::ValuesIn(bf16InpOutPrc), + ::testing::Values(Layout::ANY), + ::testing::Values(Layout::ANY), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::Values(emptyCPUSpec)); + + +INSTANTIATE_TEST_CASE_P(smoke_Logical_Eltwise_CPU_BF16, LogicalLayerCPUTest, LogicalTestParams, LogicalLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_Logical_Not_Eltwise_CPU_BF16, LogicalLayerCPUTest, LogicalTestParamsNot, LogicalLayerCPUTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions \ No newline at end of file diff 
--git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mvn.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mvn.cpp new file mode 100644 index 00000000000000..ad120a1b94051a --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/mvn.cpp @@ -0,0 +1,200 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + LayerTestsDefinitions::mvnParams, + CPUSpecificParams, + Precision, // CNNNetwork input precision + Precision> // CNNNetwork output precision +MvnLayerCPUTestParamSet; + +class MvnLayerCPUTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + LayerTestsDefinitions::mvnParams basicParamsSet; + CPUSpecificParams cpuParams; + Precision inputPrecision, outputPrecision; + std::tie(basicParamsSet, cpuParams, inputPrecision, outputPrecision) = obj.param; + + std::ostringstream result; + result << LayerTestsDefinitions::MvnLayerTest::getTestCaseName(testing::TestParamInfo( + basicParamsSet, 0)); + + result << "_" << "CNNInpPrc=" << inputPrecision.name(); + result << "_" << "CNNOutPrc=" << outputPrecision.name(); + + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } +protected: + void SetUp() override { + LayerTestsDefinitions::mvnParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams, inPrc, outPrc) = this->GetParam(); + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + + InferenceEngine::SizeVector inputShapes; + InferenceEngine::Precision netPrecision; + bool acrossChanels, normalizeVariance; + double eps; + std::tie(inputShapes, netPrecision, acrossChanels, normalizeVariance, eps, targetDevice) = basicParamsSet; + auto netPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto param = ngraph::builder::makeParams(netPrc, {inputShapes}); + auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(param)); + auto mvn = ngraph::builder::makeMVN(paramOuts[0], acrossChanels, normalizeVariance, eps); + ngraph::ResultVector results{std::make_shared(mvn)}; + + selectedType = getPrimitiveType() + "_" + inPrc.name(); + + threshold = 0.015f; + + mvn->get_rt_info() = getCPUInfo(); + + function = std::make_shared(results, param, "mvn"); + } +}; + +TEST_P(MvnLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckCPUImpl(executableNetwork, "MVN"); +} + +namespace { +const std::vector> inputShapes_3D = { + {1, 32, 17}, + {1, 37, 9}, +}; + +const std::vector> inputShapes_4D = { + {1, 16, 5, 8}, + {2, 19, 5, 10}, + {7, 32, 2, 8}, + {5, 8, 3, 5}, + {4, 41, 6, 9} +}; + +const std::vector> inputShapes_5D = { + {1, 32, 8, 1, 6}, + {1, 9, 1, 15, 9}, + {6, 64, 6, 1, 18}, + {2, 31, 2, 9, 1}, + {10, 16, 5, 10, 6} +}; + +const std::vector acrossChannels = { + true, + false +}; + +const std::vector normalizeVariance = { + true, + false +}; + +const std::vector epsilon = { + 0.000000001 +}; + +std::vector inpOutPrc = {Precision::BF16, Precision::FP32}; + +std::vector cpuParams_4D = { + CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}), + CPUSpecificParams({nchw}, {nchw}, {}, {}) +}; + +std::vector 
cpuParams_5D = { + CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}), + CPUSpecificParams({ncdhw}, {ncdhw}, {}, {}) +}; + +const auto Mvn3D = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inputShapes_3D), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::ValuesIn(acrossChannels), + ::testing::ValuesIn(normalizeVariance), + ::testing::ValuesIn(epsilon), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(emptyCPUSpec), + ::testing::ValuesIn(inpOutPrc), + ::testing::ValuesIn(inpOutPrc)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_3D, MvnLayerCPUTest, Mvn3D, MvnLayerCPUTest::getTestCaseName); + +const auto Mvn4D = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inputShapes_4D), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::ValuesIn(acrossChannels), + ::testing::ValuesIn(normalizeVariance), + ::testing::ValuesIn(epsilon), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), + ::testing::ValuesIn(inpOutPrc), + ::testing::ValuesIn(inpOutPrc)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D, MvnLayerCPUTest, Mvn4D, MvnLayerCPUTest::getTestCaseName); + + +const auto MvnNHWC = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inputShapes_4D), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(false), + ::testing::Values(true), + ::testing::ValuesIn(epsilon), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(CPUSpecificParams({nhwc}, {nhwc}, {}, {})), + ::testing::ValuesIn(inpOutPrc), + ::testing::ValuesIn(inpOutPrc)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_NHWC, MvnLayerCPUTest, MvnNHWC, MvnLayerCPUTest::getTestCaseName); + +const auto MvnNDHWC = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inputShapes_5D), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(false), + ::testing::Values(true), + ::testing::ValuesIn(epsilon), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(CPUSpecificParams({ndhwc}, {ndhwc}, {}, {})), + ::testing::ValuesIn(inpOutPrc), + ::testing::ValuesIn(inpOutPrc)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_NDHWC, MvnLayerCPUTest, MvnNDHWC, MvnLayerCPUTest::getTestCaseName); + + +const auto Mvn5D = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(inputShapes_5D), + ::testing::Values(InferenceEngine::Precision::FP32), + ::testing::ValuesIn(acrossChannels), + ::testing::ValuesIn(normalizeVariance), + ::testing::ValuesIn(epsilon), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)), + ::testing::ValuesIn(inpOutPrc), + ::testing::ValuesIn(inpOutPrc)); + + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D, MvnLayerCPUTest, Mvn5D, MvnLayerCPUTest::getTestCaseName); + + +} // namespace +} // namespace CPULayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/normalize.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/normalize.cpp new file mode 100755 index 00000000000000..9b182a1b1e90f1 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/normalize.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; 
+using namespace CPUTestUtils; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + LayerTestsDefinitions::NormalizeL2LayerTestParams, + CPUSpecificParams> +NormalizeL2LayerCPUTestParamSet; + +class NormalizeL2LayerCPUTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + LayerTestsDefinitions::NormalizeL2LayerTestParams basicParamsSet; + CPUSpecificParams cpuParams; + Precision inputPrecision, outputPrecision; + std::tie(basicParamsSet, cpuParams) = obj.param; + + std::ostringstream result; + result << LayerTestsDefinitions::NormalizeL2LayerTest::getTestCaseName(testing::TestParamInfo( + basicParamsSet, 0)); + + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } +protected: + void SetUp() override { + LayerTestsDefinitions::NormalizeL2LayerTestParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = this->GetParam(); + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + + std::vector axes; + float eps; + ngraph::op::EpsMode eps_mode; + InferenceEngine::SizeVector inputShapes; + InferenceEngine::Precision netPrecision; + std::tie(axes, eps, eps_mode, inputShapes, netPrecision, targetDevice) = basicParamsSet; + inPrc = outPrc = netPrecision; + auto netPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto param = ngraph::builder::makeParams(netPrc, {inputShapes}); + auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(param)); + auto normalize_l2 = ngraph::builder::makeNormalizeL2(paramOuts[0], axes, eps, eps_mode); + + ngraph::ResultVector results{std::make_shared(normalize_l2)}; + + if (Precision::BF16 == netPrecision) { + selectedType = "unknown_BF16"; + } else if (Precision::FP32 == netPrecision) { + selectedType = "unknown_FP32"; + } + + threshold = 0.015f; + + normalize_l2->get_rt_info() = getCPUInfo(); + + function = std::make_shared(results, param, "Normalize"); + } +}; + +TEST_P(NormalizeL2LayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckCPUImpl(executableNetwork, "Normalize"); +} + +namespace { + +const std::vector> axes = { + {}, + {1}, +}; +const std::vector eps = { 1e-4f }; + +const std::vector epsMode = { + ngraph::op::EpsMode::ADD, + ngraph::op::EpsMode::MAX, +}; + +std::vector inpOutPrc = {Precision::BF16}; + +std::vector cpuParams_4D = { + CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}), + CPUSpecificParams({nhwc}, {nhwc}, {}, {}), + CPUSpecificParams({nchw}, {nchw}, {}, {}) +}; + + +const std::vector netPrecisions = { + Precision::FP32, + Precision::BF16 +}; + +const auto NormalizeL23D = testing::Combine( + testing::Combine( + testing::ValuesIn(axes), + testing::ValuesIn(eps), + testing::ValuesIn(epsMode), + testing::Values(std::vector{1, 32, 17}), + testing::ValuesIn(netPrecisions), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::Values(emptyCPUSpec)); + +INSTANTIATE_TEST_CASE_P(smoke_NormalizeL2CompareWithRefs_3D, NormalizeL2LayerCPUTest, NormalizeL23D, NormalizeL2LayerCPUTest::getTestCaseName); + +const auto NormalizeL24D = testing::Combine( + testing::Combine( + testing::ValuesIn(axes), + testing::ValuesIn(eps), + testing::ValuesIn(epsMode), + testing::Values(std::vector{1, 3, 10, 5}), + testing::ValuesIn(netPrecisions), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); + 
+INSTANTIATE_TEST_CASE_P(smoke_NormalizeL2CompareWithRefs_4D, NormalizeL2LayerCPUTest, NormalizeL24D, NormalizeL2LayerCPUTest::getTestCaseName); + + +} // namespace +} // namespace CPULayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/permute.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/permute.cpp new file mode 100644 index 00000000000000..a0bf55781539b6 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/permute.cpp @@ -0,0 +1,148 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" + +// Since the Transpose ngraph operation is converted to the permute node, we will use it in the permute test + +using namespace InferenceEngine; +using namespace CPUTestUtils; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + std::vector, // Input order + InferenceEngine::Precision, // Net precision + std::vector, // Input shapes + std::string, // Target device name + std::map, // Additional network configuration + CPUSpecificParams> PermuteLayerCPUTestParamSet; + +class PermuteLayerCPUTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + Precision netPrecision; + std::vector inputShape, inputOrder; + std::string targetDevice; + CPUSpecificParams cpuParams; + std::map additionalConfig; + std::tie(inputOrder, netPrecision, inputShape, targetDevice, additionalConfig, cpuParams) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "inputOrder=" << CommonTestUtils::vec2str(inputOrder) << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "trgDev=" << targetDevice; + result << CPUTestsBase::getTestCaseName(cpuParams); + return result.str(); + } +protected: + void SetUp() override { + SetRefMode(LayerTestsUtils::RefMode::CONSTANT_FOLDING); + + Precision netPrecision; + std::vector inputShape, inputOrder; + CPUSpecificParams cpuParams; + std::map additionalConfig; + std::tie(inputOrder, netPrecision, inputShape, targetDevice, additionalConfig, cpuParams) = this->GetParam(); + configuration.insert(additionalConfig.begin(), additionalConfig.end()); + inPrc = outPrc = netPrecision; // since the layer does not convert precisions + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + + selectedType = std::string("unknown_") + inPrc.name(); + + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {inputShape}); + auto paramOuts = ngraph::helpers::convert2OutputVector( + ngraph::helpers::castOps2Nodes(params)); + + const auto inOrderShape = inputOrder.empty() ? 
ngraph::Shape({0}) : ngraph::Shape({inputShape.size()}); + const auto inputOrderOp = std::make_shared(ngraph::element::i64, + inOrderShape, + inputOrder); + const auto transpose = std::make_shared(paramOuts.at(0), inputOrderOp); + transpose->get_rt_info() = getCPUInfo(); + const ngraph::ResultVector results{std::make_shared(transpose)}; + function = std::make_shared(results, params, "Transpose"); + } +}; + +TEST_P(PermuteLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckCPUImpl(executableNetwork, "Permute"); +} + +namespace { +std::map additional_config; + +const std::vector netPrecisions = { + Precision::BF16, + Precision::FP32 +}; + +const std::vector> inputShapes4D = { + {2, 32, 10, 20} +}; + +const std::vector> inputOrder4D = { + std::vector{0, 1, 2, 3}, + std::vector{0, 2, 3, 1}, + std::vector{0, 2, 1, 3}, + std::vector{1, 0, 2, 3}, + std::vector{}, +}; + +std::vector cpuParams_4D = { + CPUSpecificParams({nChw16c}, {}, {}, {}), + CPUSpecificParams({nchw}, {}, {}, {}), +}; + +const auto params4D = ::testing::Combine( + ::testing::ValuesIn(inputOrder4D), + ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(inputShapes4D), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); + +INSTANTIATE_TEST_CASE_P(smoke_Permute4D_CPU, PermuteLayerCPUTest, params4D, PermuteLayerCPUTest::getTestCaseName); + +const std::vector> inputShapes5D = { + {2, 32, 5, 10, 20} +}; + +const std::vector> inputOrder5D = { + std::vector{0, 1, 2, 3, 4}, + std::vector{0, 4, 2, 3, 1}, + std::vector{0, 4, 2, 1, 3}, + std::vector{0, 2, 4, 3, 1}, + std::vector{0, 3, 2, 4, 1}, + std::vector{0, 3, 1, 4, 2}, + std::vector{1, 0, 2, 3, 4}, + std::vector{}, +}; + +std::vector cpuParams_5D = { + CPUSpecificParams({nCdhw16c}, {}, {}, {}), + CPUSpecificParams({ncdhw}, {}, {}, {}), +}; + +const auto params5D = ::testing::Combine( + ::testing::ValuesIn(inputOrder5D), + ::testing::ValuesIn(netPrecisions), + ::testing::ValuesIn(inputShapes5D), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))); + +INSTANTIATE_TEST_CASE_P(smoke_Permute5D_CPU, PermuteLayerCPUTest, params5D, PermuteLayerCPUTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/reduce_ops.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/reduce_ops.cpp new file mode 100644 index 00000000000000..becf723a81fc9d --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/reduce_ops.cpp @@ -0,0 +1,352 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; +using namespace LayerTestsDefinitions; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple ReduceLayerCPUTestParamSet; + +class ReduceCPULayerTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + reduceMeanParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = obj.param; + + std::ostringstream result; + result << 
LayerTestsDefinitions::ReduceOpsLayerTest::getTestCaseName(testing::TestParamInfo( + basicParamsSet, 0)); + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } +protected: + void SetUp() override { + reduceMeanParams basicParamsSet; + CPUSpecificParams cpuParams; + std::tie(basicParamsSet, cpuParams) = this->GetParam(); + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + + InferenceEngine::Precision netPrecision; + bool keepDims; + std::vector inputShape; + std::vector axes; + CommonTestUtils::OpType opType; + std::tie(axes, opType, keepDims, reductionType, netPrecision, inPrc, outPrc, inLayout, inputShape, targetDevice) = basicParamsSet; + + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {inputShape}); + auto paramOuts = ngraph::helpers::convert2OutputVector( + ngraph::helpers::castOps2Nodes(params)); + + std::vector shapeAxes; + switch (opType) { + case CommonTestUtils::OpType::SCALAR: { + if (axes.size() > 1) + FAIL() << "In reduce op if op type is scalar, 'axis' input's must contain 1 element"; + break; + } + case CommonTestUtils::OpType::VECTOR: { + shapeAxes.push_back(axes.size()); + break; + } + default: + FAIL() << "Reduce op doesn't support operation type: " << opType; + } + auto reductionAxesNode = std::dynamic_pointer_cast( + std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape(shapeAxes), axes)); + + const auto reduce = ngraph::builder::makeReduce(paramOuts[0], reductionAxesNode, keepDims, reductionType); + + selectedType = getPrimitiveType() + "_" + inPrc.name(); + + reduce->get_rt_info() = getCPUInfo(); + + const ngraph::ResultVector results{std::make_shared(reduce)}; + function = std::make_shared(results, params, "Reduce"); + } + InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override { + if (ngraph::helpers::ReductionType::Prod == reductionType) { + // We change the range of random values to avoid possible floating point overflow + auto blob = FuncTestUtils::createAndFillBlob(info.getTensorDesc(), 10, 5); + if (Precision::FP32 == info.getTensorDesc().getPrecision()) { + auto *rawBlobDataPtr = blob->buffer().as(); + for (size_t i = 0; i < blob->size(); ++i) { + rawBlobDataPtr[i] /= 10.f; + } + } else if (Precision::BF16 == info.getTensorDesc().getPrecision()) { + auto *rawBlobDataPtr = blob->buffer().as(); + for (size_t i = 0; i < blob->size(); ++i) { + rawBlobDataPtr[i] /= 10.f; + } + } + return blob; + } else { + return LayerTestsCommon::GenerateInput(info); + } + } + +private: + ngraph::helpers::ReductionType reductionType; +}; + +TEST_P(ReduceCPULayerTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto ops = function->get_ordered_ops(); + std::string name = (*(++ops.rbegin()))->get_type_name(); + + if ("ReduceLogicalAnd" == name) { + name = "ReduceAnd"; + } + if ("ReduceLogicalOr" == name) { + name = "ReduceOr"; + } + + Run(); + CheckCPUImpl(executableNetwork, name); +} +namespace { +std::vector inpOutPrc = {Precision::BF16, Precision::FP32}; + +const std::vector keepDims = { + true, + false, +}; + +const std::vector> axes = { + {0}, + {1}, + {2}, + {3} +}; + +const std::vector> axesND = { + {0, 1}, + {0, 2}, + {0, 3}, + {1, 2}, + {1, 3}, + {2, 3}, + {0, 1, 2}, + {0, 1, 3}, + {0, 2, 3}, + {1, 2, 3}, + {0, 1, 2, 3} +}; + +std::vector opTypes = { + CommonTestUtils::OpType::SCALAR, + CommonTestUtils::OpType::VECTOR, +}; + +const std::vector reductionTypes = { +// 
ngraph::helpers::ReductionType::Mean, //optimized out during the graph transformations +// ngraph::helpers::ReductionType::Max, //optimized out during the graph transformations +// ngraph::helpers::ReductionType::Sum, //optimized out during the graph transformations + ngraph::helpers::ReductionType::Min, + ngraph::helpers::ReductionType::Prod, + ngraph::helpers::ReductionType::L1, + ngraph::helpers::ReductionType::L2, +}; + +const std::vector reductionLogicalTypes = { + ngraph::helpers::ReductionType::LogicalOr, + ngraph::helpers::ReductionType::LogicalAnd +}; + +const std::vector> inputShapes = { + std::vector{10, 5, 15, 12}, + std::vector{3, 5, 7, 9}, +}; + +std::vector cpuParams_4D = { + CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}), + CPUSpecificParams({nchw}, {nchw}, {}, {}) +}; + +std::vector cpuParams_5D = { + CPUSpecificParams({nCdhw16c}, {nCdhw16c}, {}, {}), + CPUSpecificParams({ncdhw}, {ncdhw}, {}, {}) +}; + +const auto paramsOneAxis = ::testing::Combine( + testing::Combine( + testing::ValuesIn(axes), + testing::ValuesIn(opTypes), + testing::ValuesIn(keepDims), + testing::ValuesIn(reductionTypes), + testing::Values(InferenceEngine::Precision::FP32), + testing::ValuesIn(inpOutPrc), + testing::ValuesIn(inpOutPrc), + testing::Values(InferenceEngine::Layout::ANY), + testing::ValuesIn(inputShapes), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::Values(emptyCPUSpec)); + +const auto paramsOneAxisLogical = testing::Combine( + testing::Combine( + testing::ValuesIn(axes), + testing::ValuesIn(opTypes), + testing::ValuesIn(keepDims), + testing::ValuesIn(reductionLogicalTypes), + testing::Values(InferenceEngine::Precision::BOOL), + testing::ValuesIn(inpOutPrc), + testing::ValuesIn(inpOutPrc), + testing::Values(InferenceEngine::Layout::ANY), + testing::ValuesIn(inputShapes), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::Values(emptyCPUSpec)); + +const auto params_MultiAxis = testing::Combine( + testing::Combine( + testing::ValuesIn(axesND), + testing::Values(opTypes[1]), + testing::Values(false), + testing::ValuesIn(reductionTypes), + testing::Values(InferenceEngine::Precision::FP32), + testing::ValuesIn(inpOutPrc), + testing::ValuesIn(inpOutPrc), + testing::Values(InferenceEngine::Layout::ANY), + testing::Values(std::vector{2, 9, 2, 9}), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::Values(emptyCPUSpec)); + +const auto params_MultiAxis_4D = testing::Combine( + testing::Combine( + testing::ValuesIn(axesND), + testing::Values(opTypes[1]), + testing::Values(true), + testing::ValuesIn(reductionTypes), + testing::Values(InferenceEngine::Precision::FP32), + testing::ValuesIn(inpOutPrc), + testing::ValuesIn(inpOutPrc), + testing::Values(InferenceEngine::Layout::ANY), + testing::Values(std::vector{2, 19, 2, 9}), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); + +const auto params_MultiAxis_5D = testing::Combine( + testing::Combine( + testing::ValuesIn(axesND), + testing::Values(opTypes[1]), + testing::Values(true), + testing::ValuesIn(reductionTypes), + testing::Values(InferenceEngine::Precision::FP32), + testing::ValuesIn(inpOutPrc), + testing::ValuesIn(inpOutPrc), + testing::Values(InferenceEngine::Layout::ANY), + testing::Values(std::vector{2, 19, 7, 2, 9}), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))); + +const auto params_MultiAxisLogical = testing::Combine( + testing::Combine( + testing::ValuesIn(axesND), + 
testing::Values(opTypes[1]), + testing::Values(false), + testing::ValuesIn(reductionLogicalTypes), + testing::Values(InferenceEngine::Precision::BOOL), + testing::ValuesIn(inpOutPrc), + testing::ValuesIn(inpOutPrc), + testing::Values(InferenceEngine::Layout::ANY), + testing::Values(std::vector{2, 9, 2, 9}), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::Values(emptyCPUSpec)); + +const auto params_MultiAxisLogical4D = testing::Combine( + testing::Combine( + testing::ValuesIn(axesND), + testing::Values(opTypes[1]), + testing::Values(true), + testing::ValuesIn(reductionLogicalTypes), + testing::Values(InferenceEngine::Precision::BOOL), + testing::ValuesIn(inpOutPrc), + testing::ValuesIn(inpOutPrc), + testing::Values(InferenceEngine::Layout::ANY), + testing::Values(std::vector{2, 19, 2, 9}), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); + +const auto params_MultiAxisLogical5D = testing::Combine( + testing::Combine( + testing::ValuesIn(axesND), + testing::Values(opTypes[1]), + testing::Values(true), + testing::ValuesIn(reductionLogicalTypes), + testing::Values(InferenceEngine::Precision::BOOL), + testing::ValuesIn(inpOutPrc), + testing::ValuesIn(inpOutPrc), + testing::Values(InferenceEngine::Layout::ANY), + testing::Values(std::vector{2, 19, 7, 2, 9}), + testing::Values(CommonTestUtils::DEVICE_CPU)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))); + +INSTANTIATE_TEST_CASE_P( + smoke_ReduceOneAxis_CPU, + ReduceCPULayerTest, + paramsOneAxis, + ReduceCPULayerTest::getTestCaseName +); + +INSTANTIATE_TEST_CASE_P( + smoke_ReduceLogicalOneAxis_CPU, + ReduceCPULayerTest, + paramsOneAxisLogical, + ReduceCPULayerTest::getTestCaseName +); + +INSTANTIATE_TEST_CASE_P( + smoke_Reduce_ReductionTypes_CPU, + ReduceCPULayerTest, + params_MultiAxis, + ReduceCPULayerTest::getTestCaseName +); + +INSTANTIATE_TEST_CASE_P( + smoke_Reduce_ReductionTypes4D_CPU, + ReduceCPULayerTest, + params_MultiAxis_4D, + ReduceCPULayerTest::getTestCaseName +); + +INSTANTIATE_TEST_CASE_P( + smoke_Reduce_ReductionTypes5D_CPU, + ReduceCPULayerTest, + params_MultiAxis_5D, + ReduceCPULayerTest::getTestCaseName +); + +INSTANTIATE_TEST_CASE_P( + smoke_ReduceLogical_ReductionTypes_CPU, + ReduceCPULayerTest, + params_MultiAxisLogical, + ReduceCPULayerTest::getTestCaseName +); + +INSTANTIATE_TEST_CASE_P( + smoke_ReduceLogical4D_ReductionTypes_CPU, + ReduceCPULayerTest, + params_MultiAxisLogical4D, + ReduceCPULayerTest::getTestCaseName +); + +INSTANTIATE_TEST_CASE_P( + smoke_ReduceLogical5D_ReductionTypes_CPU, + ReduceCPULayerTest, + params_MultiAxisLogical5D, + ReduceCPULayerTest::getTestCaseName +); +} // namespace +} // namespace CPULayerTestsDefinitions + diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/region_yolo.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/region_yolo.cpp new file mode 100644 index 00000000000000..2fedfd2b2804f4 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/region_yolo.cpp @@ -0,0 +1,165 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; + +namespace CPULayerTestsDefinitions { + +struct regionYoloAttributes { + size_t classes; + size_t coordinates; + size_t num_regions; + bool do_softmax; + int start_axis; + int end_axis; +}; + +using 
regionYoloParamsTuple = std::tuple< + ngraph::Shape, // Input Shape + regionYoloAttributes, // Params + std::vector, // mask + InferenceEngine::Precision, // Network input precision + InferenceEngine::Precision, // Network output precision + std::map, // Additional network configuration + std::string>; // Device name + + +class RegionYoloCPULayerTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + ngraph::Shape inputShape; + regionYoloAttributes attributes; + std::vector mask; + InferenceEngine::Precision inpPrecision; + InferenceEngine::Precision outPrecision; + std::string targetName; + std::map additionalConfig; + + std::tie(inputShape, attributes, mask, inpPrecision, outPrecision, additionalConfig, targetName) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "classes=" << attributes.classes << "_"; + result << "coords=" << attributes.coordinates << "_"; + result << "num=" << attributes.num_regions << "_"; + result << "doSoftmax=" << attributes.do_softmax << "_"; + result << "axis=" << attributes.start_axis << "_"; + result << "endAxis=" << attributes.end_axis << "_"; + result << "inpPRC=" << inpPrecision.name() << "_"; + result << "outPRC=" << outPrecision.name() << "_"; + result << "targetDevice=" << targetName << "_"; + return result.str(); + } +protected: + void SetUp() override { + ngraph::Shape inputShape; + regionYoloAttributes attributes; + std::vector mask; + std::map additionalConfig; + + std::tie(inputShape, attributes, mask, inPrc, outPrc, additionalConfig, targetDevice) = this->GetParam(); + + configuration.insert(additionalConfig.begin(), additionalConfig.end()); + + selectedType = std::string("unknown_") + inPrc.name(); + + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(inPrc); + auto param = std::make_shared(ngPrc, inputShape); + auto region_yolo = std::make_shared(param, attributes.coordinates, attributes.classes, attributes.num_regions, + attributes.do_softmax, mask, attributes.start_axis, attributes.end_axis); + function = std::make_shared(std::make_shared(region_yolo), ngraph::ParameterVector{param}, "RegionYolo"); + } +}; + +TEST_P(RegionYoloCPULayerTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckCPUImpl(executableNetwork, "RegionYolo"); +} + +namespace { +const std::vector inpOutPrc = {Precision::BF16, Precision::FP32}; + +const std::map additional_config; + +const std::vector inShapes_caffe = { + {1, 125, 13, 13} +}; + +const std::vector inShapes_mxnet = { + {1, 75, 52, 52}, + {1, 75, 32, 32}, + {1, 75, 26, 26}, + {1, 75, 16, 16}, + {1, 75, 13, 13}, + {1, 75, 8, 8} +}; + +const std::vector inShapes_v3 = { + {1, 255, 52, 52}, + {1, 255, 26, 26}, + {1, 255, 13, 13} +}; + +const std::vector> masks = { + {0, 1, 2}, + {3, 4, 5}, + {6, 7, 8} +}; + +const std::vector do_softmax = {true, false}; +const std::vector classes = {80, 20}; +const std::vector num_regions = {5, 9}; +const size_t coords = 4; +const int start_axis = 1; +const int end_axis = 3; + +const regionYoloAttributes yoloV3attr = {80, 4, 9, false, 1, 3}; + +const auto testCase_yolov3 = ::testing::Combine( + ::testing::ValuesIn(inShapes_v3), + ::testing::Values(yoloV3attr), + ::testing::Values(masks[2]), + ::testing::ValuesIn(inpOutPrc), + ::testing::ValuesIn(inpOutPrc), + ::testing::Values(additional_config), + 
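As a quick consistency note on the shape lists above: RegionYolo inputs carry C = (classes + coords + 1) * anchors channels, where anchors is the mask size when do_softmax is false and num otherwise. The worked arithmetic (comments only, not part of the test):

// yolov3:        (80 + 4 + 1) * 3 = 255  -> {1, 255, 52, 52}, {1, 255, 26, 26}, {1, 255, 13, 13}
// yolov3 mxnet:  (20 + 4 + 1) * 3 =  75  -> {1, 75, 52, 52}, ..., {1, 75, 8, 8}
// yolov2 caffe:  (20 + 4 + 1) * 5 = 125  -> {1, 125, 13, 13}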
::testing::Values(CommonTestUtils::DEVICE_CPU) +); + +const regionYoloAttributes yoloV3mxnetAttr = {20, 4, 9, false, 1, 3}; + +const auto testCase_yolov3_mxnet = ::testing::Combine( + ::testing::ValuesIn(inShapes_mxnet), + ::testing::Values(yoloV3mxnetAttr), + ::testing::Values(masks[1]), + ::testing::ValuesIn(inpOutPrc), + ::testing::ValuesIn(inpOutPrc), + ::testing::Values(additional_config), + ::testing::Values(CommonTestUtils::DEVICE_CPU) +); + +const regionYoloAttributes yoloV2caffeAttr = {20, 4, 5, true, 1, 3}; + +const auto testCase_yolov2_caffe = ::testing::Combine( + ::testing::ValuesIn(inShapes_caffe), + ::testing::Values(yoloV2caffeAttr), + ::testing::Values(masks[0]), + ::testing::ValuesIn(inpOutPrc), + ::testing::ValuesIn(inpOutPrc), + ::testing::Values(additional_config), + ::testing::Values(CommonTestUtils::DEVICE_CPU) +); + +INSTANTIATE_TEST_CASE_P(smoke_TestsRegionYolov3CPU, RegionYoloCPULayerTest, testCase_yolov3, RegionYoloCPULayerTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_TestsRegionYoloMxnetCPU, RegionYoloCPULayerTest, testCase_yolov3_mxnet, RegionYoloCPULayerTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_TestsRegionYoloCaffeCPU, RegionYoloCPULayerTest, testCase_yolov2_caffe, RegionYoloCPULayerTest::getTestCaseName); +} // namespace +} // namespace CPULayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/conv_concat.cpp b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/conv_concat.cpp index 076f6560e38de7..0df806f7f86da0 100644 --- a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/conv_concat.cpp +++ b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/conv_concat.cpp @@ -99,7 +99,7 @@ void ConvConcatSubgraphTest::SetUp() { } } for (size_t conv = 0; conv < convolutionNodes.size(); conv++) { - convolutionNodes[conv]->get_rt_info() = setCPUInfo(inFmts, outFmts, priority); + convolutionNodes[conv]->get_rt_info() = getCPUInfo(); } auto concat = ngraph::builder::makeConcat(ngraph::OutputVector{convolutionNodes[0], convolutionNodes[1]}, axis); @@ -112,7 +112,7 @@ TEST_P(ConvConcatSubgraphTest, CompareWithRefs) { SKIP_IF_CURRENT_TEST_IS_DISABLED() Run(); - CheckCPUImpl(executableNetwork, pluginTypeNode, inFmts, outFmts, selectedType); + CheckCPUImpl(executableNetwork, pluginTypeNode); }; /* ============= Common Convolution Params ============= */ diff --git a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_permute_reorder.cpp b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_permute_reorder.cpp index e5b734bae53c9a..44c2d81847344e 100644 --- a/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_permute_reorder.cpp +++ b/inference-engine/tests/functional/plugin/cpu/subgraph_tests/src/fuse_permute_reorder.cpp @@ -84,7 +84,7 @@ void FusePermuteAndReorderTest::CreateGraph() { auto constOrder = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order); auto permute = std::make_shared(params[0], constOrder); - permute->get_rt_info() = setCPUInfo({memFmt}, {memFmt}, {}); + permute->get_rt_info() = makeCPUInfo({memFmt}, {memFmt}, {}); ngraph::ResultVector results{std::make_shared(permute)}; function = std::make_shared(results, params, "PermuteReorder"); @@ -145,17 +145,17 @@ void FusePermuteAndReorderTest1::CreateGraph() { auto constOrder1 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order); auto permute1 = std::make_shared(params[0], 
constOrder1); auto memFmt1 = inputShape.size() == 5 ? ndhwc : nhwc; - permute1->get_rt_info() = setCPUInfo({memFmt1}, {memFmt1}, {}); + permute1->get_rt_info() = makeCPUInfo({memFmt1}, {memFmt1}, {}); auto constOrder2 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order); auto permute2 = std::make_shared(permute1, constOrder2); auto memFmt2 = inputShape.size() == 5 ? ndhwc : nhwc; - permute2->get_rt_info() = setCPUInfo({memFmt2}, {memFmt2}, {}); + permute2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {}); auto constOrder3 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order); auto permute3 = std::make_shared(permute2, constOrder3); auto memFmt3 = inputShape.size() == 5 ? ncdhw : nchw; - permute3->get_rt_info() = setCPUInfo({memFmt3}, {memFmt3}, {}); + permute3->get_rt_info() = makeCPUInfo({memFmt3}, {memFmt3}, {}); auto shape = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, permute3->get_output_shape(0)); auto reshape = std::make_shared(permute1, shape, false); @@ -214,12 +214,12 @@ void FusePermuteAndReorderTest2::CreateGraph() { auto constOrder1 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order); auto permute1 = std::make_shared(params[0], constOrder1); auto memFmt1 = inputShape.size() == 5 ? ndhwc : nhwc; - permute1->get_rt_info() = setCPUInfo({memFmt1}, {memFmt1}, {}); + permute1->get_rt_info() = makeCPUInfo({memFmt1}, {memFmt1}, {}); auto constOrder2 = ngraph::builder::makeConstant(ngraph::element::i64, {inputShape.size()}, order); auto permute2 = std::make_shared(params[1], constOrder2); auto memFmt2 = inputShape.size() == 5 ? ncdhw : nchw; - permute2->get_rt_info() = setCPUInfo({memFmt2}, {memFmt2}, {}); + permute2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {}); auto concat = ngraph::builder::makeConcat({permute1, permute2}, 1); diff --git a/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp b/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp index 94fdbd34b377c1..5dff97729e5d48 100644 --- a/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp +++ b/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp @@ -15,6 +15,7 @@ const char *CPUTestsBase::cpu_fmt2str(cpu_memory_format_t v) { if (v == nCdhw8c) return "nCdhw8c"; if (v == nCdhw16c) return "nCdhw16c"; if (v == ndhwc) return "ndhwc"; + if (v == nc) return "nc"; if (v == x) return "x"; assert(!"unknown fmt"); return "undef"; @@ -34,6 +35,7 @@ cpu_memory_format_t CPUTestsBase::cpu_str2fmt(const char *str) { CASE(nCdhw8c); CASE(nCdhw16c); CASE(ndhwc); + CASE(nc); CASE(x); #undef CASE assert(!"unknown memory format"); @@ -45,7 +47,9 @@ std::string CPUTestsBase::fmts2str(const std::vector &fmts) for (auto &fmt : fmts) { ((str += "cpu:") += cpu_fmt2str(fmt)) += ","; } - str.erase(str.end() - 1); + if (!str.empty()) { + str.pop_back(); + } return str; } @@ -54,14 +58,16 @@ std::string CPUTestsBase::impls2str(const std::vector &priority) { for (auto &impl : priority) { ((str += "cpu:") += impl) += ","; } - str.erase(str.end() - 1); + if (!str.empty()) { + str.pop_back(); + } return str; } -void CPUTestsBase::CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std::string nodeType, - std::vector inputMemoryFormats, - std::vector outputMemoryFormats, std::string selectedType) { +void CPUTestsBase::CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std::string nodeType) const { IE_SUPPRESS_DEPRECATED_START + 
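CheckCPUImpl in cpu_test_utils.cpp relies on the runtime info that the plugin attaches to each node of the execution graph. A hedged sketch of that lookup pattern follows; the helper names readExecRtInfo and selectedImplFor are hypothetical, and it assumes the usual VariantImpl<std::string> storage of exec-graph attributes:

#include <memory>
#include <string>
#include <exec_graph_info.hpp>
#include <ngraph/variant.hpp>
#include <cpp/ie_executable_network.hpp>

static std::string readExecRtInfo(const std::shared_ptr<ngraph::Node>& node, const std::string& key) {
    const auto& rtInfo = node->get_rt_info();
    const auto it = rtInfo.find(key);
    if (it == rtInfo.end())
        return {};
    const auto value = std::dynamic_pointer_cast<ngraph::VariantImpl<std::string>>(it->second);
    return value ? value->get() : std::string{};
}

// Returns the IMPL_TYPE string (e.g. "jit_avx512_FP32") of the first node whose LAYER_TYPE
// matches nodeType; this is the value the test compares against selectedType.
static std::string selectedImplFor(InferenceEngine::ExecutableNetwork& execNet, const std::string& nodeType) {
    const auto function = execNet.GetExecGraphInfo().getFunction();
    for (const auto& node : function->get_ops()) {
        if (readExecRtInfo(node, ExecGraphInfoSerialization::LAYER_TYPE) == nodeType)
            return readExecRtInfo(node, ExecGraphInfoSerialization::IMPL_TYPE);
    }
    return {};
}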
ASSERT_TRUE(!selectedType.empty()) << "Node type is not defined."; + bool isNodeFound = false; InferenceEngine::CNNNetwork execGraphInfo = execNet.GetExecGraphInfo(); auto function = execGraphInfo.getFunction(); ASSERT_NE(nullptr, function); @@ -84,25 +90,27 @@ void CPUTestsBase::CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std }; if (getExecValue(ExecGraphInfoSerialization::LAYER_TYPE) == nodeType) { - ASSERT_LE(inputMemoryFormats.size(), node->get_input_size()); - ASSERT_LE(outputMemoryFormats.size(), node->get_output_size()); - for (int i = 0; i < inputMemoryFormats.size(); i++) { + isNodeFound = true; + ASSERT_LE(inFmts.size(), node->get_input_size()); + ASSERT_LE(outFmts.size(), node->get_output_size()); + for (int i = 0; i < inFmts.size(); i++) { const auto parentPort = node->input_values()[i]; const auto port = node->inputs()[i]; if ((parentPort.get_tensor_ptr() == port.get_tensor_ptr())) { auto parentNode = parentPort.get_node_shared_ptr(); auto actualInputMemoryFormat = getExecValueOutputsLayout(parentNode); - ASSERT_EQ(inputMemoryFormats[i], cpu_str2fmt(actualInputMemoryFormat.c_str())); + ASSERT_EQ(inFmts[i], cpu_str2fmt(actualInputMemoryFormat.c_str())); } } - for (int i = 0; i < outputMemoryFormats.size(); i++) { + for (int i = 0; i < outFmts.size(); i++) { auto actualOutputMemoryFormat = getExecValue(ExecGraphInfoSerialization::OUTPUT_LAYOUTS); - ASSERT_EQ(outputMemoryFormats[i], cpu_str2fmt(actualOutputMemoryFormat.c_str())); + ASSERT_EQ(outFmts[i], cpu_str2fmt(actualOutputMemoryFormat.c_str())); } auto primType = getExecValue(ExecGraphInfoSerialization::IMPL_TYPE); ASSERT_EQ(selectedType, primType); } } + ASSERT_TRUE(isNodeFound) << "Node type name: \"" << nodeType << "\" has not been found."; IE_SUPPRESS_DEPRECATED_END } @@ -112,16 +120,39 @@ std::string CPUTestsBase::getTestCaseName(CPUSpecificParams params) { std::vector priority; std::string selectedType; std::tie(inFmts, outFmts, priority, selectedType) = params; - result << "_inFmts=" << fmts2str(inFmts); - result << "_outFmts=" << fmts2str(outFmts); - result << "_primitive=" << selectedType; + if (!inFmts.empty()) { + result << "_inFmts=" << fmts2str(inFmts); + } + if (!outFmts.empty()) { + result << "_outFmts=" << fmts2str(outFmts); + } + if (!selectedType.empty()) { + result << "_primitive=" << selectedType; + } return result.str(); } -std::map> CPUTestsBase::setCPUInfo(std::vector inFmts, - std::vector outFmts, - std::vector priority) { - std::map> cpuInfo; +CPUTestsBase::CPUInfo CPUTestsBase::getCPUInfo() const { + return makeCPUInfo(inFmts, outFmts, priority); +} + +std::string CPUTestsBase::getPrimitiveType() const { + std::string isaType; + if (InferenceEngine::with_cpu_x86_avx512f()) { + isaType = "jit_avx512"; + } else if (InferenceEngine::with_cpu_x86_avx2()) { + isaType = "jit_avx2"; + } else if (InferenceEngine::with_cpu_x86_sse42()) { + isaType = "jit_sse42"; + } else { + isaType = "ref"; + } + return isaType; +} + +CPUTestsBase::CPUInfo +CPUTestsBase::makeCPUInfo(std::vector inFmts, std::vector outFmts, std::vector priority) { + CPUInfo cpuInfo; if (!inFmts.empty()) { cpuInfo.insert({"InputMemoryFormats", std::make_shared>(fmts2str(inFmts))}); @@ -136,4 +167,24 @@ std::map> CPUTestsBase::setCPUInfo return cpuInfo; } +std::vector filterCPUSpecificParams(std::vector ¶msVector) { + auto adjustBlockedFormatByIsa = [](std::vector& formats) { + for (int i = 0; i < formats.size(); i++) { + if (formats[i] == nChw16c) + formats[i] = nChw8c; + if (formats[i] == nCdhw16c) + formats[i] = nCdhw8c; + } + 
}; + + if (!InferenceEngine::with_cpu_x86_avx512f()) { + for (auto& param : paramsVector) { + adjustBlockedFormatByIsa(std::get<0>(param)); + adjustBlockedFormatByIsa(std::get<1>(param)); + } + } + + return paramsVector; +} + } // namespace CPUTestUtils diff --git a/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.hpp b/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.hpp index 4a259af706eb5e..70e3d1c91839f5 100644 --- a/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.hpp +++ b/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2020 Intel Corporation +// Copyright (C) 2020 Intel Corporation7 // SPDX-License-Identifier: Apache-2.0 // @@ -23,38 +23,44 @@ namespace CPUTestUtils { nCdhw8c, nCdhw16c, ndhwc, + nc, x, undef } cpu_memory_format_t; using CPUSpecificParams = std::tuple< - std::vector, - std::vector, - std::vector, - std::string + std::vector, //input memomry format + std::vector, //output memory format + std::vector, //priority + std::string // selected primitive type >; class CPUTestsBase { public: - void CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std::string nodeType, std::vector inputMemoryFormats, - std::vector outputMemoryFormats, std::string selectedType); - - std::map> setCPUInfo(std::vector inFmts, std::vector outFmts, - std::vector priority); + typedef std::map> CPUInfo; +public: static std::string getTestCaseName(CPUSpecificParams params); + static const char *cpu_fmt2str(cpu_memory_format_t v); + static cpu_memory_format_t cpu_str2fmt(const char *str); + static std::string fmts2str(const std::vector &fmts); + static std::string impls2str(const std::vector &priority); + static CPUInfo makeCPUInfo(std::vector inFmts, + std::vector outFmts, + std::vector priority); + + CPUInfo getCPUInfo() const; + void CheckCPUImpl(InferenceEngine::ExecutableNetwork &execNet, std::string nodeType) const; +protected: + std::string getPrimitiveType() const; std::vector inFmts, outFmts; std::vector priority; std::string selectedType; - -private: - static const char *cpu_fmt2str(cpu_memory_format_t v); - cpu_memory_format_t cpu_str2fmt(const char *str); - static std::string fmts2str(const std::vector &fmts); - std::string impls2str(const std::vector &priority); }; +const auto emptyCPUSpec = CPUSpecificParams{{}, {}, {}, {}}; + const auto conv_ref_2D = CPUSpecificParams{{nchw}, {nchw}, {"ref_any"}, "ref_any_FP32"}; const auto conv_ref_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref_any"}, "ref_any_FP32"}; @@ -80,4 +86,7 @@ const auto conv_sse42_2D_1x1 = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_sse42 const auto conv_avx2_2D_1x1 = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_avx2_1x1"}, "jit_avx2_1x1_FP32"}; const auto conv_avx512_2D_1x1 = CPUSpecificParams{{nChw16c}, {nChw16c}, {"jit_avx512_1x1"}, "jit_avx512_1x1_FP32"}; +// utility functions +std::vector filterCPUSpecificParams(std::vector& paramsVector); + } // namespace CPUTestUtils diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp index dbec7dadf1706b..dcd331a0241161 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp @@ -36,5 +36,9 @@ std::vector disabledTestPatterns() { // TODO: Issue: 41461 
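Both getPrimitiveType() and filterCPUSpecificParams above key off the host ISA: the plugin names its JIT primitives after the widest vector extension available, and the *16c blocked layouts are downgraded to *8c when AVX-512 is absent, as filterCPUSpecificParams does. A compact sketch of how a test could derive the expected IMPL_TYPE string on the current machine; the helper name expectedPrimitive is an assumption:

#include <string>
#include <ie_system_conf.h>

static std::string expectedPrimitive(const std::string& precisionName) {
    std::string isa;
    if (InferenceEngine::with_cpu_x86_avx512f())
        isa = "jit_avx512";
    else if (InferenceEngine::with_cpu_x86_avx2())
        isa = "jit_avx2";
    else if (InferenceEngine::with_cpu_x86_sse42())
        isa = "jit_sse42";
    else
        isa = "ref";
    return isa + "_" + precisionName;   // e.g. "jit_avx2_FP32", matching selectedType above
}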
R"(.*TopKLayerTest.*k=10.*mode=min.*sort=index.*)", R"(.*TopKLayerTest.*k=5.*sort=(none|index).*)", + // TODO: Issue: 43511 + R"(.*EltwiseLayerTest.*IS=\(1.4.3.2.1.3\).*OpType=(Prod|Sub).*secondaryInputType=CONSTANT_opType=VECTOR_netPRC=(FP16|FP32).*)", + R"(.*EltwiseLayerTest.*IS=\(1.4.3.2.1.3\).*OpType=Sum.*secondaryInputType=CONSTANT_opType=VECTOR_netPRC=(FP16|FP32).*)", + R"(.*EltwiseLayerTest.*IS=\(1.4.3.2.1.3\).*OpType=Sub.*secondaryInputType=CONSTANT_opType=VECTOR_netPRC=I64.*)", }; } diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/eltwise.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/eltwise.cpp index 22bf1a5ae89a77..1c407733fc4459 100644 --- a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/eltwise.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/eltwise.cpp @@ -98,13 +98,11 @@ void EltwiseLayerTest::SetUp() { eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD || eltwiseType == ngraph::helpers::EltwiseTypes::MOD) { std::vector data(ngraph::shape_size(shape_input_secondary)); - data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape_input_secondary)); - for (float &i : data) { - if (i == 0) { - i = 1; - } - } + data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape_input_secondary), 10, 2); secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data); + } else if (eltwiseType == ngraph::helpers::EltwiseTypes::POWER && secondaryInputType == ngraph::helpers::InputLayerType::CONSTANT) { + // to avoid floating point overflow on some platforms, let's fill the constant with small numbers. + secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, {}, true, 3); } else { secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary); if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) { diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/region_yolo.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/region_yolo.cpp index 968909418bcfb2..fbbd2cc627dd02 100644 --- a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/region_yolo.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/region_yolo.cpp @@ -27,7 +27,7 @@ std::string RegionYoloLayerTest::getTestCaseName(const testing::TestParamInfoGetParam(); auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); - auto param = std::make_shared(ngraph::element::f32, inputShape); + auto param = std::make_shared(ngPrc, inputShape); auto region_yolo = std::make_shared(param, coords, classes, num_regions, do_softmax, mask, start_axis, end_axis); function = std::make_shared(std::make_shared(region_yolo), ngraph::ParameterVector{param}, "RegionYolo"); } diff --git a/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.hpp b/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.hpp index fcbb64cf041173..670f027f8832cd 100644 --- a/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.hpp +++ b/inference-engine/tests/ie_test_utils/common_test_utils/data_utils.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -177,8 +178,10 @@ void inline fill_data_random_float(InferenceEngine::Blob::Ptr &blob, const uint3 for (size_t i = 0; i < blob->size(); i++) { auto value = static_cast(distribution(random)); value /= static_cast(k); - if 
(typeid(dataType) == typeid(typename InferenceEngine::PrecisionTrait::value_type)) { + if (PRC == InferenceEngine::Precision::FP16) { rawBlobDataPtr[i] = ngraph::float16(value).to_bits(); + } else if (PRC == InferenceEngine::Precision::BF16) { + rawBlobDataPtr[i] = ngraph::bfloat16(value).to_bits(); } else { rawBlobDataPtr[i] = value; } @@ -237,4 +240,27 @@ void inline fill_data_random(InferenceEngine:: fill_data_random_float(blob, range, start_from, k, seed); } +template<> +void inline fill_data_random(InferenceEngine::Blob::Ptr &blob, + const uint32_t range, + int32_t start_from, + const int32_t k, const int seed) { + fill_data_random_float(blob, range, start_from, k, seed); +} + +template +typename std::enable_if::value, T>::type +static ie_abs(const T &val) { + return std::abs(val); +} + +template +typename std::enable_if::value, T>::type +static ie_abs(const T &val) { + return val; +} + +static ngraph::bfloat16 ie_abs(const ngraph::bfloat16& val) { + return ngraph::bfloat16::from_bits(val.to_bits() ^ 0x8000); +} } // namespace CommonTestUtils diff --git a/inference-engine/tests/ie_test_utils/functional_test_utils/blob_utils.hpp b/inference-engine/tests/ie_test_utils/functional_test_utils/blob_utils.hpp index 3a6be95061be22..1259266be86329 100644 --- a/inference-engine/tests/ie_test_utils/functional_test_utils/blob_utils.hpp +++ b/inference-engine/tests/ie_test_utils/functional_test_utils/blob_utils.hpp @@ -328,6 +328,16 @@ convertArrayPrecision +void inline +convertArrayPrecision(float *dst, const short *src, + size_t nelem) { + auto srcBf16 = reinterpret_cast(src); + for (size_t i = 0; i < nelem; i++) { + dst[i] = static_cast(srcBf16[i]); + } +} + template InferenceEngine::Blob::Ptr inline convertBlobPrecision(const InferenceEngine::Blob::Ptr &blob) { using from_d_type = typename InferenceEngine::PrecisionTrait::value_type; @@ -464,6 +474,7 @@ InferenceEngine::Blob::Ptr inline createAndFillBlob(const InferenceEngine::Tenso #define CASE(X) case X: CommonTestUtils::fill_data_random(blob, range, start_from, resolution, seed); break; CASE(InferenceEngine::Precision::FP32) CASE(InferenceEngine::Precision::FP16) + CASE(InferenceEngine::Precision::BF16) CASE(InferenceEngine::Precision::U8) CASE(InferenceEngine::Precision::U16) CASE(InferenceEngine::Precision::I8) diff --git a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp index b976f125825bcb..4cbfc20959e564 100644 --- a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp +++ b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp @@ -239,6 +239,10 @@ void LayerTestsCommon::Compare(const std::vector &expected, const Compare(reinterpret_cast(expectedBuffer), reinterpret_cast(actualBuffer), size, 0); break; + case InferenceEngine::Precision::BF16: + Compare(reinterpret_cast(expectedBuffer), + reinterpret_cast(actualBuffer), size, ngraph::bfloat16(threshold)); + break; default: FAIL() << "Comparator for " << precision << " precision isn't supported"; } @@ -320,6 +324,9 @@ std::vector> LayerTestsCommon::CalculateRefs() { // IE converts f16 to f32 ngraph::pass::ConvertPrecision().run_on_function( function); + + // The same idea for bf16 + ngraph::pass::ConvertPrecision().run_on_function(function); function->validate_nodes_and_infer_types(); auto referenceInputs = std::vector>(inputs.size()); for (std::size_t i = 0; i < inputs.size(); ++i) { @@ -347,14 +354,15 @@ std::vector> 
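The BF16 branches added above store ngraph::bfloat16(value).to_bits() into an int16 buffer and later widen the stored pattern back to float. That works because bfloat16 is simply the upper half of an IEEE-754 binary32 value, and absolute value at the bit level only needs the sign bit cleared. A standalone sketch of the bit manipulation, assuming plain truncation rather than ngraph's rounding and not using the ngraph types:

#include <cstdint>
#include <cstring>
#include <cstdio>

static uint16_t f32_to_bf16_bits(float f) {            // truncation, no rounding
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return static_cast<uint16_t>(u >> 16);
}
static float bf16_bits_to_f32(uint16_t b) {
    uint32_t u = static_cast<uint32_t>(b) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}
static uint16_t bf16_abs_bits(uint16_t b) { return b & 0x7FFF; }  // clear the sign bit

int main() {
    const uint16_t bits = f32_to_bf16_bits(-1.5f);
    std::printf("%f -> 0x%04x -> %f (abs %f)\n", -1.5,
                static_cast<unsigned>(bits),
                bf16_bits_to_f32(bits),
                bf16_bits_to_f32(bf16_abs_bits(bits)));
    return 0;
}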
LayerTestsCommon::CalculateRefs() { } } + const auto& inType = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(inPrc); std::vector> expectedOutputs; switch (refMode) { case INTERPRETER: { - expectedOutputs = ngraph::helpers::interpreterFunction(function, referenceInputs, convertType); + expectedOutputs = ngraph::helpers::interpreterFunction(function, referenceInputs, inType, convertType); break; } case CONSTANT_FOLDING: { - const auto &foldedFunc = ngraph::helpers::foldFunction(function, referenceInputs); + const auto &foldedFunc = ngraph::helpers::foldFunction(function, referenceInputs, inType); expectedOutputs = ngraph::helpers::getConstData(foldedFunc, convertType); break; } @@ -370,7 +378,7 @@ std::vector> LayerTestsCommon::CalculateRefs() { m.register_pass(); m.register_pass(); m.run_passes(cloned_function); - expectedOutputs = ngraph::helpers::interpreterFunction(cloned_function, referenceInputs, convertType); + expectedOutputs = ngraph::helpers::interpreterFunction(cloned_function, referenceInputs, inType, convertType); break; } } diff --git a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.hpp b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.hpp index 0290d35daf75a2..bdc1e27b209ece 100644 --- a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.hpp +++ b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "common_test_utils/common_utils.hpp" #include "common_test_utils/test_common.hpp" @@ -154,29 +155,17 @@ class LayerTestsCommon : public CommonTestUtils::TestsCommon { protected: LayerTestsCommon(); - template - typename std::enable_if::value, T>::type - static ie_abs(const T &val) { - return std::abs(val); - } - - template - typename std::enable_if::value, T>::type - static ie_abs(const T &val) { - return val; - } - template static void Compare(const T *expected, const T *actual, std::size_t size, T threshold) { for (std::size_t i = 0; i < size; ++i) { const auto &ref = expected[i]; const auto &res = actual[i]; - const auto absoluteDifference = ie_abs(res - ref); + const auto absoluteDifference = CommonTestUtils::ie_abs(res - ref); if (absoluteDifference <= threshold) { continue; } - const auto max = std::max(ie_abs(res), ie_abs(ref)); + const auto max = std::max(CommonTestUtils::ie_abs(res), CommonTestUtils::ie_abs(ref)); ASSERT_TRUE(max != 0 && ((absoluteDifference / max) <= threshold)) << "Relative comparison of values expected: " << ref << " and actual: " << res << " at index " << i << " with threshold " << threshold diff --git a/inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/ngraph_helpers.hpp b/inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/ngraph_helpers.hpp index 0682cc26f7cccd..69906db57014a1 100644 --- a/inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/ngraph_helpers.hpp +++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/ngraph_helpers.hpp @@ -233,6 +233,7 @@ inline ngraph::NodeVector castOps2Nodes(const std::vector> interpreterFunction(const std::shared_ptr &function, const std::vector> &inputs, + element::Type_t inType = element::Type_t::undefined, const std::vector convertType = {}); // @@ -245,7 +246,8 @@ void CompareFunctions(const Function &actual, const Function &expected); std::shared_ptr foldFunction(const std::shared_ptr &function, - const std::vector> &inputs); + const std::vector> &inputs, + 
element::Type_t inpType = element::Type_t::undefined); std::vector> getConstData(const std::shared_ptr &function, std::vector convertType = {}); @@ -253,7 +255,7 @@ std::vector> getConstData(const std::shared_ptr getNodeSharedPtr(const ngraph::NodeTypeInfo &type_info, const ngraph::OutputVector &outputVector); -std::vector convertOutputPrecision(std::vector &output, +std::vector convertOutputPrecision(const std::vector &output, const element::Type_t &fromPrecision, const element::Type_t &toPrecision, const size_t elementsCount); diff --git a/inference-engine/tests/ngraph_functions/src/input_layer.cpp b/inference-engine/tests/ngraph_functions/src/input_layer.cpp index 913e450135504e..a09e46b6d23308 100644 --- a/inference-engine/tests/ngraph_functions/src/input_layer.cpp +++ b/inference-engine/tests/ngraph_functions/src/input_layer.cpp @@ -17,8 +17,7 @@ std::shared_ptr makeInputLayer(const element::Type &type, ngraph:: std::shared_ptr input; switch (inputType) { case ngraph::helpers::InputLayerType::CONSTANT: { - std::vector data(ngraph::shape_size(shape)); - input = ngraph::builder::makeConstant(type, shape, data); + input = ngraph::builder::makeConstant(type, shape, {}, true); break; } case ngraph::helpers::InputLayerType::PARAMETER: diff --git a/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp b/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp index 4cc9029f0adf49..8fade97cd7bb77 100644 --- a/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp +++ b/inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp @@ -79,7 +79,9 @@ OutputVector convert2OutputVector(const std::vector> &node return outs; } -std::vector> interpreterFunction(const std::shared_ptr &function, const std::vector> &inputs, +std::vector> interpreterFunction(const std::shared_ptr &function, + const std::vector> &inputs, + element::Type_t inType, const std::vector convertType) { runtime::Backend::set_backend_shared_library_search_directory(""); auto backend = runtime::Backend::create("INTERPRETER"); @@ -98,7 +100,12 @@ std::vector> interpreterFunction(const std::shared_ptr const auto ¶meterType = parameter->get_element_type(); const auto ¶meterSize = shape_size(parameterShape) * parameterType.size(); - const auto &input = inputs[parameterIndex]; + auto input = inputs[parameterIndex]; + + if (inType != element::undefined && inType != parameterType) { + input = convertOutputPrecision(input, inType, parameter->get_element_type(), shape_size(parameter->get_shape())); + } + const auto &inputSize = input.size(); NGRAPH_CHECK(parameterSize == inputSize, "Got parameter (", parameter->get_friendly_name(), ") of size ", parameterSize, @@ -137,22 +144,31 @@ std::vector> interpreterFunction(const std::shared_ptr } std::shared_ptr foldFunction(const std::shared_ptr &function, - const std::vector> &inputs) { + const std::vector> &inputs, element::Type_t inpType) { std::vector paramElementTypes; std::vector paramShapes; + std::vector> vecTmpConvertedInputs; + vecTmpConvertedInputs.reserve(inputs.size()); + + std::vector inBuffers; + inBuffers.reserve(inputs.size()); + for (const auto ¶m : function->get_parameters()) { paramElementTypes.emplace_back(param->get_element_type()); paramShapes.emplace_back(param->get_shape()); + auto parameterIndex = function->get_parameter_index(param); + auto& input = inputs[parameterIndex]; + + if (inpType != element::undefined && inpType != paramElementTypes.back()) { + vecTmpConvertedInputs.emplace_back(convertOutputPrecision(input, 
inpType, param->get_element_type(), shape_size(param->get_shape()))); + inBuffers.push_back(vecTmpConvertedInputs.back().data()); + } else { + // const_cast added to satisfy specialize_function interface + // which requires inputs as std::vector + inBuffers.push_back(const_cast(input.data())); + } } - auto inBuffers = std::vector(inputs.size()); - std::transform(inputs.cbegin(), inputs.cend(), inBuffers.begin(), - [](const std::vector &input) { - // const_cast added to satisfy specialize_function interface - // which requires inputs as std::vector - return const_cast(input.data()); - }); - const auto &foldedFunc = specialize_function(function, paramElementTypes, paramShapes, inBuffers); ngraph::pass::ConstantFolding().run_on_function(foldedFunc); for (const auto &op : foldedFunc->get_ops()) { @@ -250,7 +266,7 @@ std::shared_ptr getNodeSharedPtr(const ngraph::NodeTypeInfo &type_ } template -std::vector convertPrecision(std::vector &buffer, const size_t elementsCount, const size_t elementSize) { +std::vector convertPrecision(const std::vector &buffer, const size_t elementsCount, const size_t elementSize) { std::vector convertedData(elementsCount * elementSize); const fromPrec *src = reinterpret_cast(buffer.data()); toPrec *dst = reinterpret_cast(convertedData.data()); @@ -270,8 +286,10 @@ bool is_tensor_iterator_exist(const std::shared_ptr & func) { return false; } -std::vector convertOutputPrecision(std::vector &output, const element::Type_t &fromPrecision, const element::Type_t &toPrecision, - const size_t elementsCount) { +std::vector convertOutputPrecision(const std::vector &output, + const element::Type_t &fromPrecision, + const element::Type_t &toPrecision, + const size_t elementsCount) { switch (fromPrecision) { case element::Type_t::u8: { switch (toPrecision) { @@ -520,6 +538,12 @@ std::vector convertOutputPrecision(std::vector &outp case element::Type_t::u64: { return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); } + case element::Type_t::bf16: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::boolean: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } default: throw std::runtime_error("convertOutputPrecision can't convert from: " + element::Type(fromPrecision).get_type_name() + " to: " + element::Type(toPrecision).get_type_name()); @@ -548,6 +572,9 @@ std::vector convertOutputPrecision(std::vector &outp case element::Type_t::f32: { return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); } + case element::Type_t::bf16: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } case element::Type_t::u64: { return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); } @@ -556,6 +583,43 @@ std::vector convertOutputPrecision(std::vector &outp element::Type(toPrecision).get_type_name()); } } + case element::Type_t::bf16: { + switch (toPrecision) { + case element::Type_t::u8: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::u16: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::i8: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::i16: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::i32: { + return 
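The per-type conversions in convertOutputPrecision all go through one element-wise pattern: reinterpret the raw byte buffer as the source element type and static_cast element by element into a byte buffer sized for the destination type. A hedged generic sketch of that pattern; the name convertPrecisionSketch and its template parameters are assumptions, not the actual helper:

#include <cstddef>
#include <cstdint>
#include <vector>

template <typename FromPrec, typename ToPrec>
static std::vector<std::uint8_t> convertPrecisionSketch(const std::vector<std::uint8_t>& buffer,
                                                        std::size_t elementsCount) {
    std::vector<std::uint8_t> converted(elementsCount * sizeof(ToPrec));
    const auto* src = reinterpret_cast<const FromPrec*>(buffer.data());
    auto* dst = reinterpret_cast<ToPrec*>(converted.data());
    for (std::size_t i = 0; i < elementsCount; ++i)
        dst[i] = static_cast<ToPrec>(src[i]);   // e.g. ngraph::bfloat16 -> float via its conversion operator
    return converted;
}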
convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::i64: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::f32: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::u64: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::bf16: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + case element::Type_t::boolean: { + return convertPrecision(output, elementsCount, element::Type(toPrecision).size()); + } + default: + throw std::runtime_error("convertOutputPrecision can't convert from: " + element::Type(fromPrecision).get_type_name() + " to: " + + element::Type(toPrecision).get_type_name()); + } + } default: throw std::runtime_error("convertOutputPrecision can't convert from: " + element::Type(fromPrecision).get_type_name() + " precision"); } diff --git a/inference-engine/tests_deprecated/functional/mkldnn/single_layer_tests/crop_tests.cpp b/inference-engine/tests_deprecated/functional/mkldnn/single_layer_tests/crop_tests.cpp deleted file mode 100644 index a845eaf77e5c50..00000000000000 --- a/inference-engine/tests_deprecated/functional/mkldnn/single_layer_tests/crop_tests.cpp +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright (C) 2018-2020 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include - -#include "tests_common.hpp" -#include "single_layer_common.hpp" -#include "ir_gen_helper.hpp" - -using namespace ::testing; -using namespace InferenceEngine; -using namespace single_layer_tests; - -struct crop_base_params { - std::vector in_dims; - std::vector out_dims; - std::vector offsets; -}; - -#ifdef IN -#undef IN -#endif - -struct crop_test_params : crop_base_params { - std::string device_name; - - crop_test_params(std::string name, crop_base_params params) : - crop_base_params(params), device_name(name) {} -}; - -template -void ref_crop(InferenceEngine::TBlob &src, InferenceEngine::TBlob &dst, crop_test_params prm) { - data_t *dst_ptr = dst.data(); - - int ndims = prm.in_dims.size(); - - size_t OFFSET_N = prm.offsets.at(0); - size_t OFFSET_C = prm.offsets.at(1); - size_t OFFSET_D = ndims == 5 ? prm.offsets.at(ndims - 3) : 0; - size_t OFFSET_H = prm.offsets.at(ndims - 2); - size_t OFFSET_W = prm.offsets.at(ndims - 1); - - size_t ON = prm.out_dims[0]; - size_t OC = prm.out_dims[1]; - size_t OD = ndims == 5 ? prm.out_dims[ndims - 3] : 1; - size_t OH = prm.out_dims[ndims - 2]; - size_t OW = prm.out_dims[ndims - 1]; - - size_t IN = prm.in_dims[0]; - size_t IC = prm.in_dims[1]; - size_t ID = ndims == 5 ? 
prm.in_dims[ndims - 3] : 1; - size_t IH = prm.in_dims[ndims - 2]; - size_t IW = prm.in_dims[ndims - 1]; - - auto dst_off = [=](size_t n, size_t c, size_t d, size_t h, size_t w) -> size_t { - return (n * OC * OD * OH * OW + c * OD * OH * OW + d * OH * OW + h * OW + w); - }; - auto src_off = [=](size_t n, size_t c, size_t d, size_t h, size_t w) -> size_t { - return (n * IC * ID * IH * IW + c * ID * IH * IW + d * IH * IW + h * IW + w); - }; - - ASSERT_GE(IN - OFFSET_N, ON); - ASSERT_GE(IC - OFFSET_C, OC); - ASSERT_GE(ID - OFFSET_D, OD); - ASSERT_GE(IH - OFFSET_H, OH); - ASSERT_GE(IW - OFFSET_W, OW); - - data_t* src_ptr = src.data(); - for (size_t n = 0; n < ON; ++n) { - for (size_t c = 0; c < OC; ++c) { - for (size_t d = 0; d < OD; ++d) { - for (size_t h = 0; h < OH; ++h) { - for (size_t w = 0; w < OW; ++w) { - dst_ptr[dst_off(n, c, d, h, w)] = src_ptr[src_off(n + OFFSET_N, c + OFFSET_C, d + OFFSET_D, - h + OFFSET_H, w + OFFSET_W)]; - } - } - } - } - } -} - -class smoke_CropOnlyTest: public TestsCommon, - public WithParamInterface { - std::string layers_t = R"V0G0N( - - - - - - - - - - - _ID0_ - _ID1_ - _ID2_ - _ID3_ - _ID4_ - - - - - _OD0_ - _OD1_ - _OD2_ - _OD3_ - _OD4_ - - - -)V0G0N"; - - std::string edges_t = R"V0G0N( - -)V0G0N"; - - std::string getModel(crop_test_params p) { - std::string model = layers_t; - - auto dims_size = p.in_dims.size(); - - if (dims_size == 4) { - REMOVE_LINE(model, ""); - REMOVE_LINE(model, "_ID4_"); - REMOVE_LINE(model, "_OD4_"); - } - - REPLACE_WITH_NUM(model, "_ID0_", p.in_dims[0]); - REPLACE_WITH_NUM(model, "_ID1_", p.in_dims[1]); - REPLACE_WITH_NUM(model, "_ID2_", p.in_dims[2]); - REPLACE_WITH_NUM(model, "_ID3_", p.in_dims[3]); - if (dims_size == 5) - REPLACE_WITH_NUM(model, "_ID4_", p.in_dims[4]); - - REPLACE_WITH_NUM(model, "_OD0_", p.out_dims[0]); - REPLACE_WITH_NUM(model, "_OD1_", p.out_dims[1]); - REPLACE_WITH_NUM(model, "_OD2_", p.out_dims[2]); - REPLACE_WITH_NUM(model, "_OD3_", p.out_dims[3]); - if (dims_size == 5) - REPLACE_WITH_NUM(model, "_OD4_", p.out_dims[4]); - - REPLACE_WITH_NUM(model, "_OF0_", p.offsets[0]); - REPLACE_WITH_NUM(model, "_OF1_", p.offsets[1]); - REPLACE_WITH_NUM(model, "_OF2_", p.offsets[2]); - REPLACE_WITH_NUM(model, "_OF3_", p.offsets[3]); - if (dims_size == 5) - REPLACE_WITH_NUM(model, "_OF4_", p.offsets[4]); - - model = IRTemplateGenerator::getIRTemplate("Crop_Only", p.in_dims, "FP32", model, edges_t); - - return model; - } - -protected: - virtual void SetUp() { - try { - crop_test_params p = ::testing::WithParamInterface::GetParam(); - std::string model = getModel(p); - - Core ie; - CNNNetwork network = ie.ReadNetwork(model, Blob::CPtr()); - - InferenceEngine::Layout layout = InferenceEngine::ANY; - switch (p.in_dims.size()) { - case 4: layout = InferenceEngine::NCHW; break; - case 5: layout = InferenceEngine::NCDHW; break; - } - - InputsDataMap inputs = network.getInputsInfo(); - DataPtr inPtr1 = inputs["in1"]->getInputData(); - - InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob(inPtr1->getTensorDesc()); - src->allocate(); - fill_data(src->buffer(), src->size()); - - TBlob* srcPtr = dynamic_cast*>(src.get()); - BlobMap srcs; - srcs.insert(std::pair("in1", src)); - - OutputsDataMap out = network.getOutputsInfo(); - BlobMap dstBlobs; - std::pair item = *out.begin(); - TBlob::Ptr dst; - dst = make_shared_blob(item.second->getTensorDesc()); - dst->allocate(); - dstBlobs[item.first] = dst; - - TBlob::Ptr dst_ref; - dst_ref = make_shared_blob(item.second->getTensorDesc()); - dst_ref->allocate(); - - 
ref_crop(*srcPtr, *dst_ref, p); - - ExecutableNetwork exeNetwork = ie.LoadNetwork(network, p.device_name); - InferRequest inferRequest = exeNetwork.CreateInferRequest(); - inferRequest.SetInput(srcs); - inferRequest.SetOutput(dstBlobs); - inferRequest.Infer(); - - compare(*dstBlobs.begin()->second, *dst_ref); - - } catch (const details::InferenceEngineException &e) { - FAIL() << e.what(); - } - } -}; - -#define case_1 crop_base_params({{1, 5, 32, 32}, {1, 2, 23, 23}, {0, 2, 5, 4}}) -#define case_2 crop_base_params({{1, 5, 32, 32}, {1, 5, 5, 5}, {0, 0, 20, 20}}) -#define case_3 crop_base_params({{1, 5, 32, 32}, {1, 5, 32, 10}, {0, 0, 0, 20}}) -#define case_4 crop_base_params({{1, 5, 32, 20}, {1, 5, 30, 10}, {0, 0, 2, 10}}) -#define case_5 crop_base_params({{1, 5, 32, 20, 14}, {1, 5, 30, 10, 8}, {0, 0, 2, 10, 6}}) -#define case_6 crop_base_params({{5, 9, 32, 20, 14}, {2, 5, 30, 10, 8}, {3, 4, 2, 10, 6}}) - -TEST_P(smoke_CropOnlyTest, TestsCrop) {} - -std::string getTestCaseName(testing::TestParamInfo obj) { - int ndims = obj.param.in_dims.size(); - - return obj.param.device_name + - "_in" + std::to_string(obj.param.in_dims[0]) + - "_ic" + std::to_string(obj.param.in_dims[1]) + - "_id" + std::to_string(ndims == 5 ? obj.param.in_dims[ndims - 3] : 1) + - "_ih" + std::to_string(obj.param.in_dims[ndims - 2]) + - "_iw" + std::to_string(obj.param.in_dims[ndims - 1]) + - "_on" + std::to_string(obj.param.out_dims[0]) + - "_oc" + std::to_string(obj.param.out_dims[1]) + - "_od" + std::to_string(ndims == 5 ? obj.param.out_dims[ndims - 3] : 1) + - "_oh" + std::to_string(obj.param.out_dims[ndims - 2]) + - "_ow" + std::to_string(obj.param.out_dims[ndims - 1]); -} - -crop_test_params crop_only_test_cases[] = { - crop_test_params("CPU", case_1), - crop_test_params("CPU", case_2), - crop_test_params("CPU", case_3), - crop_test_params("CPU", case_4), - crop_test_params("CPU", case_5), - crop_test_params("CPU", case_6), -}; - -INSTANTIATE_TEST_CASE_P( - TestsPooling, smoke_CropOnlyTest, ::testing::ValuesIn(crop_only_test_cases), getTestCaseName); diff --git a/ngraph/core/src/op/abs.cpp b/ngraph/core/src/op/abs.cpp index 071b10c724f6e5..d22a372c425480 100644 --- a/ngraph/core/src/op/abs.cpp +++ b/ngraph/core/src/op/abs.cpp @@ -71,6 +71,8 @@ namespace absop break; TYPE_CASE(f32)(arg0, out, count); break; + TYPE_CASE(bf16)(arg0, out, count); + break; default: rc = false; break; } return rc; diff --git a/ngraph/core/src/op/convert.cpp b/ngraph/core/src/op/convert.cpp index d03e77e0177be9..6992b1611f5dfe 100644 --- a/ngraph/core/src/op/convert.cpp +++ b/ngraph/core/src/op/convert.cpp @@ -113,6 +113,8 @@ namespace convert break; TYPE_CASE(i32)(arg, out); break; + TYPE_CASE(i16)(arg, out); + break; TYPE_CASE(i64)(arg, out); break; TYPE_CASE(u32)(arg, out); diff --git a/ngraph/core/src/op/divide.cpp b/ngraph/core/src/op/divide.cpp index f93912db65b696..b69c51d9588ff8 100644 --- a/ngraph/core/src/op/divide.cpp +++ b/ngraph/core/src/op/divide.cpp @@ -108,6 +108,8 @@ namespace divide break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec, pythondiv); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec, pythondiv); + break; default: rc = false; break; } return rc; diff --git a/ngraph/core/src/op/multiply.cpp b/ngraph/core/src/op/multiply.cpp index c80763ce5144e2..4c8b4be21e8092 100644 --- a/ngraph/core/src/op/multiply.cpp +++ b/ngraph/core/src/op/multiply.cpp @@ -80,6 +80,8 @@ namespace multiplyop break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; 
default: rc = false; break; } return rc; diff --git a/ngraph/core/src/op/power.cpp b/ngraph/core/src/op/power.cpp index 9403df667fae92..193c6ded5edf20 100644 --- a/ngraph/core/src/op/power.cpp +++ b/ngraph/core/src/op/power.cpp @@ -83,6 +83,8 @@ namespace power break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; default: rc = false; break; } return rc; diff --git a/ngraph/core/src/op/subtract.cpp b/ngraph/core/src/op/subtract.cpp index 79ccaaa4c8a1b8..3c100f2b23efe0 100644 --- a/ngraph/core/src/op/subtract.cpp +++ b/ngraph/core/src/op/subtract.cpp @@ -86,6 +86,8 @@ namespace subtract break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; default: rc = false; break; } return rc;
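The TYPE_CASE(bf16) additions across abs, divide, multiply, power and subtract all extend the same per-element-type dispatch inside each op's evaluate(), so constant folding and the reference backend can execute bf16 tensors instead of reporting the precision as unsupported. A simplified, hedged sketch of that dispatch shape; dispatch_multiply and multiply_kernel are illustrative names, not the actual ngraph sources:

#include <cstddef>
#include "ngraph/type/bfloat16.hpp"
#include "ngraph/type/element_type.hpp"

template <typename T>
static void multiply_kernel(const T* a, const T* b, T* out, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i)
        out[i] = a[i] * b[i];
}

static bool dispatch_multiply(ngraph::element::Type_t et,
                              const void* a, const void* b, void* out, std::size_t n) {
    switch (et) {
    case ngraph::element::Type_t::f32:
        multiply_kernel(static_cast<const float*>(a),
                        static_cast<const float*>(b),
                        static_cast<float*>(out), n);
        return true;
    case ngraph::element::Type_t::bf16:   // the newly covered case
        multiply_kernel(static_cast<const ngraph::bfloat16*>(a),
                        static_cast<const ngraph::bfloat16*>(b),
                        static_cast<ngraph::bfloat16*>(out), n);
        return true;
    default:
        return false;                     // unsupported precision: evaluate() reports failure
    }
}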