From a4c7e14838c061934c27660c22eafc58b0ed1065 Mon Sep 17 00:00:00 2001 From: Gorokhov Dmitriy Date: Wed, 22 Mar 2023 16:47:05 +0400 Subject: [PATCH] Performance fixes (#116) --- .../intel_cpu/src/dnnl_extension_utils.cpp | 31 + .../intel_cpu/src/dnnl_extension_utils.h | 1 + src/plugins/intel_cpu/src/graph_optimizer.cpp | 29 +- src/plugins/intel_cpu/src/node.cpp | 27 +- src/plugins/intel_cpu/src/nodes/conv.cpp | 1 + .../src/nodes/executors/acl/acl_eltwise.cpp | 65 +- .../src/nodes/executors/acl/acl_eltwise.hpp | 2 - .../nodes/executors/acl/acl_interpolate.cpp | 201 ++ .../nodes/executors/acl/acl_interpolate.hpp | 52 + .../executors/common/ref_interpolate.cpp | 380 +++ .../executors/common/ref_interpolate.hpp | 57 + .../src/nodes/executors/interpolate.cpp | 528 ++++ .../src/nodes/executors/interpolate.hpp | 187 ++ .../src/nodes/executors/interpolate_list.cpp | 21 + .../src/nodes/executors/interpolate_list.hpp | 105 + .../nodes/executors/x64/jit_interpolate.cpp | 1707 +++++++++++ .../nodes/executors/x64/jit_interpolate.hpp | 111 + src/plugins/intel_cpu/src/nodes/gather.cpp | 21 +- .../intel_cpu/src/nodes/interpolate.cpp | 2722 +---------------- src/plugins/intel_cpu/src/nodes/interpolate.h | 196 +- .../src/nodes/non_max_suppression.cpp | 4 + .../intel_cpu/src/nodes/non_max_suppression.h | 2 +- .../intel_cpu/src/nodes/roi_pooling.cpp | 10 +- src/plugins/intel_cpu/src/nodes/topk.cpp | 9 +- src/plugins/intel_cpu/src/nodes/topk.h | 2 +- src/plugins/intel_cpu/src/nodes_factory.cpp | 12 +- .../intel_cpu/src/onednn/iml_type_mapper.cpp | 2 + .../intel_cpu/src/onednn/iml_type_mapper.h | 1 + .../cpu_opset/arm/pass/mish_decomposition.cpp | 33 + .../cpu_opset/arm/pass/mish_decomposition.hpp | 19 + .../convert_to_cpu_specific_opset.hpp | 2 +- .../transformation_pipeline.cpp | 6 +- .../single_layer_tests/interpolate.cpp | 4 + 33 files changed, 3607 insertions(+), 2943 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/common/ref_interpolate.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/common/ref_interpolate.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/x64/jit_interpolate.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/x64/jit_interpolate.hpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.hpp diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp index 7764284986f432..75bc3139eba165 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp @@ -246,5 +246,36 @@ dnnl::algorithm DnnlExtensionUtils::convertToDnnlAlgorithm(Algorithm alg) { } } +bool DnnlExtensionUtils::isUnarySupportedAsPostOp(Algorithm alg) { +#if defined(OV_CPU_WITH_ACL) + return one_of(alg, Algorithm::EltwiseRelu, + Algorithm::EltwiseTanh, + Algorithm::EltwiseElu, + Algorithm::EltwiseAbs, + 
Algorithm::EltwiseSqrt, + Algorithm::EltwiseSoftRelu, + Algorithm::EltwiseSigmoid); +#elif defined(OPENVINO_ARCH_X86_64) + return one_of(alg, Algorithm::EltwiseRelu, + Algorithm::EltwiseGeluErf, + Algorithm::EltwiseGeluTanh, + Algorithm::EltwiseElu, + Algorithm::EltwiseSigmoid, + Algorithm::EltwiseClamp, + Algorithm::EltwiseTanh, + Algorithm::EltwiseSwish, + Algorithm::EltwiseHswish, + Algorithm::EltwiseMish, + Algorithm::EltwiseHsigmoid, + Algorithm::EltwiseRoundHalfToEven, + Algorithm::EltwiseRoundHalfAwayFromZero, + Algorithm::EltwiseAbs, + Algorithm::EltwiseSqrt, + Algorithm::EltwiseSoftRelu); +#else + return false; +#endif +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.h b/src/plugins/intel_cpu/src/dnnl_extension_utils.h index 8268707a20a369..c378e305b578fe 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.h +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.h @@ -58,6 +58,7 @@ class DnnlExtensionUtils { static dnnl_memory_desc_t clone_desc(const_dnnl_memory_desc_t cdesc); static const char* query_pd_info(const_dnnl_primitive_desc_t pd); static dnnl::algorithm convertToDnnlAlgorithm(Algorithm alg); + static bool isUnarySupportedAsPostOp(Algorithm alg); }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index 48239dfd1d4f45..18d91debc973db 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -988,22 +988,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &grap continue; } - if (!one_of(fuseCandidate->getAlgorithm(), Algorithm::EltwiseRelu, - Algorithm::EltwiseGeluErf, - Algorithm::EltwiseGeluTanh, - Algorithm::EltwiseElu, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseClamp, - Algorithm::EltwiseTanh, - Algorithm::EltwiseSwish, - Algorithm::EltwiseHswish, - Algorithm::EltwiseMish, - Algorithm::EltwiseHsigmoid, - Algorithm::EltwiseRoundHalfToEven, - Algorithm::EltwiseRoundHalfAwayFromZero, - Algorithm::EltwiseAbs, - Algorithm::EltwiseSqrt, - Algorithm::EltwiseSoftRelu)) { + if (!DnnlExtensionUtils::isUnarySupportedAsPostOp(fuseCandidate->getAlgorithm())) { parent++; continue; } @@ -1176,17 +1161,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) auto isFusingSupported = [&](NodePtr conv, NodePtr child) { return child->getType() == Type::Eltwise && - one_of(child->getAlgorithm(), Algorithm::EltwiseRelu, - Algorithm::EltwiseElu, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseClamp, - Algorithm::EltwiseSwish, - Algorithm::EltwiseHswish, - Algorithm::EltwiseMish, - Algorithm::EltwiseHsigmoid, - Algorithm::EltwiseRoundHalfToEven, - Algorithm::EltwiseRoundHalfAwayFromZero, - Algorithm::EltwiseSoftRelu); + DnnlExtensionUtils::isUnarySupportedAsPostOp(child->getAlgorithm()); }; for (auto &graphNode : graphNodes) { diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 088af40fa3179c..2814d3d40d3ed2 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -960,6 +960,9 @@ void Node::cleanup() { const std::vector& Node::getPrimitivesPriority() { std::vector priorities = { impl_desc_type::unknown, + // Undef impl type is used to express use-cases there real type is unkown during compilation + // Undef has higher priority than defined types in order to force primitive selection logic to make decision based on other properties + 
impl_desc_type::undef, impl_desc_type::brgconv_avx512_amx_1x1, impl_desc_type::brgconv_avx512_amx, impl_desc_type::jit_avx512_amx_dw, @@ -989,6 +992,7 @@ const std::vector& Node::getPrimitivesPriority() { impl_desc_type::gemm_avx2, impl_desc_type::gemm_avx, impl_desc_type::gemm_sse42, + impl_desc_type::acl, impl_desc_type::jit_gemm, impl_desc_type::ref_any, impl_desc_type::ref, @@ -1341,6 +1345,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr& op, const } bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { +#if defined(OPENVINO_ARCH_X86_64) IE_ASSERT(parentNode); size_t fusingPort = 0; @@ -1391,6 +1396,10 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { Algorithm::EltwisePrelu, Algorithm::EltwiseMulAdd) && isBroadcastableToDataInput()) || isConvertablePowerStatic(); +#else + // TODO: provide correct list of operations for other backends + return false; +#endif } // @todo shifts for Subtract and scales for Divide are replaced with @@ -1607,23 +1616,7 @@ bool Node::canFuseSimpleOperation(const NodePtr& node) const { } return ret; } else if (node->getType() == Type::Eltwise) { - return one_of(node->getAlgorithm(), - Algorithm::EltwiseRelu, - Algorithm::EltwiseGeluErf, - Algorithm::EltwiseGeluTanh, - Algorithm::EltwiseElu, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseClamp, - Algorithm::EltwiseTanh, - Algorithm::EltwiseSwish, - Algorithm::EltwiseHswish, - Algorithm::EltwiseMish, - Algorithm::EltwiseHsigmoid, - Algorithm::EltwiseRoundHalfToEven, - Algorithm::EltwiseRoundHalfAwayFromZero, - Algorithm::EltwiseAbs, - Algorithm::EltwiseSqrt, - Algorithm::EltwiseSoftRelu) || + return DnnlExtensionUtils::isUnarySupportedAsPostOp(node->getAlgorithm()) || node->canBePerformedAsScaleShift(this); } return false; diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 37abc7c5b66ea4..339acf7ff26fd7 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -327,6 +327,7 @@ InferenceEngine::Precision Convolution::fusedEltwisePrecision(const NodePtr& fus const std::vector& Convolution::getPrimitivesPriority() { std::vector priorities = { impl_desc_type::unknown, + impl_desc_type::dw_acl, impl_desc_type::winograd_acl, impl_desc_type::gemm_acl, impl_desc_type::brgconv_avx512_amx_1x1, diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp index d75975ff595af6..fb4566e7c6196a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp @@ -10,19 +10,7 @@ namespace intel_cpu { using namespace arm_compute; -TensorShape eltwiseShapeCast(const VectorDims &dims) { - arm_compute::TensorShape tensorShape; - for (std::size_t i = 0; i < dims.size(); ++i) { - tensorShape.set(dims.size() - i - 1, dims[i], false); - } - if (tensorShape.num_dimensions() == 0) { - tensorShape.set(0, 1, false); - tensorShape.set_num_dimensions(1); - } - return tensorShape; -} - -VectorDims reshape_sizes(VectorDims dims) { +inline VectorDims reshape_sizes(VectorDims dims) { const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS; VectorDims result_dims(MAX_NUM_SHAPE - 1); if (dims.size() >= MAX_NUM_SHAPE) { @@ -46,29 +34,56 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto if (!postOps.empty()) { return false; } aclEltwiseAttrs = eltwiseAttrs; + std::vector srcVecDims(srcDescs.size()), 
dstVecDims(dstDescs.size()); + std::vector srcDataLayout(srcDescs.size()), dstDataLayout(dstDescs.size()); + std::vector srcTensorsInfo(srcDescs.size()), dstTensorsInfo(dstDescs.size()); + srcTensors = std::vector(srcDescs.size()); + dstTensors = std::vector(dstDescs.size()); + + for (int i = 0; i < srcVecDims.size(); i++) { + srcVecDims[i] = shapeCast(reshape_sizes(srcDescs[i]->getShape().getDims())); + } + for (int i = 0; i < dstVecDims.size(); i++) { + dstVecDims[i] = shapeCast(reshape_sizes(dstDescs[i]->getShape().getDims())); + } + + for (int i = 0; i < srcDescs.size(); i++) { + srcDataLayout[i] = getAclDataLayoutByMemoryDesc(srcDescs[i]); + } + for (int i = 0; i < dstDescs.size(); i++) { + dstDataLayout[i] = getAclDataLayoutByMemoryDesc(dstDescs[i]); + } + if (srcDescs.size() == 2 && + srcDescs[0]->hasLayoutType(LayoutType::nspc) && srcDescs[1]->hasLayoutType(LayoutType::nspc) && srcDescs[0]->getShape().getDims() != srcDescs[1]->getShape().getDims()) { - return false; + auto dim_size = srcDescs[0]->getShape().getDims().size(); + auto mover = [&dim_size](TensorShape &_shape) { + if (dim_size == 5) { std::swap(_shape[2], _shape[3]); } + std::swap(_shape[1], _shape[2]); + std::swap(_shape[0], _shape[1]); + }; + if (dim_size < 5) { + srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCHW; + } else { + srcDataLayout[0] = srcDataLayout[1] = dstDataLayout[0] = DataLayout::NCDHW; + } + mover(srcVecDims[0]); + mover(srcVecDims[1]); + mover(dstVecDims[0]); } - std::vector srcVecDims(srcDescs.size()), dstVecDims(dstDescs.size()); - std::vector srcTensorsInfo(srcDescs.size()), dstTensorsInfo(dstDescs.size()); - srcTensors = std::vector(srcDescs.size()); - dstTensors = std::vector(dstDescs.size()); - for (int i = 0; i < srcVecDims.size(); i++) { - srcVecDims[i] = reshape_sizes(srcDescs[i]->getShape().getDims()); - srcTensorsInfo[i] = TensorInfo(eltwiseShapeCast(srcVecDims[i]), 1, + srcTensorsInfo[i] = TensorInfo(srcVecDims[i], 1, precisionToAclDataType(srcDescs[i]->getPrecision()), - getAclDataLayoutByMemoryDesc(srcDescs[i])); + srcDataLayout[i]); srcTensors[i].allocator()->init(srcTensorsInfo[i]); } for (int i = 0; i < dstVecDims.size(); i++) { - dstVecDims[i] = reshape_sizes(dstDescs[i]->getShape().getDims()); - dstTensorsInfo[i] = TensorInfo(eltwiseShapeCast(dstVecDims[i]), 1, + dstTensorsInfo[i] = TensorInfo(dstVecDims[i], 1, precisionToAclDataType(dstDescs[i]->getPrecision()), - getAclDataLayoutByMemoryDesc(dstDescs[i])); + dstDataLayout[i]); dstTensors[i].allocator()->init(dstTensorsInfo[i]); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp index de14de2f3db2cf..e149b4a1e849f8 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp @@ -10,8 +10,6 @@ namespace ov { namespace intel_cpu { -arm_compute::TensorShape eltwiseShapeCast(const VectorDims& dims); - class AclEltwiseExecutor : public EltwiseExecutor { public: AclEltwiseExecutor(const ExecutorContext::CPtr context); diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp new file mode 100644 index 00000000000000..0702f3fa7e2f4a --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp @@ -0,0 +1,201 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_interpolate.hpp" 
+#include "acl_utils.hpp" + +arm_compute::TensorShape interpolateShapeCast(const ov::intel_cpu::VectorDims& dims) { + arm_compute::TensorShape tensorShape; + for (std::size_t i = 0; i < dims.size(); ++i) { + tensorShape.set(dims.size() - i - 1, dims[i], false); + } + if (tensorShape.num_dimensions() == 0) { + tensorShape.set(0, 1, false); + tensorShape.set_num_dimensions(1); + } + return tensorShape; +} + +std::vector interpolateReshapeSizes(std::vector dims) { + const size_t MAX_NUM_SHAPE = 4; + std::vector result_dims(MAX_NUM_SHAPE - 1); + if (dims.size() >= MAX_NUM_SHAPE) { + for (int i = 0; i < MAX_NUM_SHAPE - 1; i++) { + result_dims[i] = dims[i]; + } + for (int i = MAX_NUM_SHAPE - 1; i < dims.size(); i++) { + result_dims[MAX_NUM_SHAPE - 2] *= dims[i]; + } + } else { + result_dims = dims; + } + return result_dims; +} + +bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpolateAttrs, + const std::vector &srcDescs, + const std::vector &dstDescs, + const dnnl::primitive_attr &attr) { + InterpolateExecutor::init(interpolateAttrs, srcDescs, dstDescs, attr); + aclInterpolateAttrs = interpolateAttrs; + auto& coord_mode = aclInterpolateAttrs.coordTransMode; + auto& inter_mode = aclInterpolateAttrs.mode; + acl_coord = arm_compute::SamplingPolicy::TOP_LEFT; + auto& out_shape = dstDescs[0]->getShape().getDims(); + + if ((coord_mode == InterpolateCoordTransMode::pytorch_half_pixel && out_shape[2] > 1 && out_shape[3] > 1) || + coord_mode == InterpolateCoordTransMode::half_pixel) { + acl_coord = arm_compute::SamplingPolicy::CENTER; + } + + switch (inter_mode) { + case InterpolateMode::linear: + case InterpolateMode::linear_onnx: + acl_policy = arm_compute::InterpolationPolicy::BILINEAR; + break; + case InterpolateMode::nearest: + acl_policy = arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR; + break; + default: + return false; + } + + auto srcDims = srcDescs[0]->getShape().getStaticDims(); + auto dstDims = dstDescs[0]->getShape().getStaticDims(); + auto srcTensorInfo = arm_compute::TensorInfo(interpolateShapeCast(srcDims), 1, + precisionToAclDataType(srcDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(srcDescs[0])); + auto dstTensorInfo = arm_compute::TensorInfo(interpolateShapeCast(dstDims), 1, + precisionToAclDataType(dstDescs[0]->getPrecision()), + getAclDataLayoutByMemoryDesc(dstDescs[0])); + + if (!arm_compute::NEScale::validate(&srcTensorInfo, + &dstTensorInfo, + arm_compute::ScaleKernelInfo(acl_policy, + arm_compute::BorderMode::REPLICATE, + arm_compute::PixelValue(), + acl_coord, + false, + coord_mode == InterpolateCoordTransMode::align_corners))) + return false; + + srcTensor.allocator()->init(srcTensorInfo); + dstTensor.allocator()->init(dstTensorInfo); + + acl_scale = std::make_unique(); + acl_scale->configure(&srcTensor, &dstTensor, arm_compute::ScaleKernelInfo(acl_policy, + arm_compute::BorderMode::REPLICATE, + arm_compute::PixelValue(), + acl_coord, + false, + aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners)); + return true; +} + +void ov::intel_cpu::ACLInterpolateExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { + auto in_ptr_ = padPreprocess(src, dst); + srcTensor.allocator()->import_memory(const_cast(reinterpret_cast(in_ptr_))); + dstTensor.allocator()->import_memory(dst[0]->GetPtr()); + + acl_scale->run(); + + srcTensor.allocator()->free(); + dstTensor.allocator()->free(); +} + +bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupportedConfiguration( + const 
ov::intel_cpu::InterpolateAttrs &interpolateAttrs, const std::vector &srcDescs, + const std::vector &dstDescs) { + auto& inp_shape = srcDescs[0]->getShape().getDims(); + auto& out_shape = dstDescs[0]->getShape().getDims(); + + float scale_h = static_cast(out_shape[2]) / inp_shape[2]; + float scale_w = static_cast(out_shape[3]) / inp_shape[3]; + bool is_upsample = scale_h > 1 && scale_w > 1; + + auto& coord_mode = interpolateAttrs.coordTransMode; + auto& nearest_mode = interpolateAttrs.nearestMode; + + if (coord_mode == InterpolateCoordTransMode::asymmetric && + nearest_mode == InterpolateNearestMode::floor) { + return is_upsample; + } + + if (coord_mode == InterpolateCoordTransMode::align_corners && + nearest_mode == InterpolateNearestMode::round_prefer_ceil) { + return true; + } + + if (coord_mode == InterpolateCoordTransMode::half_pixel && + (nearest_mode == InterpolateNearestMode::simple || nearest_mode == InterpolateNearestMode::round_prefer_ceil)) { + return false; + } + + if (coord_mode == InterpolateCoordTransMode::asymmetric && + (nearest_mode == InterpolateNearestMode::simple || nearest_mode == InterpolateNearestMode::floor)) { + return is_upsample; + } + + if (is_upsample) { + bool int_factor = scale_h == static_cast(scale_h) && scale_w == static_cast(scale_w); + if (int_factor && coord_mode != InterpolateCoordTransMode::asymmetric && + (nearest_mode == InterpolateNearestMode::round_prefer_ceil + || nearest_mode == InterpolateNearestMode::round_prefer_floor)) { + return true; + } + } else if (scale_h < 1 && scale_w < 1) { + float down_scale_h = static_cast(inp_shape[2]) / out_shape[2]; + float down_scale_w = static_cast(inp_shape[3]) / out_shape[3]; + bool int_factor = down_scale_h == static_cast(down_scale_h) && down_scale_w == static_cast(down_scale_w); + + if (int_factor && coord_mode != InterpolateCoordTransMode::align_corners && + nearest_mode == InterpolateNearestMode::simple) { + return true; + } + + if (int_factor && nearest_mode == InterpolateNearestMode::round_prefer_ceil && + ((out_shape[2] > 1 && out_shape[3] > 1) || coord_mode != InterpolateCoordTransMode::half_pixel)) { + return true; + } + } + return false; +} + +bool ov::intel_cpu::ACLInterpolateExecutorBuilder::isSupported(const ov::intel_cpu::InterpolateAttrs &interpolateAttrs, + const std::vector &srcDescs, + const std::vector &dstDescs) const { + if (srcDescs[0]->getShape().getDims().size() != 4) { + return false; + } + + auto& pads_begin = interpolateAttrs.padBegin; + auto& pads_end = interpolateAttrs.padEnd; + + if (!std::all_of(pads_begin.begin(), pads_begin.end(), [](int i){return i == 0;}) || + !std::all_of(pads_end.begin(), pads_end.end(), [](int i){return i == 0;})) { + return false; + } + + auto& nearest_mode = interpolateAttrs.nearestMode; + auto& coord_mode = interpolateAttrs.coordTransMode; + if (interpolateAttrs.antialias || + coord_mode == InterpolateCoordTransMode::tf_half_pixel_for_nn || + nearest_mode == InterpolateNearestMode::ceil) { + return false; + } + + if (interpolateAttrs.mode == InterpolateMode::cubic) { + return false; + } + + if (interpolateAttrs.mode == InterpolateMode::nearest && + !isSupportedConfiguration(interpolateAttrs, srcDescs, dstDescs)) { + return false; + } + + if (coord_mode == InterpolateCoordTransMode::pytorch_half_pixel) { + return false; + } + return true; +} diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp new file mode 100644 index 00000000000000..9850d390d843fd --- 
/dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.hpp @@ -0,0 +1,52 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "../interpolate.hpp" + +namespace ov { +namespace intel_cpu { + +class ACLInterpolateExecutor : public InterpolateExecutor { +public: + ACLInterpolateExecutor(const ExecutorContext::CPtr context) : InterpolateExecutor(context) {} + + bool init(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr &attr) override; + + void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) override; + + impl_desc_type getImplType() const override { + return implType; + } + +private: + impl_desc_type implType = impl_desc_type::acl; + InterpolateAttrs aclInterpolateAttrs; + arm_compute::SamplingPolicy acl_coord; + arm_compute::InterpolationPolicy acl_policy; + bool antialias{}; + arm_compute::Tensor srcTensor, dstTensor; + std::unique_ptr acl_scale; +}; + +class ACLInterpolateExecutorBuilder : public InterpolateExecutorBuilder { +public: + bool isSupported(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const override; + + InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override { + return std::make_shared(context); + } +private: + static bool isSupportedConfiguration(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs); +}; +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_interpolate.cpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_interpolate.cpp new file mode 100644 index 00000000000000..751050f4bf8775 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_interpolate.cpp @@ -0,0 +1,380 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ref_interpolate.hpp" +#include "ie_parallel.hpp" +#include "nodes/common/cpu_memcpy.h" +#include "utils/bfloat16.hpp" + +void ov::intel_cpu::RefInterpolateExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { + size_t N = srcDimPad5d[0], C = srcDimPad5d[1], ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; + size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; + + auto in_ptr_ = padPreprocess(src, dst); + auto out_ptr_ = static_cast(dst[0]->GetPtr()); + + switch (interpAttrs.mode) { + case InterpolateMode::nearest: { + NNRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); + break; + } + case InterpolateMode::linear_onnx: { + linearOnnxRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); + break; + } + case InterpolateMode::cubic: { + cubicRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); + break; + } + case InterpolateMode::linear: { + float fz = (dataRank == 5) ? 
dataScales[dataRank - 3] : 1.f; + float fy = dataScales[dataRank - 2]; + float fx = dataScales[dataRank - 1]; + + bool isDownsample = (fx < 1.f) || (fy < 1.f) || (fz < 1.f); + int kernel_width = 2; + linearInterpolation(in_ptr_, out_ptr_, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, isDownsample && antialias); + break; + } + default: { + IE_THROW() << "Interpolate layer has unsupported interpolate mode: " << interpAttrs.mode; + } + } +} + +void ov::intel_cpu::RefInterpolateExecutor::NNRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, + int OD, int OH, int OW) { + int *index_d = static_cast(&indexTable[0]); + int *index_h = static_cast(&indexTable[OD]); + int *index_w = static_cast(&indexTable[OD + OH]); + + const float *in_ptr_f32 = reinterpret_cast(in_ptr_); + float *out_ptr_f32 = reinterpret_cast(out_ptr_); + + parallel_for3d(B, C, OD, [&](size_t b, size_t c, size_t od) { + const float *in_ptr = in_ptr_f32 + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]); + float *out_ptr = out_ptr_f32 + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od); + for (int oh = 0; oh < OH; oh++) { + const float *in_ptr_h = in_ptr + (IW * index_h[oh]); + float *out_ptr_h = out_ptr + (OW * oh); + for (int ow = 0; ow < OW; ow++) { + out_ptr_h[ow] = in_ptr_h[index_w[ow]]; + } + } + }); +} + +void ov::intel_cpu::RefInterpolateExecutor::linearOnnxRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, + int OD, int OH, int OW) { + std::vector indexPtr(MAX_INPUT_INTERPOLATE, 0); + std::vector weightPtr(MAX_INPUT_INTERPOLATE, 0); + // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, + // EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7 + // weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 + + int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 4 : 2); + int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16); + + indexPtr[0] = static_cast(&indexTable[0]); + indexPtr[1] = static_cast(&indexTable[OW * OH * OD]); + weightPtr[0] = reinterpret_cast(&indexTable[scratchLen]); + weightPtr[1] = reinterpret_cast(&indexTable[scratchLen + OW * OH * OD]); + if (spatialDimSize > 1) { + indexPtr[2] = static_cast(&indexTable[2 * OW * OH * OD]); + indexPtr[3] = static_cast(&indexTable[3 * OW * OH * OD]); + weightPtr[2] = reinterpret_cast(&indexTable[scratchLen + 2 * OW * OH * OD]); + weightPtr[3] = reinterpret_cast(&indexTable[scratchLen + 3 * OW * OH * OD]); + } + if (spatialDimSize > 2) { + indexPtr[4] = static_cast(&indexTable[4 * OW * OH * OD]); + indexPtr[5] = static_cast(&indexTable[5 * OW * OH * OD]); + indexPtr[6] = static_cast(&indexTable[6 * OW * OH * OD]); + indexPtr[7] = static_cast(&indexTable[7 * OW * OH * OD]); + weightPtr[4] = reinterpret_cast(&indexTable[scratchLen + 4 * OW * OH * OD]); + weightPtr[5] = reinterpret_cast(&indexTable[scratchLen + 5 * OW * OH * OD]); + } + + const float *in_ptr_f32 = reinterpret_cast(in_ptr_); + float *out_ptr_f32 = reinterpret_cast(out_ptr_); + + parallel_for2d(B, C, [&](size_t b, size_t c) { + float *out_ptr_nc = out_ptr_f32 + (OD * OH * OW * C * b + OD * OH * OW * c); + const float *in_ptr_nc = in_ptr_f32 + (ID * IH * IW * C * b + ID * IH * IW * c); + // do not combined 1d/2d to 3d unified process to get rid of invalid computing. 
+ switch (spatialDimSize) { + case 1: + for (int i = 0; i < OW; i++) { + float src0 = in_ptr_nc[indexPtr[0][i]]; + float src1 = in_ptr_nc[indexPtr[1][i]]; + + out_ptr_nc[i] = src0 * weightPtr[0][i] + + src1 * weightPtr[1][i]; + } + break; + case 2: + for (int i = 0; i < OH * OW; i++) { + float src00 = in_ptr_nc[indexPtr[0][i]]; + float src01 = in_ptr_nc[indexPtr[1][i]]; + float src10 = in_ptr_nc[indexPtr[2][i]]; + float src11 = in_ptr_nc[indexPtr[3][i]]; + + out_ptr_nc[i] = src00 * weightPtr[2][i] * weightPtr[0][i] + + src01 * weightPtr[2][i] * weightPtr[1][i] + + src10 * weightPtr[3][i] * weightPtr[0][i] + + src11 * weightPtr[3][i] * weightPtr[1][i]; + } + break; + case 3: + for (int i = 0; i < OD * OH * OW; i++) { + float src000 = in_ptr_nc[indexPtr[0][i]]; + float src001 = in_ptr_nc[indexPtr[1][i]]; + float src010 = in_ptr_nc[indexPtr[2][i]]; + float src011 = in_ptr_nc[indexPtr[3][i]]; + float src100 = in_ptr_nc[indexPtr[4][i]]; + float src101 = in_ptr_nc[indexPtr[5][i]]; + float src110 = in_ptr_nc[indexPtr[6][i]]; + float src111 = in_ptr_nc[indexPtr[7][i]]; + + // float dstValue = + // weightPtr[4][i] * weightPtr[2][i] * weightPtr[0][i] * src000 + + // weightPtr[4][i] * weightPtr[2][i] * weightPtr[1][i] * src001 + + // weightPtr[4][i] * weightPtr[3][i] * weightPtr[0][i] * src010 + + // weightPtr[4][i] * weightPtr[3][i] * weightPtr[1][i] * src011 + + // weightPtr[5][i] * weightPtr[2][i] * weightPtr[0][i] * src100 + + // weightPtr[5][i] * weightPtr[2][i] * weightPtr[1][i] * src101 + + // weightPtr[5][i] * weightPtr[3][i] * weightPtr[0][i] * src110 + + // weightPtr[5][i] * weightPtr[3][i] * weightPtr[1][i] * src111; + + out_ptr_nc[i] = + weightPtr[4][i] * (weightPtr[2][i] * (weightPtr[0][i] * src000 + + weightPtr[1][i] * src001) + + weightPtr[3][i] * (weightPtr[0][i] * src010 + + weightPtr[1][i] * src011)) + + weightPtr[5][i] * (weightPtr[2][i] * (weightPtr[0][i] * src100 + + weightPtr[1][i] * src101) + + weightPtr[3][i] * (weightPtr[0][i] * src110 + + weightPtr[1][i] * src111)); + } + break; + default: + break; + } + }); +} + +void ov::intel_cpu::RefInterpolateExecutor::cubicRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW) { + const int idxNum = 1; + int *xOrigin = static_cast(&indexTable[0]); + float *xFactor = reinterpret_cast(&indexTable[OW]); + int *yOrigin = static_cast(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW]); + float *yFactor = reinterpret_cast(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); + + const float *in_ptr_f32 = reinterpret_cast(in_ptr_); + float *out_ptr_f32 = reinterpret_cast(out_ptr_); + + parallel_for4d(B, C, OH, OW, [&](size_t n, size_t c, size_t oy, size_t ox) { + const float *in_ptr_nc = in_ptr_f32 + (IW * IH * C * n + IW * IH * c); + float *out_ptr_nc = out_ptr_f32 + (OW * OH * C * n + OW * OH * c); + + int iy = yOrigin[oy]; + int ix = xOrigin[ox]; + + float retY = 0.f; + for (int y = iy - 1, i = 0; y <= iy + 2; y++, i++) { + int yInRange = std::max(0, std::min(y, IH - 1)); + const float *in_ptr_nch = in_ptr_nc + IW * yInRange; + float retX = 0.f; + for (int x = ix - 1, j = 0; x <= ix + 2; x++, j++) { + int xInRange = std::max(0, std::min(x, IW - 1)); + retX += xFactor[ox * CUBIC_GRID_LEN + j] * in_ptr_nch[xInRange]; + } + retY += yFactor[oy * CUBIC_GRID_LEN + i] * retX; + } + out_ptr_nc[oy * OW + ox] = retY; + }); +} + +float ov::intel_cpu::RefInterpolateExecutor::getValue(const uint8_t *base, size_t offset, InferenceEngine::Precision prec) { + const uint8_t *baseOffset = base + offset; + switch (prec) { + case 
Precision::U8: { + return static_cast(*baseOffset); + break; + } + case Precision::I8: { + const int8_t *valuePtr = reinterpret_cast(baseOffset); + return static_cast(*valuePtr); + break; + } + case Precision::BF16: { + const uint16_t *valuePtr = reinterpret_cast(baseOffset); + return bfloat16_t::from_bits(*valuePtr); + break; + } + case Precision::FP32: { + const float *valuePtr = reinterpret_cast(baseOffset); + return *valuePtr; + break; + } + default: { + IE_THROW() << "Interpolate layer does not support precision: " << prec; + break; + } + } +} + +void ov::intel_cpu::RefInterpolateExecutor::setValue(uint8_t *base, size_t offset, float value, InferenceEngine::Precision prec) { + uint8_t *baseOffset = base + offset; + switch (prec) { + case Precision::U8: { + uint8_t data = static_cast(value < 0 ? 0 : value); + cpu_memcpy(baseOffset, &data, 1); + break; + } + case Precision::I8: { + int8_t data = static_cast(value); + cpu_memcpy(baseOffset, &data, 1); + break; + } + case Precision::BF16: { + uint16_t data = bfloat16_t(value).to_bits(); + cpu_memcpy(baseOffset, &data, 2); + break; + } + case Precision::FP32: { + cpu_memcpy(baseOffset, &value, sizeof(float)); + break; + } + default: { + IE_THROW() << "Interpolate layer does not support precision: " << prec; + break; + } + } +} + +void ov::intel_cpu::RefInterpolateExecutor::linearInterpolation(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, + float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias) { + if (IW == OW && IH == OH && ID == OD) { + size_t spatialDimSize = IW * IH * ID; + // TODO: enable when fusing into interp with linear mode will support + if (/*fusedWith.empty() &&*/ interpAttrs.inPrc == interpAttrs.outPrc) { + size_t size = B * C * spatialDimSize * srcDataSize; + cpu_memcpy(out_ptr_, in_ptr_, size); + } else { + parallel_for2d(B, C, [&](size_t b, size_t c) { + const uint8_t *in_ptr_nc = in_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * srcDataSize; + uint8_t *out_ptr_nc = out_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * dstDataSize; + for (size_t i = 0; i < spatialDimSize; i++) { + float dstValue = getValue(in_ptr_nc, i * srcDataSize, interpAttrs.inPrc); + setValue(out_ptr_nc, i * dstDataSize, dstValue, interpAttrs.outPrc); + } + }); + } + return; + } + + float ax = antialias ? fx : 1.0f; + float ay = antialias ? fy : 1.0f; + float az = antialias ? fz : 1.0f; + + int rx = (fx > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / ax)); + int ry = (fy > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / ay)); + int rz = (fz > 1.0f) ? 
2 : static_cast(ceil(static_cast(kernel_width) / az)); + + int diaOD = 2 * rz + 1; + int diaOH = 2 * ry + 1; + int diaOW = 2 * rx + 1; + int sizeOD = OD * diaOD; + int sizeOH = OH * diaOH; + int sizeOW = OW * diaOW; + + float *weightTable = reinterpret_cast(&indexTable[0]); + float *weightOD = static_cast(&weightTable[0]); + float *weightOH = static_cast(&weightTable[sizeOD]); + float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); + + int *idxTable = static_cast(&indexTable[sizeOD + sizeOH + sizeOW]); + int *idxOD = static_cast(&idxTable[0]); + int *idxOH = static_cast(&idxTable[sizeOD]); + int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); + + parallel_for2d(B, C, [&](size_t b, size_t c) { + const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c) * srcDataSize; + uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c) * dstDataSize; + for (size_t oz = 0; oz < OD; oz++) { + uint8_t *out_ptr_ncd = out_ptr_nc + (OW * OH * oz) * dstDataSize; + for (size_t oy = 0; oy < OH; oy++) { + uint8_t *out_ptr_ncdh = out_ptr_ncd + (OW * oy) * dstDataSize; + for (size_t ox = 0; ox < OW; ox++) { + float sum = 0.f; + float wsum = 0.f; + + // this comment explains the original algo. + // for (int z = iz_r - rz; z <= iz_r + rz; z++) { + // for (int y = iy_r - ry; y <= iy_r + ry; y++) { + // for (int x = ix_r - rx; x <= ix_r + rx; x++) { + // bool is_continue = z < 0 || + // y < 0 || + // x < 0 || + // z >= static_cast(ID) || + // y >= static_cast(IH) || + // x >= static_cast(IW); + // if (is_continue) + // continue; + + // float dx = ix - x; + // float dy = iy - y; + // float dz = iz - z; + + // float w = ax * triangleCoeff(ax * dx) * + // ay * triangleCoeff(ay * dy) * + // az * triangleCoeff(az * dz); + + // sum += w * getValue(in_ptr_nc, (z * IH * IW + y * IW + x) * srcDataSize, inputPrec); + // wsum += w; + // } + // } + //} + + for (int iz = 0; iz < diaOD; iz++) { + if (weightOD[oz * diaOD + iz] == 0.f) + continue; + for (int iy = 0; iy < diaOH; iy++) { + if (weightOH[oy * diaOH + iy] == 0.f) { + continue; + } + for (int ix = 0; ix < diaOW; ix++) { + if (weightOW[ox * diaOW + ix] == 0.f) { + continue; + } + float w = weightOD[oz * diaOD + iz] * weightOH[oy * diaOH + iy] * weightOW[ox * diaOW + ix]; + float value = getValue(in_ptr_nc, + (idxOD[oz * diaOD + iz] * IH * IW + idxOH[oy * diaOH + iy] * IW + idxOW[ox * diaOW + ix]) + * srcDataSize, interpAttrs.inPrc); + + sum += w * value; + wsum += w; + } + } + } + + if (!wsum) { + setValue(out_ptr_ncdh, ox * dstDataSize, 0.f, interpAttrs.outPrc); + } else { + float dst_value = sum / wsum; + setValue(out_ptr_ncdh, ox * dstDataSize, dst_value, interpAttrs.outPrc); + } + } + } + } + }); +} + +bool ov::intel_cpu::RefInterpolateExecutor::init(const ov::intel_cpu::InterpolateAttrs &interpolateAttrs, + const std::vector &srcDescs, + const std::vector &dstDescs, + const dnnl::primitive_attr &attr) { + return InterpolateExecutor::init(interpolateAttrs, srcDescs, dstDescs, attr); +} + diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_interpolate.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_interpolate.hpp new file mode 100644 index 00000000000000..ba2608324b19bb --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_interpolate.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "../interpolate.hpp" + +namespace ov { +namespace intel_cpu { + +class RefInterpolateExecutor : public 
InterpolateExecutor { +public: + RefInterpolateExecutor(const ExecutorContext::CPtr context) : InterpolateExecutor(context) {} + + bool init(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr &attr) override; + + void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) override; + + impl_desc_type getImplType() const override { + return implType; + } + +private: + void NNRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); + void linearOnnxRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); + + void cubicRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW); + void linearInterpolation(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, + float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias); + + static float getValue(const uint8_t *base, size_t offset, InferenceEngine::Precision prec); + static void setValue(uint8_t *base, size_t offset, float value, InferenceEngine::Precision prec); + +private: + impl_desc_type implType = impl_desc_type::ref; + bool antialias; + std::vector dataScales; +}; + +class RefInterpolateExecutorBuilder : public InterpolateExecutorBuilder { +public: + bool isSupported(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const override { + return true; + } + + InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override { + return std::make_shared(context); + } +}; +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp new file mode 100644 index 00000000000000..cd1e0ccd00aa8b --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp @@ -0,0 +1,528 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "interpolate.hpp" +#include "ie_parallel.hpp" +#include "nodes/common/cpu_memcpy.h" +#include "emitters/x64/jit_load_store_emitters.hpp" + +bool ov::intel_cpu::InterpolateExecutor::init(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr &attr) { + const auto &srcDims = srcDescs[0]->getShape().getStaticDims(); + const auto &dstDims = dstDescs[0]->getShape().getStaticDims(); + interpAttrs = interpolateAttrs; + srcDimPad5d = to5Dim(getPaddedInputShape(srcDims, interpolateAttrs.padBegin, interpolateAttrs.padEnd)); + dstDim5d = to5Dim(dstDims); + srcDataSize = interpolateAttrs.inPrc.size(); + dstDataSize = interpolateAttrs.outPrc.size(); + dataRank = srcDims.size(); + spatialDimSize = getSpatialDimsNum(dataRank); + + switch (interpAttrs.mode) { + case InterpolateMode::nearest: { + buildTblNN(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout, interpolateAttrs.nearestMode); + break; + } + case InterpolateMode::linear_onnx: { + buildTblLinearOnnx(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout); + break; + } + case InterpolateMode::linear: { + static constexpr int LINEAR_KERNEL = 2; + buildTblLinear(srcDimPad5d, dstDim5d, interpAttrs.dataScales, LINEAR_KERNEL, interpolateAttrs.antialias); + break; + } + case InterpolateMode::cubic: { + 
buildTblCubic(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.cubeCoeff, interpolateAttrs.layout); + break; + } + default: { + IE_THROW() << "Interpolate executor does not support interpolate mode: " << interpAttrs.mode; + break; + } + } + return true; +} +// ===================================================================================================================== +// index layout: +// d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1 +void ov::intel_cpu::InterpolateExecutor::buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, + const std::vector& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) { + const int dimSize = dataRank; + float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f; + float fy = dataScales[dimSize - 2]; + float fx = dataScales[dimSize - 1]; + size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; + size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; + + indexTable.resize(OD + OH + OW); + bool isDDownsample = (fz < 1) ? true : false; + bool isHDownsample = (fy < 1) ? true : false; + bool isWDownsample = (fx < 1) ? true : false; + for (int oz = 0; oz < OD; oz++) { + float iz = coordTransToInput(oz, fz, ID, OD); + indexTable[oz] = nearestRound(iz, isDDownsample, nearestMode); + indexTable[oz] = clipCoord(indexTable[oz], ID); + } + for (int oy = 0; oy < OH; oy++) { + float iy = coordTransToInput(oy, fy, IH, OH); + indexTable[OD + oy] = nearestRound(iy, isHDownsample, nearestMode); + indexTable[OD + oy] = clipCoord(indexTable[OD + oy], IH); + } + for (int ox = 0; ox < OW; ox++) { + float ix = coordTransToInput(ox, fx, IW, OW); + indexTable[OD + OH + ox] = nearestRound(ix, isWDownsample, nearestMode); + indexTable[OD + OH + ox] = clipCoord(indexTable[OD + OH + ox], IW); + } +} + +// scale is float(outShape) / float(inShape) +// strictly consistent with onnx calc manner(div scale, not multiply inverse), given this is done offline +// the slight precison diff can produce obvious wrong value due to "nearest round" behavior for NN mode +float ov::intel_cpu::InterpolateExecutor::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const { + if (scale == 1.0f || (inShape == outShape)) { + return outCoord; + } + switch (interpAttrs.coordTransMode) { + case InterpolateCoordTransMode::half_pixel: { + return (outCoord + 0.5f) / scale - 0.5f; + break; + } + case InterpolateCoordTransMode::pytorch_half_pixel: { + if (outShape > 1) + return (outCoord + 0.5f) / scale - 0.5f; + else + return 0; + break; + } + case InterpolateCoordTransMode::asymmetric: { + return static_cast(outCoord) / scale; + break; + } + case InterpolateCoordTransMode::tf_half_pixel_for_nn: { + return (outCoord + 0.5f) / scale; + break; + } + case InterpolateCoordTransMode::align_corners: { + if (outShape > 1) + return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); + else + return 0; + break; + } + default: { + IE_THROW() << "errorPrefix" << " does not support specified coordinate transformation mode"; + break; + } + } +} + +int ov::intel_cpu::InterpolateExecutor::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const { + switch (nearestMode) { + case InterpolateNearestMode::round_prefer_floor: { + if (originCoord == (static_cast(originCoord) + 0.5f)) + return static_cast(std::floor(originCoord)); + else + return static_cast(std::round(originCoord)); + break; + } + case InterpolateNearestMode::round_prefer_ceil: { + 
return static_cast(std::round(originCoord)); + break; + } + case InterpolateNearestMode::floor: { + return static_cast(std::floor(originCoord)); + break; + } + case InterpolateNearestMode::ceil: { + return static_cast(std::ceil(originCoord)); + break; + } + case InterpolateNearestMode::simple: { + if (isDownsample) + return static_cast(std::ceil(originCoord)); + else + return static_cast(originCoord); + } + default: { + IE_THROW() << "errorPrefix" << " does not support specified nearest round mode"; + break; + } + } +} + +void ov::intel_cpu::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, int inShape, int outShape, + int& index0, int& index1, float& weight0, float& weight1) { + float inCoord = coordTransToInput(outCoord, scale, inShape, outShape); + inCoord = std::max(0.0f, std::min(inCoord, static_cast(inShape - 1))); + index0 = std::min(static_cast(inCoord), inShape - 1); + index1 = std::min(index0 + 1, inShape - 1); + + weight1 = std::fabs(inCoord - index0); + weight0 = std::fabs(inCoord - index1); + if (index0 == index1) { + weight0 = 0.5f; + weight1 = 0.5f; + } +} + +void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, + const std::vector& dataScales, InterpolateLayoutType layout) { + int dimSize = dataRank; + float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f; + float fy = (spatialDimSize > 1) ? dataScales[dimSize - 2] : 1.f; + float fx = dataScales[dimSize - 1]; + int ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; + int OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; + + std::vector indexPtr(MAX_INPUT_INTERPOLATE, 0); + std::vector weightPtr(MAX_INPUT_INTERPOLATE, 0); + if (layout == InterpolateLayoutType::planar) { + // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, + // EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7 + // weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 + int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 4 : 2); + int idxType = 2; + int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16); + indexTable.resize(idxType * scratchLen); + + indexPtr[0] = static_cast(&indexTable[0]); + indexPtr[1] = static_cast(&indexTable[OW * OH * OD]); + weightPtr[0] = reinterpret_cast(&indexTable[scratchLen]); + weightPtr[1] = reinterpret_cast(&indexTable[scratchLen + OW * OH * OD]); + if (spatialDimSize > 1) { + indexPtr[2] = static_cast(&indexTable[2 * OW * OH * OD]); + indexPtr[3] = static_cast(&indexTable[3 * OW * OH * OD]); + weightPtr[2] = reinterpret_cast(&indexTable[scratchLen + 2 * OW * OH * OD]); + weightPtr[3] = reinterpret_cast(&indexTable[scratchLen + 3 * OW * OH * OD]); + } + if (spatialDimSize > 2) { + indexPtr[4] = static_cast(&indexTable[4 * OW * OH * OD]); + indexPtr[5] = static_cast(&indexTable[5 * OW * OH * OD]); + indexPtr[6] = static_cast(&indexTable[6 * OW * OH * OD]); + indexPtr[7] = static_cast(&indexTable[7 * OW * OH * OD]); + weightPtr[4] = reinterpret_cast(&indexTable[scratchLen + 4 * OW * OH * OD]); + weightPtr[5] = reinterpret_cast(&indexTable[scratchLen + 5 * OW * OH * OD]); + } + int scale = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::sse41) ? 
srcDataSize : 1; + + for (int oz = 0; oz < OD; oz++) { + int izF, izE; + float weightF, weightE; + linearOnnxCF(oz, fz, ID, OD, izF, izE, weightF, weightE); + int idxOz = oz * OH * OW; + for (int oy = 0; oy < OH; oy++) { + int iyT, iyB; + float weightT, weightB; + linearOnnxCF(oy, fy, IH, OH, iyT, iyB, weightT, weightB); + int idxOzOy = idxOz + oy * OW; + for (int ox = 0; ox < OW; ox++) { + int ixL, ixR; + float weightL, weightR; + linearOnnxCF(ox, fx, IW, OW, ixL, ixR, weightL, weightR); + + int idxOzOyOx = idxOzOy + ox; + indexPtr[0][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixL) * scale; + indexPtr[1][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixR) * scale; + weightPtr[0][idxOzOyOx] = weightL; + weightPtr[1][idxOzOyOx] = weightR; + if (spatialDimSize > 1) { + indexPtr[2][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixL) * scale; + indexPtr[3][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixR) * scale; + weightPtr[2][idxOzOyOx] = weightT; + weightPtr[3][idxOzOyOx] = weightB; + } + if (spatialDimSize > 2) { + indexPtr[4][idxOzOyOx] = (izE * IH * IW + iyT * IW + ixL) * scale; + indexPtr[5][idxOzOyOx] = (izE * IH * IW + iyT * IW + ixR) * scale; + indexPtr[6][idxOzOyOx] = (izE * IH * IW + iyB * IW + ixL) * scale; + indexPtr[7][idxOzOyOx] = (izE * IH * IW + iyB * IW + ixR) * scale; + weightPtr[4][idxOzOyOx] = weightF; + weightPtr[5][idxOzOyOx] = weightE; + } + } + } + } + } else { + // index: left:OW right:OW Top:OH Bottom:OH, Front:OD, End:OD + // weight:same as index + size_t scratchLen = rnd_up(OW + OW + OH + OH + OD + OD, 16); + int idxType = 2; + indexTable.resize(idxType * scratchLen); + indexPtr[0] = static_cast(&indexTable[0]); + indexPtr[1] = static_cast(&indexTable[OW]); + indexPtr[2] = static_cast(&indexTable[2 * OW]); + indexPtr[3] = static_cast(&indexTable[2 * OW + OH]); + indexPtr[4] = static_cast(&indexTable[2 * OW + 2 * OH]); + indexPtr[5] = static_cast(&indexTable[2 * OW + 2 * OH + OD]); + + weightPtr[0] = reinterpret_cast(&indexTable[scratchLen]); + weightPtr[1] = reinterpret_cast(&indexTable[scratchLen + OW]); + weightPtr[2] = reinterpret_cast(&indexTable[scratchLen + 2 * OW]); + weightPtr[3] = reinterpret_cast(&indexTable[scratchLen + 2 * OW + OH]); + weightPtr[4] = reinterpret_cast(&indexTable[scratchLen + 2 * OW + 2 * OH]); + weightPtr[5] = reinterpret_cast(&indexTable[scratchLen + 2 * OW + 2 * OH + OD]); + + for (int ox = 0; ox < OW; ox++) { + linearOnnxCF(ox, fx, IW, OW, indexPtr[0][ox], indexPtr[1][ox], weightPtr[0][ox], weightPtr[1][ox]); + } + for (int oy = 0; oy < OH; oy++) { + linearOnnxCF(oy, fy, IH, OH, indexPtr[2][oy], indexPtr[3][oy], weightPtr[2][oy], weightPtr[3][oy]); + } + for (int oz = 0; oz < OD; oz++) { + linearOnnxCF(oz, fz, ID, OD, indexPtr[4][oz], indexPtr[5][oz], weightPtr[4][oz], weightPtr[5][oz]); + } + } +} + +// table layout: +// wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw +// | | +// wh0.....wh_diameter ih0.....ih_diameter +void ov::intel_cpu::InterpolateExecutor::buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, + const std::vector& dataScales, int kernel_width, bool antialias) { + int dimSize = dataRank; + float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f; + float fy = dataScales[dimSize - 2]; + float fx = dataScales[dimSize - 1]; + size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; + size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; + + if (!(IW == OW && IH == OH && ID == OD)) { + float ax = antialias ? 
fx : 1.0f; + float ay = antialias ? fy : 1.0f; + float az = antialias ? fz : 1.0f; + + int rx = (fx > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / ax)); + int ry = (fy > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / ay)); + int rz = (fz > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / az)); + + int diaOD = 2 * rz + 1; + int diaOH = 2 * ry + 1; + int diaOW = 2 * rx + 1; + int sizeOD = OD * diaOD; + int sizeOH = OH * diaOH; + int sizeOW = OW * diaOW; + indexTable.resize((sizeOD + sizeOH + sizeOW) * 2); + float *weightTable = reinterpret_cast(&indexTable[0]); + float *weightOD = static_cast(&weightTable[0]); + float *weightOH = static_cast(&weightTable[sizeOD]); + float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); + + int *idxTable = static_cast(&indexTable[sizeOD + sizeOH + sizeOW]); + int *idxOD = static_cast(&idxTable[0]); + int *idxOH = static_cast(&idxTable[sizeOD]); + int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); + + for (int oz = 0; oz < OD; oz++) { + float iz = coordTransToInput(oz, fz, ID, OD); + int iz_r = static_cast(std::round(iz)); + for (int r = iz_r - rz, i = 0; r <= iz_r + rz; r++, i++) { + idxOD[oz * diaOD + i] = r; + if (r < 0 || r >= static_cast(ID)) { + weightOD[oz * diaOD + i] = 0.f; + } else { + float dz = iz - r; + weightOD[oz * diaOD + i] = az * triangleCoeff(az * dz); + } + } + } + for (int oy = 0; oy < OH; oy++) { + float iy = coordTransToInput(oy, fy, IH, OH); + int iy_r = static_cast(std::round(iy)); + for (int r = iy_r - ry, i = 0; r <= iy_r + ry; r++, i++) { + idxOH[oy * diaOH + i] = r; + if (r < 0 || r >= static_cast(IH)) { + weightOH[oy * diaOH + i] = 0.f; + } else { + float dy = iy - r; + weightOH[oy * diaOH + i] = ay * triangleCoeff(ay * dy); + } + } + } + for (int ox = 0; ox < OW; ox++) { + float ix = coordTransToInput(ox, fx, IW, OW); + int ix_r = static_cast(std::round(ix)); + for (int r = ix_r - rx, i = 0; r <= ix_r + rx; r++, i++) { + idxOW[ox * diaOW + i] = r; + if (r < 0 || r >= static_cast(IW)) { + weightOW[ox * diaOW + i] = 0.f; + } else { + float dx = ix - r; + weightOW[ox * diaOW + i] = ax * triangleCoeff(ax * dx); + } + } + } + } +} + +std::vector ov::intel_cpu::InterpolateExecutor::getCubicCoeffs(float mantissa, float a) { + float m = std::fabs(mantissa); + std::vector coeffs(4, 0.f); + + coeffs[0] = a * (m - 1.0) * (m - 1.0) * m; + coeffs[1] = ((a + 2.0) * m - (a + 3.0)) * m * m + 1.0; + coeffs[2] = (((-a - 2.0) * m + (2.0 * a + 3.0)) * m - a) * m; + coeffs[3] = -a * m * m * (m - 1.0); + return coeffs; +} + +// table layout: +// OW OW OW OW OW OH OH OH OH OH +// x_idx x_weight0 x_weight1 x_weight2 x_weight3 y_idx y_weight0 y_weight1 y_weight2 y_weight3 +void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, + float cubicCoeff, InterpolateLayoutType layout) { + int dimSize = dataRank; + float fy = dataScales[dimSize - 2]; + float fx = dataScales[dimSize - 1]; + int IH = srcDimPad5d[3], IW = srcDimPad5d[4]; + int OH = dstDim5d[3], OW = dstDim5d[4]; + + // idxNum for index, CUBIC_GRID_LEN for weight + const int idxNum = 1; + size_t idxWeightSize = (CUBIC_GRID_LEN + idxNum) * OW + (CUBIC_GRID_LEN + idxNum) * OH; + if (layout != InterpolateLayoutType::planar) { + indexTable.resize(idxWeightSize); + } else { + size_t sequenceSize = 2 * OH * OW; + indexTable.resize(idxWeightSize + sequenceSize); + } + + int tblAdvance = 0; + int *xOrigin = static_cast(&indexTable[tblAdvance]); + tblAdvance += OW; + float *xFactor 
= reinterpret_cast(&indexTable[tblAdvance]); + for (int ox = 0; ox < OW; ox++) { + float ix = coordTransToInput(ox, fx, IW, OW); + int ix_r = static_cast(std::floor(ix)); + xOrigin[ox] = ix_r; + float m = ix - ix_r; + std::vector coffes = getCubicCoeffs(m, cubicCoeff); + xFactor[CUBIC_GRID_LEN * ox] = coffes[0]; + xFactor[CUBIC_GRID_LEN * ox + 1] = coffes[1]; + xFactor[CUBIC_GRID_LEN * ox + 2] = coffes[2]; + xFactor[CUBIC_GRID_LEN * ox + 3] = coffes[3]; + } + + tblAdvance += CUBIC_GRID_LEN * OW; + int *yOrigin = static_cast(&indexTable[tblAdvance]); + tblAdvance += OH; + float *yFactor = reinterpret_cast(&indexTable[tblAdvance]); + for (int oy = 0; oy < OH; oy++) { + float iy = coordTransToInput(oy, fy, IH, OH); + int iy_r = static_cast(std::floor(iy)); + yOrigin[oy] = iy_r; + float m = iy - iy_r; + std::vector coffes = getCubicCoeffs(m, cubicCoeff); + yFactor[CUBIC_GRID_LEN * oy] = coffes[0]; + yFactor[CUBIC_GRID_LEN * oy + 1] = coffes[1]; + yFactor[CUBIC_GRID_LEN * oy + 2] = coffes[2]; + yFactor[CUBIC_GRID_LEN * oy + 3] = coffes[3]; + } + + if (layout == InterpolateLayoutType::planar) { + tblAdvance += CUBIC_GRID_LEN * OH; + int *sequenceOH = static_cast(&indexTable[tblAdvance]); + tblAdvance += OH * OW; + int *sequenceOW = static_cast(&indexTable[tblAdvance]); + for (int h = 0; h < OH; ++h) { + int offset = h * OW; + for (int w = 0; w < OW; ++w) { + sequenceOH[offset + w] = h * sizeof(int); + sequenceOW[offset + w] = w * sizeof(int); + } + } + } +} + +// shapeND: n c d h w +// blockND: ncdhw cdhw dhw hw w 1 +// index : 0 1 2 3 4 5 +inline SizeVector getBlockND(const SizeVector& shape) { + int shapeRank = shape.size(); + SizeVector blockND(shapeRank + 1, 1); + for (int i = shapeRank - 1; i >= 0; i--) { + blockND[i] = shape[i] * blockND[i+1]; + } + return blockND; +} + +const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vector& src, const std::vector& dst) { + const uint8_t *src_data_origin = reinterpret_cast(src[0]->GetData()); + + const auto &srcDim = src[0]->getStaticDims(); + const auto &dstDim = dst[0]->getStaticDims(); + size_t dimSize = srcDim.size(); + auto srcDimPad = getSrcDimPad5d(); + + const auto srcDim5d = to5Dim(srcDim); + const auto srcDimPad5d = to5Dim(srcDimPad); + const auto dstDim5d = to5Dim(dstDim); + const auto srcDataSize = src[0]->getDesc().getPrecision().size(); + + const uint8_t *src_data = nullptr; + std::vector srcPadded; + if (interpAttrs.hasPad) { + int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0; + int padB1 = (dimSize > 2) ? interpAttrs.padBegin[1] : 0; + int padB2 = (dimSize == 5) ? 
interpAttrs.padBegin[dimSize - 3] : 0; + int padB3 = interpAttrs.padBegin[dimSize - 2]; + int padB4 = interpAttrs.padBegin[dimSize - 1]; + + SizeVector inShapeBlock = getBlockND(srcDim5d); + SizeVector inShapePadBlock = getBlockND(srcDimPad5d); + + if (interpAttrs.layout == InterpolateLayoutType::planar) { + srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); + uint8_t *src_data_pad = static_cast(&srcPadded[0]); + parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) { + const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize; + uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + + inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize; + cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); + }); + src_data = src_data_pad; + } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) { + srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); + uint8_t *src_data_pad = static_cast(&srcPadded[0]); + parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) { + const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + + (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize; + uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) + + inShapePadBlock[4] * (h + padB3) + + inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize; + cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); + }); + src_data = src_data_pad; + } else if (interpAttrs.layout == InterpolateLayoutType::block) { + size_t blkSize = mayiuse(cpu::x64::avx512_core) ? 
16 : 8; + size_t CB = div_up(srcDimPad5d[1], blkSize); + size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize; + srcPadded.resize(eltsTotal * srcDataSize, 0x0); + uint8_t *src_data_pad = static_cast(&srcPadded[0]); + if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) { + IE_THROW() << "Interpolate layer with name does not support padding on batch and channel dimensions"; + } + parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) { + const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (h * srcDim5d[4] * blkSize) * srcDataSize + + (w * blkSize) * srcDataSize; + uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize + + ((w + padB4) * blkSize) * srcDataSize; + cpu_memcpy(srcPad, src, blkSize * srcDataSize); + }); + src_data = src_data_pad; + } + } else { + src_data = src_data_origin; + } + return src_data; +} diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp new file mode 100644 index 00000000000000..214f67460c50f5 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp @@ -0,0 +1,187 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +#define MAX_INPUT_INTERPOLATE 8 + +using namespace InferenceEngine; + +namespace ov { +namespace intel_cpu { + +enum InterpolateLayoutType { + planar, + block, + by_channel +}; + +enum InterpolateMode { + nearest, + linear, + linear_onnx, + cubic +}; + +enum InterpolateCoordTransMode { + half_pixel, + pytorch_half_pixel, + asymmetric, + tf_half_pixel_for_nn, + align_corners +}; + +enum class InterpolateNearestMode { + round_prefer_floor, + round_prefer_ceil, + floor, + ceil, + simple +}; + +enum class InterpolateShapeCalcMode { + sizes, + scales +}; + +struct InterpolateAttrs { + InterpolateMode mode = InterpolateMode::nearest; + InterpolateCoordTransMode coordTransMode = InterpolateCoordTransMode::half_pixel; + InterpolateNearestMode nearestMode = InterpolateNearestMode::round_prefer_floor; + bool antialias = false; + float cubeCoeff = -0.75; + std::vector padBegin; + std::vector padEnd; + InferenceEngine::Precision inPrc; + InferenceEngine::Precision outPrc; + InterpolateLayoutType layout; + std::vector dataScales; + bool hasPad = false; +}; + +inline SizeVector getPaddedInputShape(const VectorDims &srcDims, + const std::vector &padBegin, + const std::vector &padEnd) { + SizeVector paddedShape; + int dataRank = srcDims.size(); + for (int i = 0; i < dataRank; i++) { + paddedShape.push_back(srcDims[i] + padBegin[i] + padEnd[i]); + } + return paddedShape; +} + +inline int clipCoord(int pos, int length) { + return std::max(static_cast(0), std::min(pos, length - 1)); +} + +inline size_t getSpatialDimsNum(const Dim rank) { + switch (rank) { + case 1: + case 3: + return 1; + case 2: + case 4: + return 2; + case 5: + return 3; + default: + IE_THROW() << "Can't 
define number spatial"; + } +} + +// w/hw/ncw/nchw/ncdhw to ncdhw +inline SizeVector to5Dim(SizeVector casesDim) { + size_t caseSize = casesDim.size(); + SizeVector dim5(5, 1lu); + dim5[4] = casesDim[caseSize - 1]; + if (caseSize > 1) { + dim5[3] = casesDim[caseSize - 2]; + } + if (caseSize > 2) { + dim5[0] = casesDim[0]; + } + if (caseSize > 3) { + dim5[1] = casesDim[1]; + } + if (caseSize > 4) { + dim5[2] = casesDim[2]; + } + if (caseSize == 3) { // nhw -> ncw + dim5[1] = dim5[3]; + dim5[3] = 1lu; + } + return dim5; +} + +static inline float triangleCoeff(float x) { + return (std::max)(0.0f, 1 - std::abs(x)); +} + +class InterpolateExecutor { +public: + static constexpr size_t DATA_ID = 0; + static constexpr size_t TARGET_SHAPE_ID = 1; + static constexpr size_t SCALES_ID = 2; + static constexpr size_t AXES_ID = 3; + static constexpr int CUBIC_GRID_LEN = 4; + InterpolateExecutor(const ExecutorContext::CPtr context) : _context(context) {} + + virtual bool init(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr &attr); + virtual void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) = 0; + virtual impl_desc_type getImplType() const = 0; + + virtual ~InterpolateExecutor() = default; + VectorDims getSrcDimPad5d() const { return srcDimPad5d; } + const uint8_t* padPreprocess(const std::vector& src, const std::vector& dst); + +private: + void buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, + InterpolateLayoutType layout, InterpolateNearestMode nearestMode); + void buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, + InterpolateLayoutType layout); + void buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, int kernel_width, + bool antialias); + void buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, float cubicCoeff, + InterpolateLayoutType layout); + + float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const; + int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const; + void linearOnnxCF(int outCoord, float scale, int inShape, int outShape, int& index0, int& index1, float& weight0, float& weight1); + std::vector getCubicCoeffs(float mantissa, float a); + +protected: + InterpolateAttrs interpAttrs; + VectorDims srcDimPad5d, dstDim5d; + size_t srcDataSize, dstDataSize; + int spatialDimSize; + size_t dataRank; + std::vector indexTable; + const ExecutorContext::CPtr _context; +}; + +using InterpolateExecutorPtr = std::shared_ptr; +using InterpolateExecutorCPtr = std::shared_ptr; + +class InterpolateExecutorBuilder { +public: + ~InterpolateExecutorBuilder() = default; + virtual bool isSupported(const InterpolateAttrs& InterpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const = 0; + virtual InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0; +}; + +using InterpolateExecutorBuilderPtr = std::shared_ptr; +using InterpolateExecutorBuilderCPtr = std::shared_ptr; +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp new file mode 100644 index 00000000000000..b3f4fecac0f8cd --- /dev/null +++ 
b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "interpolate_list.hpp" + +namespace ov { +namespace intel_cpu { + +const std::vector& getInterpolateExecutorsList() { + static std::vector descs = { + OV_CPU_INSTANCE_X64(ExecutorType::x64, std::make_shared()) + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) + OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared()) + }; + + return descs; +} + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp new file mode 100644 index 00000000000000..978d38053daee9 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp @@ -0,0 +1,105 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "executor.hpp" + +#include "interpolate.hpp" +#if defined(OV_CPU_WITH_ACL) +#include "acl/acl_interpolate.hpp" +#endif +#include "common/ref_interpolate.hpp" +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) +#include "x64/jit_interpolate.hpp" +#endif + +#include "onednn/iml_type_mapper.h" +#include "common/primitive_cache.hpp" + +namespace ov { +namespace intel_cpu { + +struct InterpolateExecutorDesc { + ExecutorType executorType; + InterpolateExecutorBuilderCPtr builder; +}; + +const std::vector& getInterpolateExecutorsList(); + +class InterpolateExecutorFactory : public ExecutorFactory { +public: + InterpolateExecutorFactory(const InterpolateAttrs& InterpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) : ExecutorFactory(context) { + for (auto& desc : getInterpolateExecutorsList()) { + if (desc.builder->isSupported(InterpolateAttrs, srcDescs, dstDescs)) { + supportedDescs.push_back(desc); + } + } + } + + ~InterpolateExecutorFactory() = default; + virtual InterpolateExecutorPtr makeExecutor(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr &attr) { + auto build = [&](const InterpolateExecutorDesc* desc) { + switch (desc->executorType) { +//#if defined(OPENVINO_ARCH_X86_64) +// case ExecutorType::x64: { +// auto builder = [&](const JitInterpolateExecutor::Key& key) -> InterpolateExecutorPtr { +// auto executor = desc->builder->makeExecutor(context); +// if (executor->init(interpolateAttrs, srcDescs, dstDescs, attr)) { +// return executor; +// } else { +// return nullptr; +// } +// }; +// +// auto key = JitInterpolateExecutor::Key(interpolateAttrs, srcDescs, dstDescs, attr); +// auto res = runtimeCache->getOrCreate(key, builder); +// return res.first; +// } break; +//#endif + default: { + auto executor = desc->builder->makeExecutor(context); + if (executor->init(interpolateAttrs, srcDescs, dstDescs, attr)) { + return executor; + } + } break; + } + + InterpolateExecutorPtr ptr = nullptr; + return ptr; + }; + + + if (chosenDesc) { + if (auto executor = build(chosenDesc)) { + return executor; + } + } + + for (const auto& sd : supportedDescs) { + if (auto executor = build(&sd)) { + chosenDesc = &sd; + return executor; + } + } + + IE_THROW() << "Supported executor is not found"; + } + +private: + std::vector supportedDescs; + const InterpolateExecutorDesc* chosenDesc = nullptr; +}; + +using InterpolateExecutorFactoryPtr 
= std::shared_ptr; +using InterpolateExecutorFactoryCPtr = std::shared_ptr; + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_interpolate.cpp b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_interpolate.cpp new file mode 100644 index 00000000000000..8dfdb54a171f87 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_interpolate.cpp @@ -0,0 +1,1707 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_interpolate.hpp" + +using namespace dnnl; +using namespace InferenceEngine; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu; +using namespace dnnl::impl::cpu::x64; +using namespace dnnl::impl::utils; +using namespace Xbyak; + +#define GET_OFF(field) offsetof(jit_interpolate_call_args, field) + +#include "ie_parallel.hpp" + +#include +#include +#include +#include +#include +#include "utils/bfloat16.hpp" +#include "emitters/x64/jit_bf16_emitters.hpp" +#include "emitters/x64/jit_load_store_emitters.hpp" + +namespace ov { +namespace intel_cpu { + +template +struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_interpolate_kernel_f32) + + explicit jit_uni_interpolate_kernel_f32(jit_interpolate_config_params jcp, const dnnl_primitive_attr &attr) + : jit_uni_interpolate_kernel(jcp, attr), jit_generator(jit_name()) {} + + void create_ker() override { + jit_generator::create_kernel(); + ker_ = (decltype(ker_))jit_ker(); + } + + void generate() override { + // dummy second reg_tmp_64 as no fill needed + load_pool_gpr_idxs = {static_cast(reg_tmp_64.getIdx()), static_cast(reg_tmp_64.getIdx())}; + store_pool_gpr_idxs = {static_cast(reg_tmp_64.getIdx())}; + store_pool_vec_idxs = {static_cast(vmm_zero.getIdx())}; + + const auto &p = attr_.post_ops_; + for (int i = 0; i < p.len(); i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors.push_back(std::make_shared>( + this, + post_op.eltwise.alg, + post_op.eltwise.alpha, + post_op.eltwise.beta, + 1.f)); + } else if (post_op.is_depthwise()) { + depthwise_injectors.push_back(std::make_shared>( + this, + post_op)); + } else if (post_op.is_quantization()) { + quantization_injectors.push_back(std::make_shared>( + this, post_op, vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); + } + } + + this->preamble(); + + if (attr_.post_ops_.len() != 0) { + mov(reg_post_ops_data, ptr[reg_params + GET_OFF(post_op_data)]); + mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]); + } + uni_vpxor(vmm_zero, vmm_zero, vmm_zero); + + switch (jcp_.mode) { + case InterpolateMode::nearest: { + mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); + mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); + mov(reg_index, ptr[reg_params + GET_OFF(index)]); + mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); + + switch (jcp_.layout) { + case InterpolateLayoutType::planar: { + nn_planar(); + break; + } + case InterpolateLayoutType::block: { + nn_blk(); + break; + } + case InterpolateLayoutType::by_channel: { + nn_by_channel(); + break; + } + default: + assert(!"unsupported memory layout for interpolate layer with nearest neighbor mode."); + } + break; + } + case InterpolateMode::linear_onnx: { + switch (jcp_.layout) { + case InterpolateLayoutType::planar: { + linear_onnx_planar(); + break; + } + case InterpolateLayoutType::block: + case InterpolateLayoutType::by_channel: { + 
linear_onnx_c_gathered(); + break; + } + default: + assert(!"unsupported memory layout for interpolate layer with linear_onnx mode."); + } + break; + } + case InterpolateMode::cubic: { + switch (jcp_.layout) { + case InterpolateLayoutType::planar: { + cubic_planar(); + break; + } + case InterpolateLayoutType::block: + case InterpolateLayoutType::by_channel: { + cubic_c_gathered(); + break; + } + default: + assert(!"unsupported memory layout for interpolate layer with cubic mode."); + } + break; + } + case InterpolateMode::linear: { + assert(!"unsupported mode for interpolate layer with JITTED implimentation."); + break; + } + default: { + assert(!"unsupported mode for interpolate layer."); + } + } + + this->postamble(); + + emit_emitters_data(); + for (auto& inj : eltwise_injectors) + inj->prepare_table(); + if ((jcp_.mode == InterpolateMode::cubic) && (jcp_.layout == InterpolateLayoutType::planar)) { + prepare_cubic_planar_table(); + } + } + +private: + using Vmm = typename conditional3::type; + + const int vlen = cpu_isa_traits::vlen; + const int vector_step = vlen / sizeof(float); + const int tail_step = jcp_.C % vector_step; + const int scalar_step = 1; + + Xbyak::Reg64 reg_src = r8; + Xbyak::Reg64 reg_src_aux = r15; + Xbyak::Reg64 reg_src_aux1 = r11; + Xbyak::Reg64 reg_src_aux2 = r12; + Xbyak::Reg64 reg_dst = r9; + Xbyak::Reg64 reg_work_amount = r13; + Xbyak::Reg64 reg_index = r14; + Xbyak::Reg64 reg_params = abi_param1; + + Reg8 reg_tmp_8 = r10b; + Reg32 reg_tmp_32 = r10d; + Reg64 reg_tmp_64 = r10; + + Xbyak::Reg64 reg_oc_off = rax; + Xbyak::Reg64 reg_post_ops_data = rbx; + Xbyak::Reg64 reg_d_weights = reg_tmp_64; + Xbyak::Reg64 reg_d_bias = rcx; + Xbyak::Reg32 reg_index_offset = edx; + + // for cubic planar + Xbyak::Reg64 reg_tbl_y = rsi; + Xbyak::Reg64 reg_tbl_x = rbp; + Xbyak::Reg64 reg_table = rdx; // do not need reg_index_offset in this mode, so use rdx + + Vmm vmm_val = Vmm(1); + Vmm vmm_index = Vmm(0); + Vmm vmm_zero = Vmm(2); + Vmm vmm_mask = Vmm(3); + Vmm vmm_d_weights = Vmm(4); + Vmm vmm_d_bias = Vmm(5); + + // for linear + Vmm vmm_weightT = Vmm(15); + Vmm vmm_weightB = Vmm(14); + Vmm vmm_weightL = Vmm(13); + Vmm vmm_weightR = Vmm(12); + Vmm vmm_weightF = Vmm(6); + Vmm vmm_weightE = Vmm(7); + Vmm vmm_valTL = Vmm(11); + Vmm vmm_valTR = vmm_val; + Vmm vmm_valBL = Vmm(9); + Vmm vmm_valBR = Vmm(8); + + // for cubic + Vmm vmm_src = Vmm(6); + Xmm xmm_src = Xmm(6); + Vmm vmm_dstX = Vmm(7); + + Vmm vmm_weightX0 = vmm_weightT; + Vmm vmm_weightX1 = vmm_weightB; + Vmm vmm_weightX2 = vmm_weightL; + Vmm vmm_weightX3 = vmm_weightR; + Vmm vmm_weightY0 = vmm_valTL; + Vmm vmm_weightY1 = Vmm(10); // vmm_valTR is vmm_val, need reserved + Vmm vmm_weightY2 = vmm_valBL; + Vmm vmm_weightY3 = vmm_valBR; + // cubic planar + Vmm vmm_one = vmm_index; + Vmm vmm_weightY = vmm_weightY0; + Vmm vmm_index_y_itr = vmm_weightY1; + Vmm vmm_index_x_itr = vmm_weightY2; + Vmm vmm_tbl_y = vmm_weightY3; + // temporally used. when post ops, value in vmm_d_weights and vmm_d_bias is re-loaded(init) each time. 
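For context (not part of the patch): the kernel splits each channel run into full SIMD-width chunks of vector_step = vlen / sizeof(float) and finishes the remaining jcp_.C % vector_step elements as a tail, which is what the vector_step / tail_step / scalar_step constants above encode. A minimal scalar C++ sketch of that main-loop/tail decomposition, with hypothetical names, is:

#include <cstddef>

// Illustrative only: process elements in chunks of vector_step (the JIT emits
// one vectorized body per chunk), then handle the C % vector_step remainder.
void process_channels(const float* src, float* dst, size_t C, size_t vector_step) {
    size_t c = 0;
    for (; c + vector_step <= C; c += vector_step) {   // main vector loop
        for (size_t i = 0; i < vector_step; ++i)
            dst[c + i] = src[c + i];
    }
    for (; c < C; ++c)                                  // tail (tail_step in the kernel)
        dst[c] = src[c];
}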
+ Vmm vmm_index_in_y = vmm_d_weights; + Vmm vmm_index_in_x = vmm_d_bias; + + Xbyak::Label l_table_constant; + Opmask k_mask = Xbyak::Opmask(1); + + std::unordered_map> emitters; + + std::vector store_pool_gpr_idxs; + std::vector store_pool_vec_idxs; + std::vector load_pool_gpr_idxs; + + std::vector>> eltwise_injectors; + std::vector>> depthwise_injectors; + std::vector>> quantization_injectors; + + void emit_emitters_data() { + for (const auto& emitter : emitters) { + if (emitter.second) + emitter.second->emit_data(); + } + } + + inline void load(Xbyak::Reg64 reg_src, Vmm vmm_src, const int elt_num, const int offset = 0) { + emit_load(reg_src, vmm_src, jcp_.src_prc, Precision::FP32, elt_num, offset); + } + + inline void load_weights(Xbyak::Reg64 reg_src, Vmm vmm_src, const int elt_num, const int offset = 0) { + emit_load(reg_src, vmm_src, Precision::FP32, Precision::FP32, elt_num, offset); + } + + inline void emit_load(Xbyak::Reg64 reg_src, Vmm vmm_src, Precision src_prc, Precision dst_prc, const int elt_num, const int offset = 0) { + const auto seed = load_emitter_params(src_prc, dst_prc, elt_num).hash(); + if (!emitters[seed]) { + emitters[seed].reset(new jit_load_emitter(this, isa, src_prc, dst_prc, elt_num)); + } + + emitters[seed]->emit_code({static_cast(reg_src.getIdx()), static_cast(offset)}, + {static_cast(vmm_src.getIdx())}, {}, {load_pool_gpr_idxs}); + } + + inline void store(Vmm vmm_dst, Xbyak::Reg64 reg_dst, const int elt_num, const int offset = 0) { + const auto seed = store_emitter_params(Precision::FP32, jcp_.dst_prc, elt_num).hash(); + if (!emitters[seed]) { + emitters[seed].reset(new jit_store_emitter(this, isa, Precision::FP32, jcp_.dst_prc, elt_num)); + } + + // for cases when Store emitter need 2 aux vmm we can use vmm_dst as second aux vmm + std::vector local_store_pool_vec_idxs = { static_cast(vmm_dst.getIdx()) }; + local_store_pool_vec_idxs.insert(local_store_pool_vec_idxs.begin(), store_pool_vec_idxs.begin(), store_pool_vec_idxs.end()); + + emitters[seed]->emit_code({static_cast(vmm_dst.getIdx()), static_cast(offset)}, + {static_cast(reg_dst.getIdx())}, + {local_store_pool_vec_idxs}, {store_pool_gpr_idxs}); + } + + void nn_planar() { + Xbyak::Reg64 reg_index_h = reg_src_aux1; + Xbyak::Reg64 reg_index_w = reg_src_aux2; + mov(reg_index_h, reg_index); + // reg_index represent reg_index_w + add(reg_index, jcp_.OH * jcp_.indices_size); + // bk for reset to reg_index_w + mov(reg_index_w, reg_index); + + Xbyak::Label out_loop_label; + Xbyak::Label out_loop_end; + + Xbyak::Reg64 reg_work_amount_oh = rdi; + mov(reg_work_amount_oh, jcp_.OH); + L(out_loop_label); + { + // outloop status + cmp(reg_work_amount_oh, 1); + jl(out_loop_end, T_NEAR); + + //reset work_amount to OW + mov(reg_work_amount, jcp_.OW); + + Xbyak::Reg64 reg_src_h = rsi; + mov(reg_src_h, reg_src); + // index_h * IW * dataSize done when built to avoid redundent compute + mov(reg_index_offset, dword[reg_index_h]); + add(reg_src_h, reg_index_offset); // reg_src_h now point to begin of row + + // reset index_w, index_w * dataSize done when built to avoid redundent compute + mov(reg_index, reg_index_w); + + Xbyak::Label nn_loop_label; + Xbyak::Label nn_loop_end_label; + Xbyak::Label nn_tail_loop_label; + Xbyak::Label nn_tail_loop_end_label; + + L(nn_loop_label); // inner loop + { + cmp(reg_work_amount, vector_step); + jl(nn_loop_end_label, T_NEAR); + + uni_vmovdqu(vmm_index, ptr[reg_index]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_val, ptr[reg_src_h + vmm_index], vmm_mask); + if 
(attr_.post_ops_.len() != 0) + apply_post_ops(jcp_.dst_prc, 1); + store(vmm_val, reg_dst, vector_step); + + add(reg_dst, vector_step * jcp_.dst_data_size); + add(reg_index, vector_step * jcp_.indices_size); + sub(reg_work_amount, vector_step); + + jmp(nn_loop_label, T_NEAR); + } + L(nn_loop_end_label); + + L(nn_tail_loop_label); + { + cmp(reg_work_amount, 1); + jl(nn_tail_loop_end_label, T_NEAR); + + mov(reg_src_aux, reg_src_h); + mov(reg_index_offset, dword[reg_index]); + add(reg_src_aux, reg_index_offset); + + load(reg_src_aux, vmm_val, scalar_step); + if (attr_.post_ops_.len() != 0) + apply_post_ops(jcp_.dst_prc, 1); + store(vmm_val, reg_dst, scalar_step); + + add(reg_dst, scalar_step * jcp_.dst_data_size); + add(reg_index, scalar_step * jcp_.indices_size); + sub(reg_work_amount, scalar_step); + + jmp(nn_tail_loop_label, T_NEAR); + } + L(nn_tail_loop_end_label); // inner loop end + + //increment index_h to next row + add(reg_index_h, jcp_.indices_size); + + sub(reg_work_amount_oh, 1); + jmp(out_loop_label, T_NEAR); + } + L(out_loop_end); + } + + void nn_blk() { + Xbyak::Label nn_loop_label; + Xbyak::Label nn_loop_end_label; + L(nn_loop_label); + { + cmp(reg_work_amount, 0); + jle(nn_loop_end_label, T_NEAR); + + mov(reg_src_aux, reg_src); + mov(reg_index_offset, dword[reg_index]); + add(reg_src_aux, reg_index_offset); + + load(reg_src_aux, vmm_val, vector_step); + if (attr_.post_ops_.len() != 0) + apply_post_ops(jcp_.dst_prc, 0); + store(vmm_val, reg_dst, vector_step); + add(reg_dst, vector_step * jcp_.dst_data_size); + + if (isa == cpu::x64::sse41) { + add(reg_src_aux, vector_step * jcp_.src_data_size); + load(reg_src_aux, vmm_val, vector_step); + if (attr_.post_ops_.len() != 0) { + add(reg_oc_off, vector_step * sizeof(float)); + apply_post_ops(jcp_.dst_prc, 0); + sub(reg_oc_off, vector_step * sizeof(float)); + } + store(vmm_val, reg_dst, vector_step); + add(reg_dst, vector_step * jcp_.dst_data_size); + } + + add(reg_index, jcp_.indices_size); + sub(reg_work_amount, 1); + + jmp(nn_loop_label, T_NEAR); + } + L(nn_loop_end_label); + } + + void nn_by_channel() { + // kernel for C * OW + Xbyak::Label out_loop_label; + Xbyak::Label out_loop_end; + + Xbyak::Reg64 reg_work_amount_bk = reg_src_aux2; + Xbyak::Reg64 reg_oc_off_bk = rsi; + mov(reg_work_amount_bk, ptr[reg_params + GET_OFF(work_amount)]); + if (attr_.post_ops_.len() != 0) { + mov(reg_oc_off_bk, ptr[reg_params + GET_OFF(oc_off)]); + } + + Xbyak::Reg64 reg_work_amount_out = reg_src_aux1; + mov(reg_work_amount_out, jcp_.OW); + L(out_loop_label); + { + cmp(reg_work_amount_out, 1); + jl(out_loop_end, T_NEAR); + + //inner loop for C + Xbyak::Label nn_loop_label; + Xbyak::Label nn_loop_end_label; + Xbyak::Label nn_tail_loop_label; + Xbyak::Label nn_tail_loop_end_label; + + // inner loop for C + // get current loop address reg_src_aux, from reg_src which is unchange, point this C * OW. + // reset offset and work_amount. + // dst and index address is continous, advanced each interator. 
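As a reference for what the generated nn_by_channel code computes (a hedged sketch, not the patch's implementation): for each output column the kernel copies all C channels from the source column selected by a precomputed width index, where the index already holds a byte offset as noted in the comment above. In plain C++ that is roughly:

#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical scalar equivalent for one (d, h) plane in by_channel layout.
// index_w[ow] holds the byte offset of the chosen source column
// (index * C * dataSize precomputed at table-build time).
void nearest_by_channel_row(const uint8_t* src_row, uint8_t* dst_row,
                            const std::vector<int>& index_w,
                            size_t OW, size_t C, size_t dataSize) {
    for (size_t ow = 0; ow < OW; ++ow) {
        const uint8_t* src_col = src_row + index_w[ow];   // precomputed byte offset
        uint8_t* dst_col = dst_row + ow * C * dataSize;   // contiguous destination
        std::memcpy(dst_col, src_col, C * dataSize);      // copy all channels
    }
}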
+ mov(reg_src_aux, reg_src); + // index*C*dataSize done when built to avoid redundent compute + mov(reg_index_offset, dword[reg_index]); + add(reg_src_aux, reg_index_offset); + + mov(reg_work_amount, reg_work_amount_bk); + if (attr_.post_ops_.len() != 0) + mov(reg_oc_off, reg_oc_off_bk); + + L(nn_loop_label); + { + cmp(reg_work_amount, vector_step); + jl(nn_loop_end_label, T_NEAR); + + load(reg_src_aux, vmm_val, vector_step); + if (attr_.post_ops_.len() != 0) + apply_post_ops(jcp_.dst_prc, 0); + store(vmm_val, reg_dst, vector_step); + + add(reg_dst, vector_step * jcp_.dst_data_size); + add(reg_src_aux, vector_step * jcp_.src_data_size); + add(reg_oc_off, vector_step * sizeof(float)); + sub(reg_work_amount, vector_step); + + jmp(nn_loop_label, T_NEAR); + } + L(nn_loop_end_label); + + if (tail_step != 0) { + load(reg_src_aux, vmm_val, tail_step); + if (attr_.post_ops_.len() != 0) + apply_post_ops(jcp_.dst_prc, 0); + store(vmm_val, reg_dst, tail_step); + + // check to remove below + add(reg_dst, tail_step * jcp_.dst_data_size); + add(reg_src_aux, tail_step * jcp_.src_data_size); + add(reg_oc_off, tail_step * sizeof(float)); + sub(reg_work_amount, tail_step); + } + add(reg_index, jcp_.indices_size); + sub(reg_work_amount_out, 1); + jmp(out_loop_label, T_NEAR); + } + L(out_loop_end); + } + + void linear_onnx_c_gathered() { + mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); + // load weight + mov(reg_src, ptr[reg_params + GET_OFF(weight_ptr[0])]); + mov(reg_src_aux, ptr[reg_params + GET_OFF(weight_ptr[0]) + sizeof(size_t)]); + uni_vbroadcastss(vmm_weightL, ptr[reg_src]); + uni_vbroadcastss(vmm_weightR, ptr[reg_src_aux]); + if (jcp_.spatial_dim_size > 1) { + mov(reg_src_aux1, ptr[reg_params + GET_OFF(weight_ptr[0]) + 2 * sizeof(size_t)]); + mov(reg_src_aux2, ptr[reg_params + GET_OFF(weight_ptr[0]) + 3 * sizeof(size_t)]); + uni_vbroadcastss(vmm_weightT, ptr[reg_src_aux1]); + uni_vbroadcastss(vmm_weightB, ptr[reg_src_aux2]); + } + if (jcp_.spatial_dim_size > 2) { + mov(reg_src, ptr[reg_params + GET_OFF(weight_ptr[0]) + 4 * sizeof(size_t)]); + mov(reg_src_aux, ptr[reg_params + GET_OFF(weight_ptr[0]) + 5 * sizeof(size_t)]); + uni_vbroadcastss(vmm_weightF, ptr[reg_src]); + uni_vbroadcastss(vmm_weightE, ptr[reg_src_aux]); + } + // load src + mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); + mov(reg_src_aux, ptr[reg_params + GET_OFF(src_ptr[0]) + sizeof(size_t)]); + if (jcp_.spatial_dim_size > 1) { + mov(reg_src_aux1, ptr[reg_params + GET_OFF(src_ptr[0]) + 2 * sizeof(size_t)]); + mov(reg_src_aux2, ptr[reg_params + GET_OFF(src_ptr[0]) + 3 * sizeof(size_t)]); + } + Xbyak::Reg64 reg_src_aux4 = r14; + Xbyak::Reg64 reg_src_aux5 = rdx; + Xbyak::Reg64 reg_src_aux6 = rsi; + Xbyak::Reg64 reg_src_aux7 = rbp; + if (jcp_.spatial_dim_size > 2) { + mov(reg_src_aux4, ptr[reg_params + GET_OFF(src_ptr[0]) + 4 * sizeof(size_t)]); + mov(reg_src_aux5, ptr[reg_params + GET_OFF(src_ptr[0]) + 5 * sizeof(size_t)]); + mov(reg_src_aux6, ptr[reg_params + GET_OFF(src_ptr[0]) + 6 * sizeof(size_t)]); + mov(reg_src_aux7, ptr[reg_params + GET_OFF(src_ptr[0]) + 7 * sizeof(size_t)]); + } + mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); + + int blk = (isa == cpu::x64::sse41) ? (2 * vector_step) : vector_step; + int dst_stride = (jcp_.layout == InterpolateLayoutType::by_channel) ? (vector_step * jcp_.dst_data_size) : + (blk * jcp_.OW * jcp_.OH * jcp_.OD * jcp_.dst_data_size); + int src_stride = (jcp_.layout == InterpolateLayoutType::by_channel) ? 
(vector_step * jcp_.src_data_size) : + (blk * jcp_.IW * jcp_.IH * jcp_.ID * jcp_.src_data_size); + + Xbyak::Label main_loop_label; + Xbyak::Label main_loop_end_label; + Xbyak::Label blk_tail_loop_label; + Xbyak::Label blk_tail_loop_end_label; + Xbyak::Label tail_loop_label; + Xbyak::Label tail_loop_end_label; + L(main_loop_label); + { + if (jcp_.layout == InterpolateLayoutType::by_channel) { + cmp(reg_work_amount, vector_step); + jl(main_loop_end_label, T_NEAR); + } else { + cmp(reg_work_amount, 1); + jl(main_loop_end_label, T_NEAR); + } + // progressive manner + load(reg_src, vmm_valTL, vector_step); + load(reg_src_aux, vmm_valTR, vector_step); + if (jcp_.spatial_dim_size == 1) { + linear_onnx_worker_1d(); + } + if (jcp_.spatial_dim_size > 1) { + load(reg_src_aux1, vmm_valBL, vector_step); + load(reg_src_aux2, vmm_valBR, vector_step); + linear_onnx_worker_2d(); + } + if (jcp_.spatial_dim_size > 2) { + uni_vmovups(vmm_d_bias, vmm_valTR); // temporally save front result to temp_vmm + load(reg_src_aux4, vmm_valTL, vector_step); + load(reg_src_aux5, vmm_valTR, vector_step); + load(reg_src_aux6, vmm_valBL, vector_step); + load(reg_src_aux7, vmm_valBR, vector_step); + + // 2d for end depth + linear_onnx_worker_2d(); + // 3th dimension + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + } + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, false); // vmm_val is vmm_valTR + add(reg_oc_off, vector_step * sizeof(float)); + } + store(vmm_valTR, reg_dst, vector_step); + + if ((isa == cpu::x64::sse41) && (jcp_.layout == InterpolateLayoutType::block)) { + int offset_src = vector_step * jcp_.src_data_size; + load(reg_src, vmm_valTL, vector_step, offset_src); + load(reg_src_aux, vmm_valTR, vector_step, offset_src); + if (jcp_.spatial_dim_size == 1) { + linear_onnx_worker_1d(); + } + if (jcp_.spatial_dim_size > 1) { + load(reg_src_aux1, vmm_valBL, vector_step, offset_src); + load(reg_src_aux2, vmm_valBR, vector_step, offset_src); + linear_onnx_worker_2d(); + } + if (jcp_.spatial_dim_size > 2) { + uni_vmovups(vmm_d_bias, vmm_valTR); // temporally save front result to temp_vmm + load(reg_src_aux4, vmm_valTL, vector_step, offset_src); + load(reg_src_aux5, vmm_valTR, vector_step, offset_src); + load(reg_src_aux6, vmm_valBL, vector_step, offset_src); + load(reg_src_aux7, vmm_valBR, vector_step, offset_src); + // 2d for end depth + linear_onnx_worker_2d(); + // 3th dimension + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + } + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, false); + add(reg_oc_off, vector_step * sizeof(float)); + } + int offset_dst = vector_step * jcp_.dst_data_size; + store(vmm_valTR, reg_dst, vector_step, offset_dst); + } + add(reg_dst, dst_stride); + add(reg_src, src_stride); + add(reg_src_aux, src_stride); + if (jcp_.spatial_dim_size > 1) { + add(reg_src_aux1, src_stride); + add(reg_src_aux2, src_stride); + } + if (jcp_.spatial_dim_size > 2) { + add(reg_src_aux4, src_stride); + add(reg_src_aux5, src_stride); + add(reg_src_aux6, src_stride); + add(reg_src_aux7, src_stride); + } + if (jcp_.layout == InterpolateLayoutType::by_channel) { + sub(reg_work_amount, vector_step); // work_amount is c + } else { + sub(reg_work_amount, 1); // work_amount = div_up(c, blk), no tails + } + + 
jmp(main_loop_label, T_NEAR); + } + L(main_loop_end_label); + + if ((jcp_.layout == InterpolateLayoutType::by_channel) && (tail_step != 0)) { + load(reg_src, vmm_valTL, tail_step); + load(reg_src_aux, vmm_valTR, tail_step); + if (jcp_.spatial_dim_size == 1) { + linear_onnx_worker_1d(); + } + if (jcp_.spatial_dim_size > 1) { + load(reg_src_aux1, vmm_valBL, tail_step); + load(reg_src_aux2, vmm_valBR, tail_step); + linear_onnx_worker_2d(); + } + if (jcp_.spatial_dim_size > 2) { + uni_vmovups(vmm_d_bias, vmm_valTR); // temporally save front result to temp_vmm + + load(reg_src_aux4, vmm_valTL, tail_step); + load(reg_src_aux5, vmm_valTR, tail_step); + load(reg_src_aux6, vmm_valBL, tail_step); + load(reg_src_aux7, vmm_valBR, tail_step); + // 2d for end depth + linear_onnx_worker_2d(); + // 3th dimension + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + } + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, false); // vmm_val is vmm_valTR + add(reg_oc_off, tail_step * sizeof(float)); + } + + store(vmm_valTR, reg_dst, tail_step); + } + } + + void linear_onnx_planar() { + mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); + mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); + mov(reg_index, ptr[reg_params + GET_OFF(index)]); + mov(reg_src_aux, ptr[reg_params + GET_OFF(weight_ptr[0])]); + mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); + + int index_stride = jcp_.OW * jcp_.OH * jcp_.OD * jcp_.indices_size; + int weight_stride = jcp_.OW * jcp_.OH * jcp_.OD * sizeof(float); + + Xbyak::Label main_loop_label; + Xbyak::Label main_loop_end_label; + Xbyak::Label tail_loop_label; + Xbyak::Label tail_loop_end_label; + L(main_loop_label); + { + cmp(reg_work_amount, vector_step); + jl(main_loop_end_label, T_NEAR); + + uni_vmovdqu(vmm_index, ptr[reg_index]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_valTL, ptr[reg_src + vmm_index], vmm_mask); + + uni_vmovdqu(vmm_index, ptr[reg_index + index_stride]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_valTR, ptr[reg_src + vmm_index], vmm_mask); + + load_weights(reg_src_aux, vmm_weightL, vector_step); + load_weights(reg_src_aux, vmm_weightR, vector_step, weight_stride); + + // progressive manner + if (jcp_.spatial_dim_size == 1) { + linear_onnx_worker_1d(); + } + if (jcp_.spatial_dim_size > 1) { + uni_vmovdqu(vmm_index, ptr[reg_index + 2 * index_stride]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_valBL, ptr[reg_src + vmm_index], vmm_mask); + + uni_vmovdqu(vmm_index, ptr[reg_index + 3 * index_stride]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_valBR, ptr[reg_src + vmm_index], vmm_mask); + + load_weights(reg_src_aux, vmm_weightT, vector_step, 2 * weight_stride); + load_weights(reg_src_aux, vmm_weightB, vector_step, 3 * weight_stride); + + linear_onnx_worker_2d(); + } + if (jcp_.spatial_dim_size > 2) { + uni_vmovups(vmm_d_bias, vmm_valTR); // temporally save front result to temp_vmm + + // for end depth + uni_vmovdqu(vmm_index, ptr[reg_index + 4 * index_stride]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_valTL, ptr[reg_src + vmm_index], vmm_mask); + + uni_vmovdqu(vmm_index, ptr[reg_index + 5 * index_stride]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_valTR, ptr[reg_src + vmm_index], vmm_mask); + + uni_vmovdqu(vmm_index, ptr[reg_index + 6 * index_stride]); + uni_vpcmpeqd(vmm_mask, vmm_mask, 
vmm_mask); + vgatherdps(vmm_valBL, ptr[reg_src + vmm_index], vmm_mask); + + uni_vmovdqu(vmm_index, ptr[reg_index + 7 * index_stride]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_valBR, ptr[reg_src + vmm_index], vmm_mask); + + linear_onnx_worker_2d(); + + load_weights(reg_src_aux, vmm_weightE, vector_step, 5 * weight_stride); + load_weights(reg_src_aux, vmm_weightF, vector_step, 4 * weight_stride); + + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + } + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, true); // vmm_val is vmm_valTR, broadcase is true + } + store(vmm_valTR, reg_dst, vector_step); + + add(reg_dst, vector_step * jcp_.dst_data_size); + add(reg_src_aux, vector_step * sizeof(float)); + add(reg_index, vector_step * jcp_.indices_size); + sub(reg_work_amount, vector_step); + + jmp(main_loop_label, T_NEAR); + } + L(main_loop_end_label); + + L(tail_loop_label); + { + cmp(reg_work_amount, 1); + jl(tail_loop_end_label, T_NEAR); + + mov(reg_src_aux1, reg_src); + mov(reg_index_offset, dword[reg_index]); + add(reg_src_aux1, reg_index_offset); + load(reg_src_aux1, vmm_valTL, scalar_step); + + mov(reg_src_aux1, reg_src); + mov(reg_index_offset, dword[reg_index + index_stride]); + add(reg_src_aux1, reg_index_offset); + load(reg_src_aux1, vmm_valTR, scalar_step); + + load_weights(reg_src_aux, vmm_weightL, scalar_step, 0); + load_weights(reg_src_aux, vmm_weightR, scalar_step, weight_stride); + + if (jcp_.spatial_dim_size == 1) { + linear_onnx_worker_1d(); + } + if (jcp_.spatial_dim_size > 1) { + mov(reg_src_aux1, reg_src); + mov(reg_index_offset, dword[reg_index + 2 * index_stride]); + add(reg_src_aux1, reg_index_offset); + load(reg_src_aux1, vmm_valBL, scalar_step); + + mov(reg_src_aux1, reg_src); + mov(reg_index_offset, dword[reg_index + 3 * index_stride]); + add(reg_src_aux1, reg_index_offset); + load(reg_src_aux1, vmm_valBR, scalar_step); + + load_weights(reg_src_aux, vmm_weightT, scalar_step, 2 * weight_stride); + load_weights(reg_src_aux, vmm_weightB, scalar_step, 3 * weight_stride); + + linear_onnx_worker_2d(); + } + if (jcp_.spatial_dim_size > 2) { + uni_vmovups(vmm_d_bias, vmm_valTR); // save from front result to temp_vmm + + // for end depth + mov(reg_src_aux1, reg_src); + mov(reg_index_offset, dword[reg_index + 4 * index_stride]); + add(reg_src_aux1, reg_index_offset); + load(reg_src_aux1, vmm_valTL, scalar_step); + + mov(reg_src_aux1, reg_src); + mov(reg_index_offset, dword[reg_index + 5 * index_stride]); + add(reg_src_aux1, reg_index_offset); + load(reg_src_aux1, vmm_valTR, scalar_step); + + mov(reg_src_aux1, reg_src); + mov(reg_index_offset, dword[reg_index + 6 * index_stride]); + add(reg_src_aux1, reg_index_offset); + load(reg_src_aux1, vmm_valBL, scalar_step); + + mov(reg_src_aux1, reg_src); + mov(reg_index_offset, dword[reg_index + 7 * index_stride]); + add(reg_src_aux1, reg_index_offset); + load(reg_src_aux1, vmm_valBR, scalar_step); + + linear_onnx_worker_2d(); + + load_weights(reg_src_aux, vmm_weightE, scalar_step, 5 * weight_stride); + load_weights(reg_src_aux, vmm_weightF, scalar_step, 4 * weight_stride); + + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + } + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, true); // process on vmm_val, vmm_val is 
vmm_valTR, and bc + } + store(vmm_valTR, reg_dst, scalar_step); + + add(reg_dst, scalar_step * jcp_.dst_data_size); + add(reg_src_aux, scalar_step * sizeof(float)); + add(reg_index, scalar_step * jcp_.indices_size); + sub(reg_work_amount, scalar_step); + + jmp(tail_loop_label, T_NEAR); + } + L(tail_loop_end_label); + } + + inline void linear_onnx_worker_1d() { + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightR); + uni_vfmadd231ps(vmm_valTR, vmm_valTL, vmm_weightL); + } + + // weightT * (srcTL * weightL + srcTR * weightR) + + // weightB * (srcBL * weightL + srcBR * weightR) + inline void linear_onnx_worker_2d() { + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightR); + uni_vmulps(vmm_valBR, vmm_valBR, vmm_weightR); + uni_vfmadd231ps(vmm_valTR, vmm_valTL, vmm_weightL); + uni_vfmadd231ps(vmm_valBR, vmm_valBL, vmm_weightL); + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightT); + uni_vfmadd231ps(vmm_valTR, vmm_valBR, vmm_weightB); + } + + void cubic_c_gathered() { + mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); + mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); + mov(reg_index, ptr[reg_params + GET_OFF(index)]); + mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); + + // weight_ptr[0] point to weightX + mov(reg_src_aux1, ptr[reg_params + GET_OFF(weight_ptr[0])]); + uni_vbroadcastss(vmm_weightX0, ptr[reg_src_aux1]); + uni_vbroadcastss(vmm_weightX1, ptr[reg_src_aux1 + 1 * sizeof(float)]); + uni_vbroadcastss(vmm_weightX2, ptr[reg_src_aux1 + 2 * sizeof(float)]); + uni_vbroadcastss(vmm_weightX3, ptr[reg_src_aux1 + 3 * sizeof(float)]); + + // weight_ptr[1] point to weightY + mov(reg_src_aux1, ptr[reg_params + GET_OFF(weight_ptr[0]) + sizeof(size_t)]); + uni_vbroadcastss(vmm_weightY0, ptr[reg_src_aux1]); + uni_vbroadcastss(vmm_weightY1, ptr[reg_src_aux1 + 1 * sizeof(float)]); + uni_vbroadcastss(vmm_weightY2, ptr[reg_src_aux1 + 2 * sizeof(float)]); + uni_vbroadcastss(vmm_weightY3, ptr[reg_src_aux1 + 3 * sizeof(float)]); + + int blk = (isa == cpu::x64::sse41) ? 
(2 * vector_step) : vector_step; + + Xbyak::Label main_loop_label; + Xbyak::Label main_loop_end_label; + Xbyak::Label tail_loop_label; + Xbyak::Label tail_loop_end_label; + L(main_loop_label); + { + if (jcp_.layout == InterpolateLayoutType::by_channel) { + cmp(reg_work_amount, vector_step); + jl(main_loop_end_label, T_NEAR); + } else { + cmp(reg_work_amount, 1); + jl(tail_loop_end_label, T_NEAR); + } + + uni_vpxor(vmm_val, vmm_val, vmm_val); + + cubic_c_gathered_matrix(false); + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value to post_ops and store + add(reg_oc_off, vector_step * sizeof(float)); + } + store(vmm_val, reg_dst, vector_step); + + if ((isa == cpu::x64::sse41) && (jcp_.layout == InterpolateLayoutType::block)) { + // vmm is xmm here + add(reg_src, vector_step * jcp_.src_data_size); + add(reg_dst, vector_step * jcp_.dst_data_size); + + uni_vpxor(vmm_val, vmm_val, vmm_val); + + cubic_c_gathered_matrix(false); + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, false); + add(reg_oc_off, vector_step * sizeof(float)); // second vector_step for one blk + } + store(vmm_val, reg_dst, vector_step); + + sub(reg_src, vector_step * jcp_.src_data_size); + sub(reg_dst, vector_step * jcp_.dst_data_size); + } + if (jcp_.layout == InterpolateLayoutType::by_channel) { + int dst_stride = vector_step * jcp_.dst_data_size; + int src_stride = vector_step * jcp_.src_data_size; + add(reg_dst, dst_stride); + add(reg_src, src_stride); + sub(reg_work_amount, vector_step); // work_amount is c + } else { + int dst_stride = blk * jcp_.OW * jcp_.OH * jcp_.dst_data_size; + int src_stride = blk * jcp_.IW * jcp_.IH * jcp_.src_data_size; + add(reg_dst, dst_stride); + add(reg_src, src_stride); + sub(reg_work_amount, 1); // work_amount = div_up(c, blk), no tails + } + + jmp(main_loop_label, T_NEAR); + } + L(main_loop_end_label); + + // only for by_channel layout for tails. 
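For readers following the cubic path: the cubic_c_gathered_matrix / _line / _pixel helpers referenced here accumulate a 4x4 neighborhood weighted by the per-axis cubic coefficients. A minimal single-channel scalar sketch of that accumulation (hypothetical names, assuming the 16 gathered offsets are laid out row-major) is:

// out = sum_y wY[y] * (sum_x wX[x] * src[idx[y*4 + x]])
float cubic_gathered_point(const float* src, const int idx[16],
                           const float wX[4], const float wY[4]) {
    float acc = 0.f;
    for (int y = 0; y < 4; ++y) {
        float line = 0.f;
        for (int x = 0; x < 4; ++x)
            line += wX[x] * src[idx[y * 4 + x]];  // cubic_c_gathered_pixel
        acc += wY[y] * line;                      // cubic_c_gathered_line
    }
    return acc;
}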
+ L(tail_loop_label); + { + cmp(reg_work_amount, 1); + jl(tail_loop_end_label, T_NEAR); + + // store final computed value + uni_vpxor(vmm_val, vmm_val, vmm_val); + + cubic_c_gathered_matrix(true); + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value + add(reg_oc_off, scalar_step * sizeof(float)); + } + store(vmm_val, reg_dst, scalar_step); + + int dst_stride = scalar_step * jcp_.dst_data_size; + int src_stride = scalar_step * jcp_.src_data_size; + add(reg_dst, dst_stride); + add(reg_src, src_stride); + sub(reg_work_amount, scalar_step); // work_amount is c + + jmp(tail_loop_label, T_NEAR); + } + L(tail_loop_end_label); + } + + inline void cubic_c_gathered_matrix(bool is_scalar) { + // y0: (x0 * weightX0 + x1 * weightX1 + x2 * weightX2 + x3 * weightX3) * weightY0 + cubic_c_gathered_line(0, vmm_weightY0, is_scalar); + // y1 + cubic_c_gathered_line(4, vmm_weightY1, is_scalar); + // y2 + cubic_c_gathered_line(8, vmm_weightY2, is_scalar); + // y3 + cubic_c_gathered_line(12, vmm_weightY3, is_scalar); + } + + inline void cubic_c_gathered_line(int index_start, Vmm vmm_weight, bool is_scalar) { + uni_vpxor(vmm_dstX, vmm_dstX, vmm_dstX); + cubic_c_gathered_pixel(index_start, vmm_weightX0, is_scalar); + cubic_c_gathered_pixel(index_start + 1, vmm_weightX1, is_scalar); + cubic_c_gathered_pixel(index_start + 2, vmm_weightX2, is_scalar); + cubic_c_gathered_pixel(index_start + 3, vmm_weightX3, is_scalar); + uni_vfmadd231ps(vmm_val, vmm_dstX, vmm_weight); + } + + inline void cubic_c_gathered_pixel(int i, Vmm vmm_weight, bool is_scalar) { + mov(reg_src_aux, reg_src); + mov(reg_index_offset, dword[reg_index + i * jcp_.indices_size]); + add(reg_src_aux, reg_index_offset); + int step = is_scalar ? 1 : vlen / sizeof(float); + load(reg_src_aux, vmm_src, step); + uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weight); + } + + void cubic_planar() { + mov(reg_table, l_table_constant); + // src_ptr[2] for oh sequence, src_ptr[3] for ow sequence + mov(reg_tbl_y, ptr[reg_params + GET_OFF(src_ptr[0]) + 2 * sizeof(size_t)]); + mov(reg_tbl_x, ptr[reg_params + GET_OFF(src_ptr[0]) + 3 * sizeof(size_t)]); + uni_vmovdqu(vmm_one, cubic_planar_table_val(0)); + uni_vpxor(vmm_zero, vmm_zero, vmm_zero); + + mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); + mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); + // index_OW + mov(reg_index, ptr[reg_params + GET_OFF(index)]); + // index_OH from src_ptr[1] + Xbyak::Reg64 reg_index_y = reg_src_aux; + mov(reg_index_y, ptr[reg_params + GET_OFF(src_ptr[0]) + sizeof(size_t)]); + // weight_OW + Xbyak::Reg64 reg_weight_x = reg_src_aux1; + mov(reg_weight_x, ptr[reg_params + GET_OFF(weight_ptr[0])]); + // weight_OH + Xbyak::Reg64 reg_weight_y = reg_src_aux2; + mov(reg_weight_y, ptr[reg_params + GET_OFF(weight_ptr[0]) + sizeof(size_t)]); + mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); + + int grid_len = 4; + + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 + // for 3th step(8): 16 17 18 19 20 21 22 23 + // y: 0 0 0 0 1 1 1 1 + // x: 16 17 18 19 0 1 2 3 + + Xbyak::Label main_loop_label; + Xbyak::Label main_loop_end_label; + Xbyak::Label tail_loop_label; + Xbyak::Label tail_loop_end_label; + L(main_loop_label); + { + cmp(reg_work_amount, vector_step); + jl(main_loop_end_label, T_NEAR); + + // vmm_tbl_y: (0 0 0 0 1 1 1 1 * index_size) --> (0 0 0 0 4 4 4 4) + uni_vmovdqu(vmm_tbl_y, ptr[reg_tbl_y]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + // vmm_index_in_y: 0 
0 0 0 2 2 2 2 + vpgatherdd(vmm_index_in_y, ptr[reg_index_y + vmm_tbl_y], vmm_mask); + + // use vmm_val temporally for value in reg_tbl_x: 16 17 18 19 0 1 2 3 + uni_vmovdqu(vmm_val, ptr[reg_tbl_x]); + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + // e.g. vmm_index_in_x: 32 34 36 38 0 2 4 6, now save src index. + vpgatherdd(vmm_index_in_x, ptr[reg_index + vmm_val], vmm_mask); + + // build weightX used in y0-y3 + // weight format: w0_0 w1_0 w2_0 w3_0 w0_1 w1_1 w2_1 w3_1 ... + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_weightX0, ptr[reg_weight_x + vmm_val * grid_len], vmm_mask); // 4 in vmm_val for weight_size, another 4 for grid_len + + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + // shift weight_size then gather second weight + vgatherdps(vmm_weightX1, ptr[reg_weight_x + sizeof(float) + (vmm_val * grid_len)], vmm_mask); + + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_weightX2, ptr[reg_weight_x + 2 * sizeof(float) + (vmm_val * grid_len)], vmm_mask); + + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_weightX3, ptr[reg_weight_x + 3 * sizeof(float) + (vmm_val * grid_len)], vmm_mask); + // vmm_val is now relieved and used for dst_value + + uni_vpxor(vmm_val, vmm_val, vmm_val); + // y0 + vpsubd(vmm_index_y_itr, vmm_index_in_y, vmm_one); + // crop to [0, IH - 1] + vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); + vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); + + // weight y0 + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_weightY, ptr[reg_weight_y + (vmm_tbl_y * grid_len)], vmm_mask); + cubic_planar_line(false); + + // y1 + // crop to [0, IH - 1] + vpminsd(vmm_index_y_itr, vmm_index_in_y, cubic_planar_table_val(1)); + vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); + // weight y1: shift weight_size + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_weightY, ptr[reg_weight_y + sizeof(float) + (vmm_tbl_y * grid_len)], vmm_mask); + cubic_planar_line(false); + + // y2 + vpaddd(vmm_index_y_itr, vmm_index_in_y, vmm_one); + // crop to [0, IH - 1] + vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); + vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); + // weight y2 + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_weightY, ptr[reg_weight_y + 2 * sizeof(float) + (vmm_tbl_y * grid_len)], vmm_mask); + cubic_planar_line(false); + + // y3 + vpaddd(vmm_index_y_itr, vmm_index_in_y, vmm_one); + vpaddd(vmm_index_y_itr, vmm_index_y_itr, vmm_one); + // crop to [0, IH - 1] + vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); + vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); + // weight y3 + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_weightY, ptr[reg_weight_y + 3 * sizeof(float) + (vmm_tbl_y * grid_len)], vmm_mask); + cubic_planar_line(false); + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, true); // oc_off is broadcast and always the same value for this channel + } + store(vmm_val, reg_dst, vector_step); + + add(reg_tbl_y, vector_step * sizeof(int)); // sizeof(int): sequence by dd() + add(reg_tbl_x, vector_step * sizeof(int)); + add(reg_dst, vector_step * jcp_.dst_data_size); + + sub(reg_work_amount, vector_step); + + jmp(main_loop_label, T_NEAR); + } + L(main_loop_end_label); + + L(tail_loop_label); + { + cmp(reg_work_amount, 1); + jl(tail_loop_end_label, T_NEAR); + + // get idx for input + uni_vmovss(Xmm(vmm_tbl_y.getIdx()), ptr[reg_tbl_y]); + gather_i32_indices(vmm_index_in_y, reg_index_y, 0, vmm_tbl_y, 1, 
Precision::I32, true); + + uni_vmovss(Xmm(vmm_val.getIdx()), ptr[reg_tbl_x]); + gather_i32_indices(vmm_index_in_x, reg_index, 0, vmm_val, 1, Precision::I32, true); + // gather weightX by input idx, used in y0-y3 + gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, Precision::FP32, true); + gather_i32_indices(vmm_weightX1, reg_weight_x, sizeof(float), vmm_val, grid_len, Precision::FP32, true); + gather_i32_indices(vmm_weightX2, reg_weight_x, 2 * sizeof(float), vmm_val, grid_len, Precision::FP32, true); + gather_i32_indices(vmm_weightX3, reg_weight_x, 3 * sizeof(float), vmm_val, grid_len, Precision::FP32, true); + // vmm_val is now relieved and used for dst_value + + uni_vpxor(vmm_val, vmm_val, vmm_val); + // y0 + vpsubd(vmm_index_y_itr, vmm_index_in_y, vmm_one); + // crop to [0, IH - 1] + vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); + vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); + + gather_i32_indices(vmm_weightY, reg_weight_y, 0, vmm_tbl_y, grid_len, Precision::FP32, true); + cubic_planar_line(true); + + // y1 + // crop to [0, IH - 1] + vpminsd(vmm_index_y_itr, vmm_index_in_y, cubic_planar_table_val(1)); + vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); + // weight y1: shift weight_size + gather_i32_indices(vmm_weightY, reg_weight_y, sizeof(float), vmm_tbl_y, grid_len, Precision::FP32, true); + cubic_planar_line(true); + + // y2 + vpaddd(vmm_index_y_itr, vmm_index_in_y, vmm_one); + // crop to [0, IH - 1] + vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); + vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); + // weight y2 + gather_i32_indices(vmm_weightY, reg_weight_y, 2 * sizeof(float), vmm_tbl_y, grid_len, Precision::FP32, true); + cubic_planar_line(true); + + // y3 + vpaddd(vmm_index_y_itr, vmm_index_in_y, vmm_one); + vpaddd(vmm_index_y_itr, vmm_index_y_itr, vmm_one); + // crop to [0, IH - 1] + vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); + vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); + // weight y3 + gather_i32_indices(vmm_weightY, reg_weight_y, 3 * sizeof(float), vmm_tbl_y, grid_len, Precision::FP32, true); + cubic_planar_line(true); + + if (attr_.post_ops_.len() != 0) { + apply_post_ops(jcp_.dst_prc, true); // oc_off is broadcast and always the same value for this channel + } + store(vmm_val, reg_dst, scalar_step); + + add(reg_tbl_y, scalar_step * sizeof(int)); // sizeof(int): sequence with dd() + add(reg_tbl_x, scalar_step * sizeof(int)); + add(reg_dst, scalar_step * jcp_.dst_data_size); + + sub(reg_work_amount, scalar_step); + + jmp(tail_loop_label, T_NEAR); + } + L(tail_loop_end_label); + } + + inline void cubic_planar_line(bool is_scalar) { + uni_vpxor(vmm_dstX, vmm_dstX, vmm_dstX); + cubic_planar_pixel(0, is_scalar); + cubic_planar_pixel(1, is_scalar); + cubic_planar_pixel(2, is_scalar); + cubic_planar_pixel(3, is_scalar); + uni_vfmadd231ps(vmm_val, vmm_dstX, vmm_weightY); + } + + inline void cubic_planar_pixel(int itr, bool is_scalar) { + // vmm_index_in_x have index for src + if (itr == 0) { + vpsubd(vmm_index_x_itr, vmm_index_in_x, vmm_one); + } else if (itr == 1) { + vpaddd(vmm_index_x_itr, vmm_index_in_x, vmm_zero); + } else if (itr == 2) { + vpaddd(vmm_index_x_itr, vmm_index_in_x, vmm_one); + } else if (itr == 3) { + vpaddd(vmm_index_x_itr, vmm_index_in_x, vmm_one); + vpaddd(vmm_index_x_itr, vmm_index_x_itr, vmm_one); + } + + // crop to [0, IW - 1] + vpminsd(vmm_index_x_itr, vmm_index_x_itr, cubic_planar_table_val(2)); + vpmaxsd(vmm_index_x_itr, vmm_index_x_itr, 
vmm_zero); + + // value + // index is: ptr[reg_src + (vmm_index_y_itr * jcp_.IW + vmm_index_x_itr) * jcp_.src_data_size] + uni_vmovdqu(vmm_mask, cubic_planar_table_val(2)); + vpaddd(vmm_mask, vmm_mask, vmm_one); // (IW - 1) + 1 = IW + uni_vpmulld(vmm_mask, vmm_mask, vmm_index_y_itr); + uni_vpaddd(vmm_index_x_itr, vmm_index_x_itr, vmm_mask); + gather_i32_indices(vmm_src, reg_src, 0, vmm_index_x_itr, jcp_.src_data_size, Precision::FP32, is_scalar); + + if (itr == 0) { + uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX0); + } else if (itr == 1) { + uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX1); + } else if (itr == 2) { + uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX2); + } else if (itr == 3) { + uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX3); + } + } + + inline void prepare_cubic_planar_table() { + auto broadcast_int = [&](int val) { + for (size_t d = 0; d < vlen / sizeof(int); ++d) { + dd(val); + } + }; + + align(64); + L(l_table_constant); + broadcast_int(vals_for_cubic_planar.int_one); + broadcast_int(jcp_.IH - 1); + broadcast_int(jcp_.IW - 1); + dd(vals_for_cubic_planar.mask_gather_avx512); + } + + struct vals_for_cubic_planar_type { + int int_one = 0x00000001; + int mask_gather_avx512 = 0x0000ffff; // 00000000000000001111111111111111 + } vals_for_cubic_planar; + + inline Xbyak::Address cubic_planar_table_val(int index) { + return ptr[reg_table + index * vlen]; + } + + // always gather to Vmm, compute with Vmm, store with Xmm if scalar_step + inline void gather_i32_indices(Vmm vmm_src, const Xbyak::Reg64 &base, int offset, Vmm vmm_indices, int scale, + Precision src_prc, bool is_scalar) { + Xbyak::Address table_idx = ptr[base + offset + vmm_indices * scale]; + if ((isa == cpu::x64::avx512_core) && !is_scalar) { + // [0-15] bit of int to mask + kmovw(k_mask, cubic_planar_table_val(3)); + if (src_prc == Precision::FP32) { + vgatherdps(vmm_src | k_mask, table_idx); // dword index, packed single data + } else if (src_prc == Precision::I32) { + vpgatherdd(vmm_src | k_mask, table_idx); // dword index, dword data + } + } else if ((isa == cpu::x64::avx2) && !is_scalar) { + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + if (src_prc == Precision::FP32) { + vgatherdps(vmm_src, table_idx, vmm_mask); + } else if (src_prc == Precision::I32) { + vpgatherdd(vmm_src, table_idx, vmm_mask); + } + } else { + const int gpr_size = 8; + sub(rsp, gpr_size); + // move content in register to content in address(ptr[]) + mov(ptr[rsp], reg_tmp_64); + + // replace index with value in stack + sub(rsp, vlen); + uni_vmovdqu(ptr[rsp], vmm_indices); + + int repeats = is_scalar ? 1 : vlen / sizeof(float); + for (size_t i = 0; i < repeats; ++i) { + mov(reg_tmp_64.cvt32(), ptr[rsp + i * sizeof(int)]); // sizeof(int) index_size + table_idx = ptr[base + offset + reg_tmp_64 * scale]; // scale: sizeof(float) value_size + mov(reg_tmp_64.cvt32(), table_idx); + mov(ptr[rsp + i * sizeof(int)], reg_tmp_64.cvt32()); + } + + uni_vmovups(vmm_src, ptr[rsp]); + add(rsp, vlen); + // restore GPR state + mov(reg_tmp_64, ptr[rsp]); + add(rsp, gpr_size); + } + } + + // is_broadcast for broadcasting param for depth_wise and quantize(channel-sensitive post-ops), for fusion with plain layout. 
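The gather_i32_indices fallback shown above emulates a hardware gather when neither the AVX-512 masked gather nor the AVX2 vgatherdps/vpgatherdd path applies: it spills the index vector to the stack and loads each element with scalar moves. A rough C++ equivalent (illustrative only; offsets and scale expressed in elements rather than bytes) is:

#include <cstdint>
#include <cstddef>

// Hypothetical scalar emulation of a dword gather: for each lane, read the
// index, then load base[offset + index * scale] into the destination lane.
void gather_f32(float* dst, const float* base, std::ptrdiff_t offset_elems,
                const int32_t* indices, int scale_elems, size_t lanes) {
    for (size_t i = 0; i < lanes; ++i)
        dst[i] = base[offset_elems +
                      static_cast<std::ptrdiff_t>(indices[i]) * scale_elems];
}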
+ void apply_post_ops(Precision dst_prc, bool is_broadcast) { + const auto &p = attr_.post_ops_; + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + int quantization_inj_idx = 0; + int post_ops_data_offset = 0; + for (int i = 0; i < p.len(); i++) { + auto& post_op = p.entry_[i]; + if (post_op.is_eltwise()) { + eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1); + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + mov(reg_d_weights, ptr[reg_post_ops_data + post_ops_data_offset]); + add(reg_d_weights, reg_oc_off); + + // weight and bias is padded. scalar as vector. + depthwise_injectors[depthwise_inj_idx]->compute_vector_range( + vmm_val.getIdx(), vmm_val.getIdx() + 1, reg_d_weights, reg_d_weights, is_broadcast); + + depthwise_inj_idx++; + post_ops_data_offset += depthwise_injectors[depthwise_inj_idx]->memoryStep(); + } else if (post_op.is_quantization()) { + bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; + bool do_rounding = do_dequantization || dst_prc == Precision::FP32 || i != p.len() - 1; + + int s_idx = vmm_val.getIdx(); + + quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); + quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0, 0, is_broadcast); + + quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); + quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, 0, is_broadcast); + + if (do_dequantization) { + quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); + quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0, 0, is_broadcast); + } + + post_ops_data_offset += quantization_injectors[quantization_inj_idx]->memoryStep(); + quantization_inj_idx++; + } + } + } +}; + +// for ndhwc and nCdhw8c[16c] +// input may be f32/bf16/int8, fused->output varies +void ov::intel_cpu::JITInterpolateExecutor::NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { + int *index_d = static_cast(&indexTable[0]); + int *index_h = static_cast(&indexTable[OD]); + int *index_w = static_cast(&indexTable[OD + OH]); + bool is_nhwc = (jitInterpolateAttrs.layout == by_channel); + + for (int b = 0; b < B; b++) { + if (is_nhwc) { + const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * C * b) * srcDataSize; + uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * C * b) * dstDataSize; + std::vector index_w_kernel(OW); + for (int ox = 0; ox < OW; ox++) { + index_w_kernel[ox] = index_w[ox] * C * srcDataSize; + } + parallel_for2d(OD, OH, [&](size_t d, size_t h) { + // kernel for C * OW + uint8_t *out_ptr_dh = out_ptr + (C * OW * OH * d + C * OW * h) * dstDataSize; + const uint8_t *in_ptr_dh = in_ptr + (C * IW * IH * index_d[d] + C * IW * index_h[h]) * srcDataSize; + auto arg = jit_interpolate_call_args(); + arg.dst = out_ptr_dh; + arg.src_ptr[0] = in_ptr_dh; + arg.index = static_cast(&(index_w_kernel[0])); + arg.work_amount = C; + arg.oc_off = 0; + arg.post_op_data = post_ops_data_; + (*interpolateKernel)(&arg); + }); + } else { // for blk + int blk_size = mayiuse(cpu::x64::avx512_core) ? 
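+            // 16-channel blocks (nCdhw16c) when avx512 zmm registers are available, otherwise 8-channel (nCdhw8c) blocks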
16 : 8; + int CB = div_up(C, blk_size); + const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * CB * blk_size * b) * srcDataSize; + uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * CB * blk_size * b) * dstDataSize; + std::vector index_w_kernel(OW); + for (int ox = 0; ox < OW; ox++) { + index_w_kernel[ox] = index_w[ox] * blk_size * srcDataSize; + } + parallel_for2d(CB, OD, [&](size_t cb, size_t d) { + uint8_t *out_ptr_cbd = out_ptr + (blk_size * OW * OH * OD * cb + blk_size * OW * OH * d) * dstDataSize; + const uint8_t *in_ptr_cbd = in_ptr + (blk_size * IW * IH * ID * cb + blk_size * IW * IH * index_d[d]) * srcDataSize; + auto arg = jit_interpolate_call_args(); + for (int h = 0; h < OH; h++) { // kernel for blk_size * OW + arg.dst = out_ptr_cbd + blk_size * OW * h * dstDataSize; + arg.src_ptr[0] = in_ptr_cbd + blk_size * IW * index_h[h] * srcDataSize; + arg.index = static_cast(&(index_w_kernel[0])); + arg.work_amount = static_cast(OW); + arg.oc_off = cb * blk_size * sizeof(float); + arg.post_op_data = post_ops_data_; + (*interpolateKernel)(&arg); + } + }); + } + } // batch end +} + + +void ov::intel_cpu::JITInterpolateExecutor::NNPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { + int *index_d = static_cast(&indexTable[0]); + int *index_h = static_cast(&indexTable[OD]); + int *index_w = static_cast(&indexTable[OD + OH]); + + std::vector index_kernel(OH + OW); + // index_h * IW * srcDataSize to reduce and simplify redundant compute + for (int oh = 0; oh < OH; oh++) { + index_kernel[oh] = index_h[oh] * IW * srcDataSize; + } + // index_w * srcDataSize + for (int ow = 0; ow < OW; ow++) { + index_kernel[OH + ow] = index_w[ow] * srcDataSize; + } + + parallel_for3d(B, C, OD, [&](size_t b, size_t c, size_t od) { + const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]) * srcDataSize; + uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od) * dstDataSize; + + auto arg = jit_interpolate_call_args(); + arg.src_ptr[0] = in_ptr; + arg.dst = out_ptr; + arg.index = static_cast(&index_kernel[0]); // need index_h and index_w in kernel, it's in continous memory so one param + arg.oc_off = static_cast(c * sizeof(float)); + // work_amount is OH(out loop) and OW(inner loop), can get in kernel from jcp. + arg.post_op_data = post_ops_data_; + (*interpolateKernel)(&arg); + }); +} + + +void ov::intel_cpu::JITInterpolateExecutor::linearOnnxPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, int B, int C, + int ID, int IH, int IW, int OD, int OH, int OW) { + // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7 + // weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 + int *index = static_cast(&indexTable[0]); + int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 
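+    // 8 corner samples (MAX_INPUT_INTERPOLATE) per output point for 3D, 4 for 2D, 2 for 1D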
+                                                       4 : 2);
+    int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16);
+    float *weight = reinterpret_cast<float*>(&indexTable[scratchLen]);
+
+    parallel_for2d(B, C, [&](size_t b, size_t c) {
+        uint8_t *out_ptr_nc = out_ptr_ + (OH * OW * OD * C * b + OH * OW * OD * c) * dstDataSize;
+        const uint8_t *in_ptr_nc = in_ptr_ + (IH * IW * ID * C * b + IH * IW * ID * c) * srcDataSize;
+        auto arg = jit_interpolate_call_args();
+        arg.src_ptr[0] = in_ptr_nc;
+        arg.index = static_cast<int*>(&index[0]);
+        arg.weight_ptr[0] = static_cast<float*>(&weight[0]);
+        arg.dst = out_ptr_nc;
+        arg.work_amount = OW * OH * OD;
+        arg.oc_off = static_cast<size_t>(c * sizeof(float));
+        arg.post_op_data = post_ops_data_;
+        (*interpolateKernel)(&arg);
+    });
+}
+
+void ov::intel_cpu::JITInterpolateExecutor::linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_,
+                                                                int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) {
+    // left:OW right:OW Top:OH Bottom:OH Front:OD End:OD
+    std::vector<int*> indexPtr(MAX_INPUT_INTERPOLATE, 0);
+    std::vector<float*> weightPtr(MAX_INPUT_INTERPOLATE, 0);
+    size_t scratchLen = rnd_up(OW + OW + OH + OH + OD + OD, 16);
+    indexPtr[0] = static_cast<int*>(&indexTable[0]);
+    indexPtr[1] = static_cast<int*>(&indexTable[OW]);
+    indexPtr[2] = static_cast<int*>(&indexTable[2 * OW]);
+    indexPtr[3] = static_cast<int*>(&indexTable[2 * OW + OH]);
+    indexPtr[4] = static_cast<int*>(&indexTable[2 * OW + 2 * OH]);
+    indexPtr[5] = static_cast<int*>(&indexTable[2 * OW + 2 * OH + OD]);
+
+    weightPtr[0] = reinterpret_cast<float*>(&indexTable[scratchLen]);
+    weightPtr[1] = reinterpret_cast<float*>(&indexTable[scratchLen + OW]);
+    weightPtr[2] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW]);
+    weightPtr[3] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + OH]);
+    weightPtr[4] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + 2 * OH]);
+    weightPtr[5] = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + 2 * OH + OD]);
+
+    bool isByChannel = (jitInterpolateAttrs.layout == by_channel) ? true : false;
+
+    int blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8;
+    int CB = isByChannel ? 1 : div_up(C, blkSize);
+    int CGatherLen = isByChannel ? C : blkSize;
+    int workAmount = isByChannel ?
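+    // per output point the kernel processes all C channels (by_channel) or CB = div_up(C, blkSize) channel blocks (blocked)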
C : CB; + // n_CB(1)_d_h_w_8[16](c), () for by-channel + int C0 = OW * CGatherLen; + int C1 = OH * C0; + int C2 = OD * C1; + int C3 = CB * C2; + int I0 = IW * CGatherLen; + int I1 = IH * I0; + int I2 = ID * I1; + int I3 = CB * I2; + parallel_for3d(B, OD, OH, [&](size_t b, size_t d, size_t h) { + uint8_t *out_ptr_ndh = out_ptr_ + (C3 * b + C1 * d + C0 * h) * dstDataSize; + + const uint8_t *in_ptr_n = in_ptr_ + (I3 * b) * srcDataSize; + const uint8_t *in_ptr_nf = in_ptr_n + (indexPtr[4][d] * I1) * srcDataSize; + const uint8_t *in_ptr_nft = in_ptr_nf + (indexPtr[2][h] * I0) * srcDataSize; + const uint8_t *in_ptr_nfb = in_ptr_nf + (indexPtr[3][h] * I0) * srcDataSize; + const uint8_t *in_ptr_ne = in_ptr_n + (indexPtr[5][d] * I1) * srcDataSize; + const uint8_t *in_ptr_net = in_ptr_ne + (indexPtr[2][h] * I0) * srcDataSize; + const uint8_t *in_ptr_neb = in_ptr_ne + (indexPtr[3][h] * I0) * srcDataSize; + auto arg = jit_interpolate_call_args(); + for (int w = 0; w < OW; ++w) { + uint8_t *out_ptr_ndhw = out_ptr_ndh + CGatherLen * w * dstDataSize; + + arg.src_ptr[0] = in_ptr_nft + (indexPtr[0][w] * CGatherLen) * srcDataSize; + arg.src_ptr[1] = in_ptr_nft + (indexPtr[1][w] * CGatherLen) * srcDataSize; + arg.src_ptr[2] = in_ptr_nfb + (indexPtr[0][w] * CGatherLen) * srcDataSize; + arg.src_ptr[3] = in_ptr_nfb + (indexPtr[1][w] * CGatherLen) * srcDataSize; + arg.src_ptr[4] = in_ptr_net + (indexPtr[0][w] * CGatherLen) * srcDataSize; + arg.src_ptr[5] = in_ptr_net + (indexPtr[1][w] * CGatherLen) * srcDataSize; + arg.src_ptr[6] = in_ptr_neb + (indexPtr[0][w] * CGatherLen) * srcDataSize; + arg.src_ptr[7] = in_ptr_neb + (indexPtr[1][w] * CGatherLen) * srcDataSize; + arg.weight_ptr[0] = static_cast(&weightPtr[0][w]); + arg.weight_ptr[1] = static_cast(&weightPtr[1][w]); + arg.weight_ptr[2] = static_cast(&weightPtr[2][h]); + arg.weight_ptr[3] = static_cast(&weightPtr[3][h]); + arg.weight_ptr[4] = static_cast(&weightPtr[4][d]); + arg.weight_ptr[5] = static_cast(&weightPtr[5][d]); + arg.dst = out_ptr_ndhw; + arg.work_amount = workAmount; + arg.oc_off = 0; + arg.post_op_data = post_ops_data_; + (*interpolateKernel)(&arg); + } + }); +} + +void ov::intel_cpu::JITInterpolateExecutor::cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int IH, int IW, int OH, int OW) { + const int idxNum = 1; + int *xOrigin = static_cast(&indexTable[0]); + float *xFactor = reinterpret_cast(&indexTable[OW]); + int *yOrigin = static_cast(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW]); + float *yFactor = reinterpret_cast(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); + + int blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8; + int CB = div_up(C, blkSize); + int CSize = jitInterpolateAttrs.layout == InterpolateLayoutType::by_channel ? C : blkSize * CB; + int CGatherLen = jitInterpolateAttrs.layout == InterpolateLayoutType::by_channel ? C : blkSize; + int workAmount = jitInterpolateAttrs.layout == InterpolateLayoutType::by_channel ? 
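+    // as in linearOnnxCGathered: C channels per call for by_channel, CB channel blocks for the blocked layout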
C : CB; + + parallel_for3d(B, OH, OW, [&](size_t b, size_t h, size_t w) { + uint8_t *out_ptr_nhw = out_ptr_ + (OH * OW * CSize * b + OW * CGatherLen * h + CGatherLen * w) * dstDataSize; + const uint8_t *in_ptr_n = in_ptr_ + (IH * IW * CSize * b) * srcDataSize; + + std::vector kernelIndex(CUBIC_GRID_LEN * CUBIC_GRID_LEN); // 16 address offset to src(batch) or src(CB) + int iy = yOrigin[h]; + int ix = xOrigin[w]; + for (int y = iy - 1, i = 0; y <= iy + 2; y++, i++) { + int yInRange = std::max(0, std::min(y, IH - 1)); + yInRange = yInRange * CGatherLen * IW * srcDataSize; + for (int x = ix - 1, j = 0; x <= ix + 2; x++, j++) { + int xInRange = std::max(0, std::min(x, IW - 1)); + xInRange = yInRange + xInRange * CGatherLen * srcDataSize; + kernelIndex[i * CUBIC_GRID_LEN + j] = xInRange; + } + } + auto arg = jit_interpolate_call_args(); + arg.dst = out_ptr_nhw; + arg.src_ptr[0] = in_ptr_n; + arg.index = static_cast(&kernelIndex[0]); + // 0 for weight_W, 1 for weight_H + arg.weight_ptr[0] = static_cast(&xFactor[w * CUBIC_GRID_LEN]); + arg.weight_ptr[1] = static_cast(&yFactor[h * CUBIC_GRID_LEN]); + + // for by channel, src + step, dst + step, process next step on continuous memory + // for blk, src + IW*IH*blkSize, dst + OW*OH*blkSize, process the blkSize on next CB + arg.work_amount = workAmount; + arg.oc_off = 0; + arg.post_op_data = post_ops_data_; + (*interpolateKernel)(&arg); + }); +} + +void ov::intel_cpu::JITInterpolateExecutor::cubicPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int IH, int IW, int OH, int OW) { + int tblAdvance = 0; + int *xOrigin = static_cast(&indexTable[tblAdvance]); + tblAdvance += OW; + float *xFactor = reinterpret_cast(&indexTable[tblAdvance]); + tblAdvance += CUBIC_GRID_LEN * OW; + int *yOrigin = static_cast(&indexTable[tblAdvance]); + tblAdvance += OH; + float *yFactor = reinterpret_cast(&indexTable[tblAdvance]); + + tblAdvance += CUBIC_GRID_LEN * OH; + int *sequenceOH = static_cast(&indexTable[tblAdvance]); + tblAdvance += OW * OH; + int *sequenceOW = static_cast(&indexTable[tblAdvance]); + + parallel_for2d(B, C, [&](size_t n, size_t c) { + const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * C * n + IW * IH * c) * srcDataSize; + uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * C * n + OW * OH * c) * dstDataSize; + + auto arg = jit_interpolate_call_args(); + arg.dst = out_ptr_nc; + arg.src_ptr[0] = in_ptr_nc; + arg.index = xOrigin; + arg.src_ptr[1] = yOrigin; + arg.src_ptr[2] = static_cast(&sequenceOH[0]); + arg.src_ptr[3] = static_cast(&sequenceOW[0]); + arg.weight_ptr[0] = xFactor; + arg.weight_ptr[1] = yFactor; + arg.work_amount = static_cast(OW * OH); + arg.oc_off = static_cast(c * sizeof(float)); + arg.post_op_data = post_ops_data_; + (*interpolateKernel)(&arg); + }); +} + +bool ov::intel_cpu::JITInterpolateExecutor::init(const InterpolateAttrs &interpolateAttrs, + const std::vector &srcDescs, + const std::vector &dstDescs, + const dnnl::primitive_attr &attr) { + InterpolateExecutor::init(interpolateAttrs, srcDescs, dstDescs, attr); + jitInterpolateAttrs = interpolateAttrs; + auto jcp = jit_interpolate_config_params(); + jcp.mode = interpAttrs.mode; + jcp.src_prc = interpAttrs.inPrc; + jcp.dst_prc = interpAttrs.outPrc; + jcp.src_data_size = jcp.src_prc.size(); + jcp.dst_data_size = jcp.dst_prc.size(); + jcp.indices_size = sizeof(int); + jcp.C = dstDim5d[1]; + jcp.OW = dstDim5d[4]; + jcp.OH = dstDim5d[3]; + jcp.OD = dstDim5d[2]; + jcp.IW = srcDimPad5d[4]; + jcp.IH = srcDimPad5d[3]; + jcp.ID = srcDimPad5d[2]; + 
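+    // srcDimPad5d / dstDim5d are expected to be filled by the InterpolateExecutor::init() call above, in {N, C, D, H, W} order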
jcp.spatial_dim_size = getSpatialDimsNum(srcDescs[0]->getShape().getDims().size()); + jcp.layout = interpAttrs.layout; + if (jcp.layout != InterpolateLayoutType::planar) { + if (mayiuse(cpu::x64::avx512_core)) { + interpolateKernel.reset(new jit_uni_interpolate_kernel_f32(jcp, *attr.get())); + implType = impl_desc_type::jit_avx512; + } else if (mayiuse(cpu::x64::avx2)) { + interpolateKernel.reset(new jit_uni_interpolate_kernel_f32(jcp, *attr.get())); + implType = impl_desc_type::jit_avx2; + } else if (mayiuse(cpu::x64::sse41)) { + interpolateKernel.reset(new jit_uni_interpolate_kernel_f32(jcp, *attr.get())); + implType = impl_desc_type::jit_sse42; + } + } else if (mayiuse(cpu::x64::avx2) && interpAttrs.inPrc == InferenceEngine::Precision::FP32) { + // gather ISA(for planar JIT kernel) for avx2 and fp32 + interpolateKernel.reset(new jit_uni_interpolate_kernel_f32(jcp, *attr.get())); + implType = impl_desc_type::jit_avx2; + } else { + IE_THROW() << "Can't create JitInterpolateExecutor"; + } + if (interpolateKernel) { + interpolateKernel->create_ker(); + } else { + IE_THROW() << "Can't compile JitInterpolateExecutor"; + } + return true; +} + +void ov::intel_cpu::JITInterpolateExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { + auto in_ptr_ = padPreprocess(src, dst); + auto out_ptr_ = static_cast(dst[0]->GetPtr()); + size_t N = srcDimPad5d[0], C = srcDimPad5d[1], ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; + size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; + + if (!interpolateKernel) { + IE_THROW() << "Can't execute, kernel for Interpolate node is not compiled"; + } + switch (jitInterpolateAttrs.mode) { + case InterpolateMode::nearest: { + if (jitInterpolateAttrs.layout == InterpolateLayoutType::planar) { + NNPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); + } else { + NNCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); + } + break; + } + case InterpolateMode::linear_onnx: { + if (jitInterpolateAttrs.layout == InterpolateLayoutType::planar) { + linearOnnxPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); + } else { + linearOnnxCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); + } + break; + } + case InterpolateMode::cubic: { + if (jitInterpolateAttrs.layout == InterpolateLayoutType::planar) { + cubicPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); + } else { + cubicCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); + } + break; + } + default: { + IE_THROW() << "JITInterpolateExecutor has unsupported interpolate mode: " << jitInterpolateAttrs.mode; + } + } +} + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_interpolate.hpp b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_interpolate.hpp new file mode 100644 index 00000000000000..820dc126e3c5b5 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_interpolate.hpp @@ -0,0 +1,111 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "../interpolate.hpp" + +namespace ov { +namespace intel_cpu { + +struct jit_interpolate_config_params { + InterpolateLayoutType layout; + InterpolateMode mode; + InferenceEngine::Precision src_prc; + InferenceEngine::Precision dst_prc; + int src_data_size; + int dst_data_size; + int indices_size; + int spatial_dim_size; + int C, ID, IH, IW, OD, 
OH, OW; +}; + +struct jit_interpolate_call_args { + const void *src_ptr[MAX_INPUT_INTERPOLATE]; + const void *weight_ptr[MAX_INPUT_INTERPOLATE]; + const int *index; + void *dst; + size_t work_amount; + size_t oc_off; + //ptr to array of post op inputs pointers (flat list) + const void* post_op_data; +}; + +struct jit_uni_interpolate_kernel { + void (*ker_)(const jit_interpolate_call_args *); + + void operator()(const jit_interpolate_call_args *args) { + assert(ker_); + ker_(args); + } + + explicit jit_uni_interpolate_kernel(jit_interpolate_config_params jcp, const dnnl_primitive_attr &attr) : ker_( + nullptr), jcp_(jcp), attr_(attr) {} + + virtual ~jit_uni_interpolate_kernel() {} + + virtual void create_ker() = 0; + + jit_interpolate_config_params jcp_; + const dnnl_primitive_attr &attr_; +}; + +class JITInterpolateExecutor : public InterpolateExecutor { +public: + JITInterpolateExecutor(const ExecutorContext::CPtr context) : InterpolateExecutor(context) {} + + bool init(const InterpolateAttrs &interpolateAttrs, + const std::vector &srcDescs, + const std::vector &dstDescs, + const dnnl::primitive_attr &attr) override; + + void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) override; + + impl_desc_type getImplType() const override { + return implType; + } + +private: + // nearest neighbor + void NNPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); + + void NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); + + // onnx linear + void linearOnnxPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); + + void linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); + + // cubic + void cubicPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int IH, int IW, int OH, int OW); + + void cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, + int B, int C, int IH, int IW, int OH, int OW); + +private: + InterpolateAttrs jitInterpolateAttrs; + impl_desc_type implType = impl_desc_type::jit; + std::shared_ptr interpolateKernel = nullptr; +}; + +class JITInterpolateExecutorBuilder : public InterpolateExecutorBuilder { +public: + bool isSupported(const InterpolateAttrs& interpolateAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const override { + return true; + } + + InterpolateExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override { + return std::make_shared(context); + } +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index 9f6a36f55184dc..04406319dec3b4 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -205,6 +205,7 @@ void Gather::initSupportedPrimitiveDescriptors() { } void Gather::createPrimitive() { +#if defined(OPENVINO_ARCH_X86_64) uint64_t idxElPerVec = 1; if (!isDynamicNode()) { idxElPerVec = x64::mayiuse(x64::avx512_core) ? 
x64::cpu_isa_traits::vlen / idxTypeSize : @@ -269,7 +270,7 @@ void Gather::createPrimitive() { } } } - +#endif Node::createPrimitive(); } @@ -323,6 +324,7 @@ void Gather::prepareParams() { totalWork = beforeBatchSize * betweenBatchAndAxisSize * specIndicesSize * afterAxisSize; } +#if defined(OPENVINO_ARCH_X86_64) const auto& selectedPD = getSelectedPrimitiveDescriptor(); if (jitKernel && jitKernel->isSupportedConfiguration(afterAxisSize)) { if (x64::mayiuse(x64::avx512_core)) { @@ -330,12 +332,12 @@ void Gather::prepareParams() { } else if (x64::mayiuse(x64::avx2)) { selectedPD->setImplementationType(jit_avx2); } - } else { - selectedPD->setImplementationType(ref_any); } +#endif } void Gather::execute(dnnl::stream strm) { +#if defined(OPENVINO_ARCH_X86_64) if (jitKernel && jitKernel->isSupportedConfiguration(afterAxisSize)) { const void* srcIndices = getParentEdgeAt(GATHER_INDICES)->getMemoryPtr()->GetPtr(); const void* srcData = getParentEdgeAt(GATHER_DATA)->getMemoryPtr()->GetPtr(); @@ -383,12 +385,15 @@ void Gather::execute(dnnl::stream strm) { }; parallel_nt(0, threadBody); - } else { - execReference(); + + return; } +#endif + execReference(); } void Gather::executeDynamicImpl(dnnl::stream strm) { +#if defined(OPENVINO_ARCH_X86_64) if (jitKernel && jitKernel->isSupportedConfiguration(afterAxisSize)) { const void* srcIndices = getParentEdgeAt(GATHER_INDICES)->getMemoryPtr()->GetPtr(); const void* srcData = getParentEdgeAt(GATHER_DATA)->getMemoryPtr()->GetPtr(); @@ -442,9 +447,11 @@ void Gather::executeDynamicImpl(dnnl::stream strm) { }; parallel_nt(0, threadBody); - } else { - execReference(); + + return; } +#endif + execReference(); } void Gather::initShortParams(threadExecParams& p, const uint64_t start) { diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp index 9027a2fd28e5f4..4227683a873d4a 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp @@ -3,6 +3,7 @@ // #include "interpolate.h" +#include "executors/common/ref_interpolate.hpp" #include "fake_quantize.h" #include "eltwise.h" @@ -13,16 +14,6 @@ #include "ie_parallel.hpp" #include -#include -#include -#include -#include -#include -#include "common/cpu_memcpy.h" -#include "utils/bfloat16.hpp" -#include "emitters/x64/jit_bf16_emitters.hpp" -#include "emitters/x64/jit_load_store_emitters.hpp" - #include #include #include @@ -31,1345 +22,19 @@ #include "utils/cpu_utils.hpp" #include -using namespace dnnl; -using namespace InferenceEngine; -using namespace dnnl::impl; -using namespace dnnl::impl::cpu; -using namespace dnnl::impl::cpu::x64; -using namespace dnnl::impl::utils; -using namespace Xbyak; - - -#define GET_OFF(field) offsetof(jit_interpolate_call_args, field) +#include "common/cpu_memcpy.h" +#include "utils/bfloat16.hpp" +#include "emitters/x64/jit_load_store_emitters.hpp" namespace ov { namespace intel_cpu { namespace node { -template -struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_interpolate_kernel_f32) - - explicit jit_uni_interpolate_kernel_f32(jit_interpolate_config_params jcp, const dnnl_primitive_attr &attr) - : jit_uni_interpolate_kernel(jcp, attr), jit_generator(jit_name()) {} - - void create_ker() override { - jit_generator::create_kernel(); - ker_ = (decltype(ker_))jit_ker(); - } - - void generate() override { - // dummy second reg_tmp_64 as no fill needed - load_pool_gpr_idxs = 
{static_cast(reg_tmp_64.getIdx()), static_cast(reg_tmp_64.getIdx())}; - store_pool_gpr_idxs = {static_cast(reg_tmp_64.getIdx())}; - store_pool_vec_idxs = {static_cast(vmm_zero.getIdx())}; - - const auto &p = attr_.post_ops_; - for (int i = 0; i < p.len(); i++) { - auto &post_op = p.entry_[i]; - if (post_op.is_eltwise()) { - eltwise_injectors.push_back(std::make_shared>( - this, - post_op.eltwise.alg, - post_op.eltwise.alpha, - post_op.eltwise.beta, - 1.f)); - } else if (post_op.is_depthwise()) { - depthwise_injectors.push_back(std::make_shared>( - this, - post_op)); - } else if (post_op.is_quantization()) { - quantization_injectors.push_back(std::make_shared>( - this, post_op, vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); - } - } - - this->preamble(); - - if (attr_.post_ops_.len() != 0) { - mov(reg_post_ops_data, ptr[reg_params + GET_OFF(post_op_data)]); - mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]); - } - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - - switch (jcp_.mode) { - case InterpolateMode::nearest: { - mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); - mov(reg_index, ptr[reg_params + GET_OFF(index)]); - mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - - switch (jcp_.layout) { - case InterpolateLayoutType::planar: { - nn_planar(); - break; - } - case InterpolateLayoutType::block: { - nn_blk(); - break; - } - case InterpolateLayoutType::by_channel: { - nn_by_channel(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with nearest neighbor mode."); - } - break; - } - case InterpolateMode::linear_onnx: { - switch (jcp_.layout) { - case InterpolateLayoutType::planar: { - linear_onnx_planar(); - break; - } - case InterpolateLayoutType::block: - case InterpolateLayoutType::by_channel: { - linear_onnx_c_gathered(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with linear_onnx mode."); - } - break; - } - case InterpolateMode::cubic: { - switch (jcp_.layout) { - case InterpolateLayoutType::planar: { - cubic_planar(); - break; - } - case InterpolateLayoutType::block: - case InterpolateLayoutType::by_channel: { - cubic_c_gathered(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with cubic mode."); - } - break; - } - case InterpolateMode::linear: { - assert(!"unsupported mode for interpolate layer with JITTED implimentation."); - break; - } - default: { - assert(!"unsupported mode for interpolate layer."); - } - } - - this->postamble(); - - emit_emitters_data(); - for (auto& inj : eltwise_injectors) - inj->prepare_table(); - if ((jcp_.mode == InterpolateMode::cubic) && (jcp_.layout == InterpolateLayoutType::planar)) { - prepare_cubic_planar_table(); - } - } - -private: - using Vmm = typename conditional3::type; - - const int vlen = cpu_isa_traits::vlen; - const int vector_step = vlen / sizeof(float); - const int tail_step = jcp_.C % vector_step; - const int scalar_step = 1; - - Xbyak::Reg64 reg_src = r8; - Xbyak::Reg64 reg_src_aux = r15; - Xbyak::Reg64 reg_src_aux1 = r11; - Xbyak::Reg64 reg_src_aux2 = r12; - Xbyak::Reg64 reg_dst = r9; - Xbyak::Reg64 reg_work_amount = r13; - Xbyak::Reg64 reg_index = r14; - Xbyak::Reg64 reg_params = abi_param1; - - Reg8 reg_tmp_8 = r10b; - Reg32 reg_tmp_32 = r10d; - Reg64 reg_tmp_64 = r10; - - Xbyak::Reg64 reg_oc_off = rax; - Xbyak::Reg64 reg_post_ops_data = rbx; - Xbyak::Reg64 reg_d_weights = reg_tmp_64; - Xbyak::Reg64 reg_d_bias = rcx; - Xbyak::Reg32 reg_index_offset = edx; 
- - // for cubic planar - Xbyak::Reg64 reg_tbl_y = rsi; - Xbyak::Reg64 reg_tbl_x = rbp; - Xbyak::Reg64 reg_table = rdx; // do not need reg_index_offset in this mode, so use rdx - - Vmm vmm_val = Vmm(1); - Vmm vmm_index = Vmm(0); - Vmm vmm_zero = Vmm(2); - Vmm vmm_mask = Vmm(3); - Vmm vmm_d_weights = Vmm(4); - Vmm vmm_d_bias = Vmm(5); - - // for linear - Vmm vmm_weightT = Vmm(15); - Vmm vmm_weightB = Vmm(14); - Vmm vmm_weightL = Vmm(13); - Vmm vmm_weightR = Vmm(12); - Vmm vmm_weightF = Vmm(6); - Vmm vmm_weightE = Vmm(7); - Vmm vmm_valTL = Vmm(11); - Vmm vmm_valTR = vmm_val; - Vmm vmm_valBL = Vmm(9); - Vmm vmm_valBR = Vmm(8); - - // for cubic - Vmm vmm_src = Vmm(6); - Xmm xmm_src = Xmm(6); - Vmm vmm_dstX = Vmm(7); - - Vmm vmm_weightX0 = vmm_weightT; - Vmm vmm_weightX1 = vmm_weightB; - Vmm vmm_weightX2 = vmm_weightL; - Vmm vmm_weightX3 = vmm_weightR; - Vmm vmm_weightY0 = vmm_valTL; - Vmm vmm_weightY1 = Vmm(10); // vmm_valTR is vmm_val, need reserved - Vmm vmm_weightY2 = vmm_valBL; - Vmm vmm_weightY3 = vmm_valBR; - // cubic planar - Vmm vmm_one = vmm_index; - Vmm vmm_weightY = vmm_weightY0; - Vmm vmm_index_y_itr = vmm_weightY1; - Vmm vmm_index_x_itr = vmm_weightY2; - Vmm vmm_tbl_y = vmm_weightY3; - // temporally used. when post ops, value in vmm_d_weights and vmm_d_bias is re-loaded(init) each time. - Vmm vmm_index_in_y = vmm_d_weights; - Vmm vmm_index_in_x = vmm_d_bias; - - Xbyak::Label l_table_constant; - Opmask k_mask = Xbyak::Opmask(1); - - std::unordered_map> emitters; - - std::vector store_pool_gpr_idxs; - std::vector store_pool_vec_idxs; - std::vector load_pool_gpr_idxs; - - std::vector>> eltwise_injectors; - std::vector>> depthwise_injectors; - std::vector>> quantization_injectors; - - void emit_emitters_data() { - for (const auto& emitter : emitters) { - if (emitter.second) - emitter.second->emit_data(); - } - } - - inline void load(Xbyak::Reg64 reg_src, Vmm vmm_src, const int elt_num, const int offset = 0) { - emit_load(reg_src, vmm_src, jcp_.src_prc, Precision::FP32, elt_num, offset); - } - - inline void load_weights(Xbyak::Reg64 reg_src, Vmm vmm_src, const int elt_num, const int offset = 0) { - emit_load(reg_src, vmm_src, Precision::FP32, Precision::FP32, elt_num, offset); - } - - inline void emit_load(Xbyak::Reg64 reg_src, Vmm vmm_src, Precision src_prc, Precision dst_prc, const int elt_num, const int offset = 0) { - const auto seed = load_emitter_params(src_prc, dst_prc, elt_num).hash(); - if (!emitters[seed]) { - emitters[seed].reset(new jit_load_emitter(this, isa, src_prc, dst_prc, elt_num)); - } - - emitters[seed]->emit_code({static_cast(reg_src.getIdx()), static_cast(offset)}, - {static_cast(vmm_src.getIdx())}, {}, {load_pool_gpr_idxs}); - } - - inline void store(Vmm vmm_dst, Xbyak::Reg64 reg_dst, const int elt_num, const int offset = 0) { - const auto seed = store_emitter_params(Precision::FP32, jcp_.dst_prc, elt_num).hash(); - if (!emitters[seed]) { - emitters[seed].reset(new jit_store_emitter(this, isa, Precision::FP32, jcp_.dst_prc, elt_num)); - } - - // for cases when Store emitter need 2 aux vmm we can use vmm_dst as second aux vmm - std::vector local_store_pool_vec_idxs = { static_cast(vmm_dst.getIdx()) }; - local_store_pool_vec_idxs.insert(local_store_pool_vec_idxs.begin(), store_pool_vec_idxs.begin(), store_pool_vec_idxs.end()); - - emitters[seed]->emit_code({static_cast(vmm_dst.getIdx()), static_cast(offset)}, - {static_cast(reg_dst.getIdx())}, - {local_store_pool_vec_idxs}, {store_pool_gpr_idxs}); - } - - void nn_planar() { - Xbyak::Reg64 reg_index_h = 
reg_src_aux1; - Xbyak::Reg64 reg_index_w = reg_src_aux2; - mov(reg_index_h, reg_index); - // reg_index represent reg_index_w - add(reg_index, jcp_.OH * jcp_.indices_size); - // bk for reset to reg_index_w - mov(reg_index_w, reg_index); - - Xbyak::Label out_loop_label; - Xbyak::Label out_loop_end; - - Xbyak::Reg64 reg_work_amount_oh = rdi; - mov(reg_work_amount_oh, jcp_.OH); - L(out_loop_label); - { - // outloop status - cmp(reg_work_amount_oh, 1); - jl(out_loop_end, T_NEAR); - - //reset work_amount to OW - mov(reg_work_amount, jcp_.OW); - - Xbyak::Reg64 reg_src_h = rsi; - mov(reg_src_h, reg_src); - // index_h * IW * dataSize done when built to avoid redundent compute - mov(reg_index_offset, dword[reg_index_h]); - add(reg_src_h, reg_index_offset); // reg_src_h now point to begin of row - - // reset index_w, index_w * dataSize done when built to avoid redundent compute - mov(reg_index, reg_index_w); - - Xbyak::Label nn_loop_label; - Xbyak::Label nn_loop_end_label; - Xbyak::Label nn_tail_loop_label; - Xbyak::Label nn_tail_loop_end_label; - - L(nn_loop_label); // inner loop - { - cmp(reg_work_amount, vector_step); - jl(nn_loop_end_label, T_NEAR); - - uni_vmovdqu(vmm_index, ptr[reg_index]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_val, ptr[reg_src_h + vmm_index], vmm_mask); - if (attr_.post_ops_.len() != 0) - apply_post_ops(jcp_.dst_prc, 1); - store(vmm_val, reg_dst, vector_step); - - add(reg_dst, vector_step * jcp_.dst_data_size); - add(reg_index, vector_step * jcp_.indices_size); - sub(reg_work_amount, vector_step); - - jmp(nn_loop_label, T_NEAR); - } - L(nn_loop_end_label); - - L(nn_tail_loop_label); - { - cmp(reg_work_amount, 1); - jl(nn_tail_loop_end_label, T_NEAR); - - mov(reg_src_aux, reg_src_h); - mov(reg_index_offset, dword[reg_index]); - add(reg_src_aux, reg_index_offset); - - load(reg_src_aux, vmm_val, scalar_step); - if (attr_.post_ops_.len() != 0) - apply_post_ops(jcp_.dst_prc, 1); - store(vmm_val, reg_dst, scalar_step); - - add(reg_dst, scalar_step * jcp_.dst_data_size); - add(reg_index, scalar_step * jcp_.indices_size); - sub(reg_work_amount, scalar_step); - - jmp(nn_tail_loop_label, T_NEAR); - } - L(nn_tail_loop_end_label); // inner loop end - - //increment index_h to next row - add(reg_index_h, jcp_.indices_size); - - sub(reg_work_amount_oh, 1); - jmp(out_loop_label, T_NEAR); - } - L(out_loop_end); - } - - void nn_blk() { - Xbyak::Label nn_loop_label; - Xbyak::Label nn_loop_end_label; - L(nn_loop_label); - { - cmp(reg_work_amount, 0); - jle(nn_loop_end_label, T_NEAR); - - mov(reg_src_aux, reg_src); - mov(reg_index_offset, dword[reg_index]); - add(reg_src_aux, reg_index_offset); - - load(reg_src_aux, vmm_val, vector_step); - if (attr_.post_ops_.len() != 0) - apply_post_ops(jcp_.dst_prc, 0); - store(vmm_val, reg_dst, vector_step); - add(reg_dst, vector_step * jcp_.dst_data_size); - - if (isa == cpu::x64::sse41) { - add(reg_src_aux, vector_step * jcp_.src_data_size); - load(reg_src_aux, vmm_val, vector_step); - if (attr_.post_ops_.len() != 0) { - add(reg_oc_off, vector_step * sizeof(float)); - apply_post_ops(jcp_.dst_prc, 0); - sub(reg_oc_off, vector_step * sizeof(float)); - } - store(vmm_val, reg_dst, vector_step); - add(reg_dst, vector_step * jcp_.dst_data_size); - } - - add(reg_index, jcp_.indices_size); - sub(reg_work_amount, 1); - - jmp(nn_loop_label, T_NEAR); - } - L(nn_loop_end_label); - } - - void nn_by_channel() { - // kernel for C * OW - Xbyak::Label out_loop_label; - Xbyak::Label out_loop_end; - - Xbyak::Reg64 reg_work_amount_bk = reg_src_aux2; - 
Xbyak::Reg64 reg_oc_off_bk = rsi; - mov(reg_work_amount_bk, ptr[reg_params + GET_OFF(work_amount)]); - if (attr_.post_ops_.len() != 0) { - mov(reg_oc_off_bk, ptr[reg_params + GET_OFF(oc_off)]); - } - - Xbyak::Reg64 reg_work_amount_out = reg_src_aux1; - mov(reg_work_amount_out, jcp_.OW); - L(out_loop_label); - { - cmp(reg_work_amount_out, 1); - jl(out_loop_end, T_NEAR); - - //inner loop for C - Xbyak::Label nn_loop_label; - Xbyak::Label nn_loop_end_label; - Xbyak::Label nn_tail_loop_label; - Xbyak::Label nn_tail_loop_end_label; - - // inner loop for C - // get current loop address reg_src_aux, from reg_src which is unchange, point this C * OW. - // reset offset and work_amount. - // dst and index address is continous, advanced each interator. - mov(reg_src_aux, reg_src); - // index*C*dataSize done when built to avoid redundent compute - mov(reg_index_offset, dword[reg_index]); - add(reg_src_aux, reg_index_offset); - - mov(reg_work_amount, reg_work_amount_bk); - if (attr_.post_ops_.len() != 0) - mov(reg_oc_off, reg_oc_off_bk); - - L(nn_loop_label); - { - cmp(reg_work_amount, vector_step); - jl(nn_loop_end_label, T_NEAR); - - load(reg_src_aux, vmm_val, vector_step); - if (attr_.post_ops_.len() != 0) - apply_post_ops(jcp_.dst_prc, 0); - store(vmm_val, reg_dst, vector_step); - - add(reg_dst, vector_step * jcp_.dst_data_size); - add(reg_src_aux, vector_step * jcp_.src_data_size); - add(reg_oc_off, vector_step * sizeof(float)); - sub(reg_work_amount, vector_step); - - jmp(nn_loop_label, T_NEAR); - } - L(nn_loop_end_label); - - if (tail_step != 0) { - load(reg_src_aux, vmm_val, tail_step); - if (attr_.post_ops_.len() != 0) - apply_post_ops(jcp_.dst_prc, 0); - store(vmm_val, reg_dst, tail_step); - - // check to remove below - add(reg_dst, tail_step * jcp_.dst_data_size); - add(reg_src_aux, tail_step * jcp_.src_data_size); - add(reg_oc_off, tail_step * sizeof(float)); - sub(reg_work_amount, tail_step); - } - add(reg_index, jcp_.indices_size); - sub(reg_work_amount_out, 1); - jmp(out_loop_label, T_NEAR); - } - L(out_loop_end); - } - - void linear_onnx_c_gathered() { - mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - // load weight - mov(reg_src, ptr[reg_params + GET_OFF(weight_ptr[0])]); - mov(reg_src_aux, ptr[reg_params + GET_OFF(weight_ptr[0]) + sizeof(size_t)]); - uni_vbroadcastss(vmm_weightL, ptr[reg_src]); - uni_vbroadcastss(vmm_weightR, ptr[reg_src_aux]); - if (jcp_.spatial_dim_size > 1) { - mov(reg_src_aux1, ptr[reg_params + GET_OFF(weight_ptr[0]) + 2 * sizeof(size_t)]); - mov(reg_src_aux2, ptr[reg_params + GET_OFF(weight_ptr[0]) + 3 * sizeof(size_t)]); - uni_vbroadcastss(vmm_weightT, ptr[reg_src_aux1]); - uni_vbroadcastss(vmm_weightB, ptr[reg_src_aux2]); - } - if (jcp_.spatial_dim_size > 2) { - mov(reg_src, ptr[reg_params + GET_OFF(weight_ptr[0]) + 4 * sizeof(size_t)]); - mov(reg_src_aux, ptr[reg_params + GET_OFF(weight_ptr[0]) + 5 * sizeof(size_t)]); - uni_vbroadcastss(vmm_weightF, ptr[reg_src]); - uni_vbroadcastss(vmm_weightE, ptr[reg_src_aux]); - } - // load src - mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); - mov(reg_src_aux, ptr[reg_params + GET_OFF(src_ptr[0]) + sizeof(size_t)]); - if (jcp_.spatial_dim_size > 1) { - mov(reg_src_aux1, ptr[reg_params + GET_OFF(src_ptr[0]) + 2 * sizeof(size_t)]); - mov(reg_src_aux2, ptr[reg_params + GET_OFF(src_ptr[0]) + 3 * sizeof(size_t)]); - } - Xbyak::Reg64 reg_src_aux4 = r14; - Xbyak::Reg64 reg_src_aux5 = rdx; - Xbyak::Reg64 reg_src_aux6 = rsi; - Xbyak::Reg64 reg_src_aux7 = rbp; - if (jcp_.spatial_dim_size > 2) { - mov(reg_src_aux4, 
ptr[reg_params + GET_OFF(src_ptr[0]) + 4 * sizeof(size_t)]); - mov(reg_src_aux5, ptr[reg_params + GET_OFF(src_ptr[0]) + 5 * sizeof(size_t)]); - mov(reg_src_aux6, ptr[reg_params + GET_OFF(src_ptr[0]) + 6 * sizeof(size_t)]); - mov(reg_src_aux7, ptr[reg_params + GET_OFF(src_ptr[0]) + 7 * sizeof(size_t)]); - } - mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - - int blk = (isa == cpu::x64::sse41) ? (2 * vector_step) : vector_step; - int dst_stride = (jcp_.layout == InterpolateLayoutType::by_channel) ? (vector_step * jcp_.dst_data_size) : - (blk * jcp_.OW * jcp_.OH * jcp_.OD * jcp_.dst_data_size); - int src_stride = (jcp_.layout == InterpolateLayoutType::by_channel) ? (vector_step * jcp_.src_data_size) : - (blk * jcp_.IW * jcp_.IH * jcp_.ID * jcp_.src_data_size); - - Xbyak::Label main_loop_label; - Xbyak::Label main_loop_end_label; - Xbyak::Label blk_tail_loop_label; - Xbyak::Label blk_tail_loop_end_label; - Xbyak::Label tail_loop_label; - Xbyak::Label tail_loop_end_label; - L(main_loop_label); - { - if (jcp_.layout == InterpolateLayoutType::by_channel) { - cmp(reg_work_amount, vector_step); - jl(main_loop_end_label, T_NEAR); - } else { - cmp(reg_work_amount, 1); - jl(main_loop_end_label, T_NEAR); - } - // progressive manner - load(reg_src, vmm_valTL, vector_step); - load(reg_src_aux, vmm_valTR, vector_step); - if (jcp_.spatial_dim_size == 1) { - linear_onnx_worker_1d(); - } - if (jcp_.spatial_dim_size > 1) { - load(reg_src_aux1, vmm_valBL, vector_step); - load(reg_src_aux2, vmm_valBR, vector_step); - linear_onnx_worker_2d(); - } - if (jcp_.spatial_dim_size > 2) { - uni_vmovups(vmm_d_bias, vmm_valTR); // temporally save front result to temp_vmm - load(reg_src_aux4, vmm_valTL, vector_step); - load(reg_src_aux5, vmm_valTR, vector_step); - load(reg_src_aux6, vmm_valBL, vector_step); - load(reg_src_aux7, vmm_valBR, vector_step); - - // 2d for end depth - linear_onnx_worker_2d(); - // 3th dimension - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight - } - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); // vmm_val is vmm_valTR - add(reg_oc_off, vector_step * sizeof(float)); - } - store(vmm_valTR, reg_dst, vector_step); - - if ((isa == cpu::x64::sse41) && (jcp_.layout == InterpolateLayoutType::block)) { - int offset_src = vector_step * jcp_.src_data_size; - load(reg_src, vmm_valTL, vector_step, offset_src); - load(reg_src_aux, vmm_valTR, vector_step, offset_src); - if (jcp_.spatial_dim_size == 1) { - linear_onnx_worker_1d(); - } - if (jcp_.spatial_dim_size > 1) { - load(reg_src_aux1, vmm_valBL, vector_step, offset_src); - load(reg_src_aux2, vmm_valBR, vector_step, offset_src); - linear_onnx_worker_2d(); - } - if (jcp_.spatial_dim_size > 2) { - uni_vmovups(vmm_d_bias, vmm_valTR); // temporally save front result to temp_vmm - load(reg_src_aux4, vmm_valTL, vector_step, offset_src); - load(reg_src_aux5, vmm_valTR, vector_step, offset_src); - load(reg_src_aux6, vmm_valBL, vector_step, offset_src); - load(reg_src_aux7, vmm_valBR, vector_step, offset_src); - // 2d for end depth - linear_onnx_worker_2d(); - // 3th dimension - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight - } - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); - add(reg_oc_off, vector_step * sizeof(float)); - } - 
int offset_dst = vector_step * jcp_.dst_data_size; - store(vmm_valTR, reg_dst, vector_step, offset_dst); - } - add(reg_dst, dst_stride); - add(reg_src, src_stride); - add(reg_src_aux, src_stride); - if (jcp_.spatial_dim_size > 1) { - add(reg_src_aux1, src_stride); - add(reg_src_aux2, src_stride); - } - if (jcp_.spatial_dim_size > 2) { - add(reg_src_aux4, src_stride); - add(reg_src_aux5, src_stride); - add(reg_src_aux6, src_stride); - add(reg_src_aux7, src_stride); - } - if (jcp_.layout == InterpolateLayoutType::by_channel) { - sub(reg_work_amount, vector_step); // work_amount is c - } else { - sub(reg_work_amount, 1); // work_amount = div_up(c, blk), no tails - } - - jmp(main_loop_label, T_NEAR); - } - L(main_loop_end_label); - - if ((jcp_.layout == InterpolateLayoutType::by_channel) && (tail_step != 0)) { - load(reg_src, vmm_valTL, tail_step); - load(reg_src_aux, vmm_valTR, tail_step); - if (jcp_.spatial_dim_size == 1) { - linear_onnx_worker_1d(); - } - if (jcp_.spatial_dim_size > 1) { - load(reg_src_aux1, vmm_valBL, tail_step); - load(reg_src_aux2, vmm_valBR, tail_step); - linear_onnx_worker_2d(); - } - if (jcp_.spatial_dim_size > 2) { - uni_vmovups(vmm_d_bias, vmm_valTR); // temporally save front result to temp_vmm - - load(reg_src_aux4, vmm_valTL, tail_step); - load(reg_src_aux5, vmm_valTR, tail_step); - load(reg_src_aux6, vmm_valBL, tail_step); - load(reg_src_aux7, vmm_valBR, tail_step); - // 2d for end depth - linear_onnx_worker_2d(); - // 3th dimension - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight - } - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); // vmm_val is vmm_valTR - add(reg_oc_off, tail_step * sizeof(float)); - } - - store(vmm_valTR, reg_dst, tail_step); - } - } - - void linear_onnx_planar() { - mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); - mov(reg_index, ptr[reg_params + GET_OFF(index)]); - mov(reg_src_aux, ptr[reg_params + GET_OFF(weight_ptr[0])]); - mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - - int index_stride = jcp_.OW * jcp_.OH * jcp_.OD * jcp_.indices_size; - int weight_stride = jcp_.OW * jcp_.OH * jcp_.OD * sizeof(float); - - Xbyak::Label main_loop_label; - Xbyak::Label main_loop_end_label; - Xbyak::Label tail_loop_label; - Xbyak::Label tail_loop_end_label; - L(main_loop_label); - { - cmp(reg_work_amount, vector_step); - jl(main_loop_end_label, T_NEAR); - - uni_vmovdqu(vmm_index, ptr[reg_index]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_valTL, ptr[reg_src + vmm_index], vmm_mask); - - uni_vmovdqu(vmm_index, ptr[reg_index + index_stride]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_valTR, ptr[reg_src + vmm_index], vmm_mask); - - load_weights(reg_src_aux, vmm_weightL, vector_step); - load_weights(reg_src_aux, vmm_weightR, vector_step, weight_stride); - - // progressive manner - if (jcp_.spatial_dim_size == 1) { - linear_onnx_worker_1d(); - } - if (jcp_.spatial_dim_size > 1) { - uni_vmovdqu(vmm_index, ptr[reg_index + 2 * index_stride]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_valBL, ptr[reg_src + vmm_index], vmm_mask); - - uni_vmovdqu(vmm_index, ptr[reg_index + 3 * index_stride]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_valBR, ptr[reg_src + vmm_index], vmm_mask); - - load_weights(reg_src_aux, vmm_weightT, vector_step, 2 * weight_stride); - 
load_weights(reg_src_aux, vmm_weightB, vector_step, 3 * weight_stride); - - linear_onnx_worker_2d(); - } - if (jcp_.spatial_dim_size > 2) { - uni_vmovups(vmm_d_bias, vmm_valTR); // temporally save front result to temp_vmm - - // for end depth - uni_vmovdqu(vmm_index, ptr[reg_index + 4 * index_stride]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_valTL, ptr[reg_src + vmm_index], vmm_mask); - - uni_vmovdqu(vmm_index, ptr[reg_index + 5 * index_stride]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_valTR, ptr[reg_src + vmm_index], vmm_mask); - - uni_vmovdqu(vmm_index, ptr[reg_index + 6 * index_stride]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_valBL, ptr[reg_src + vmm_index], vmm_mask); - - uni_vmovdqu(vmm_index, ptr[reg_index + 7 * index_stride]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_valBR, ptr[reg_src + vmm_index], vmm_mask); - - linear_onnx_worker_2d(); - - load_weights(reg_src_aux, vmm_weightE, vector_step, 5 * weight_stride); - load_weights(reg_src_aux, vmm_weightF, vector_step, 4 * weight_stride); - - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight - } - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, true); // vmm_val is vmm_valTR, broadcase is true - } - store(vmm_valTR, reg_dst, vector_step); - - add(reg_dst, vector_step * jcp_.dst_data_size); - add(reg_src_aux, vector_step * sizeof(float)); - add(reg_index, vector_step * jcp_.indices_size); - sub(reg_work_amount, vector_step); - - jmp(main_loop_label, T_NEAR); - } - L(main_loop_end_label); - - L(tail_loop_label); - { - cmp(reg_work_amount, 1); - jl(tail_loop_end_label, T_NEAR); - - mov(reg_src_aux1, reg_src); - mov(reg_index_offset, dword[reg_index]); - add(reg_src_aux1, reg_index_offset); - load(reg_src_aux1, vmm_valTL, scalar_step); - - mov(reg_src_aux1, reg_src); - mov(reg_index_offset, dword[reg_index + index_stride]); - add(reg_src_aux1, reg_index_offset); - load(reg_src_aux1, vmm_valTR, scalar_step); - - load_weights(reg_src_aux, vmm_weightL, scalar_step, 0); - load_weights(reg_src_aux, vmm_weightR, scalar_step, weight_stride); - - if (jcp_.spatial_dim_size == 1) { - linear_onnx_worker_1d(); - } - if (jcp_.spatial_dim_size > 1) { - mov(reg_src_aux1, reg_src); - mov(reg_index_offset, dword[reg_index + 2 * index_stride]); - add(reg_src_aux1, reg_index_offset); - load(reg_src_aux1, vmm_valBL, scalar_step); - - mov(reg_src_aux1, reg_src); - mov(reg_index_offset, dword[reg_index + 3 * index_stride]); - add(reg_src_aux1, reg_index_offset); - load(reg_src_aux1, vmm_valBR, scalar_step); - - load_weights(reg_src_aux, vmm_weightT, scalar_step, 2 * weight_stride); - load_weights(reg_src_aux, vmm_weightB, scalar_step, 3 * weight_stride); - - linear_onnx_worker_2d(); - } - if (jcp_.spatial_dim_size > 2) { - uni_vmovups(vmm_d_bias, vmm_valTR); // save from front result to temp_vmm - - // for end depth - mov(reg_src_aux1, reg_src); - mov(reg_index_offset, dword[reg_index + 4 * index_stride]); - add(reg_src_aux1, reg_index_offset); - load(reg_src_aux1, vmm_valTL, scalar_step); - - mov(reg_src_aux1, reg_src); - mov(reg_index_offset, dword[reg_index + 5 * index_stride]); - add(reg_src_aux1, reg_index_offset); - load(reg_src_aux1, vmm_valTR, scalar_step); - - mov(reg_src_aux1, reg_src); - mov(reg_index_offset, dword[reg_index + 6 * index_stride]); - add(reg_src_aux1, reg_index_offset); - load(reg_src_aux1, 
vmm_valBL, scalar_step); - - mov(reg_src_aux1, reg_src); - mov(reg_index_offset, dword[reg_index + 7 * index_stride]); - add(reg_src_aux1, reg_index_offset); - load(reg_src_aux1, vmm_valBR, scalar_step); - - linear_onnx_worker_2d(); - - load_weights(reg_src_aux, vmm_weightE, scalar_step, 5 * weight_stride); - load_weights(reg_src_aux, vmm_weightF, scalar_step, 4 * weight_stride); - - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight - } - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, true); // process on vmm_val, vmm_val is vmm_valTR, and bc - } - store(vmm_valTR, reg_dst, scalar_step); - - add(reg_dst, scalar_step * jcp_.dst_data_size); - add(reg_src_aux, scalar_step * sizeof(float)); - add(reg_index, scalar_step * jcp_.indices_size); - sub(reg_work_amount, scalar_step); - - jmp(tail_loop_label, T_NEAR); - } - L(tail_loop_end_label); - } - - inline void linear_onnx_worker_1d() { - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightR); - uni_vfmadd231ps(vmm_valTR, vmm_valTL, vmm_weightL); - } - - // weightT * (srcTL * weightL + srcTR * weightR) + - // weightB * (srcBL * weightL + srcBR * weightR) - inline void linear_onnx_worker_2d() { - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightR); - uni_vmulps(vmm_valBR, vmm_valBR, vmm_weightR); - uni_vfmadd231ps(vmm_valTR, vmm_valTL, vmm_weightL); - uni_vfmadd231ps(vmm_valBR, vmm_valBL, vmm_weightL); - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightT); - uni_vfmadd231ps(vmm_valTR, vmm_valBR, vmm_weightB); - } - - void cubic_c_gathered() { - mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); - mov(reg_index, ptr[reg_params + GET_OFF(index)]); - mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - - // weight_ptr[0] point to weightX - mov(reg_src_aux1, ptr[reg_params + GET_OFF(weight_ptr[0])]); - uni_vbroadcastss(vmm_weightX0, ptr[reg_src_aux1]); - uni_vbroadcastss(vmm_weightX1, ptr[reg_src_aux1 + 1 * sizeof(float)]); - uni_vbroadcastss(vmm_weightX2, ptr[reg_src_aux1 + 2 * sizeof(float)]); - uni_vbroadcastss(vmm_weightX3, ptr[reg_src_aux1 + 3 * sizeof(float)]); - - // weight_ptr[1] point to weightY - mov(reg_src_aux1, ptr[reg_params + GET_OFF(weight_ptr[0]) + sizeof(size_t)]); - uni_vbroadcastss(vmm_weightY0, ptr[reg_src_aux1]); - uni_vbroadcastss(vmm_weightY1, ptr[reg_src_aux1 + 1 * sizeof(float)]); - uni_vbroadcastss(vmm_weightY2, ptr[reg_src_aux1 + 2 * sizeof(float)]); - uni_vbroadcastss(vmm_weightY3, ptr[reg_src_aux1 + 3 * sizeof(float)]); - - int blk = (isa == cpu::x64::sse41) ? 
(2 * vector_step) : vector_step; - - Xbyak::Label main_loop_label; - Xbyak::Label main_loop_end_label; - Xbyak::Label tail_loop_label; - Xbyak::Label tail_loop_end_label; - L(main_loop_label); - { - if (jcp_.layout == InterpolateLayoutType::by_channel) { - cmp(reg_work_amount, vector_step); - jl(main_loop_end_label, T_NEAR); - } else { - cmp(reg_work_amount, 1); - jl(tail_loop_end_label, T_NEAR); - } - - uni_vpxor(vmm_val, vmm_val, vmm_val); - - cubic_c_gathered_matrix(false); - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value to post_ops and store - add(reg_oc_off, vector_step * sizeof(float)); - } - store(vmm_val, reg_dst, vector_step); - - if ((isa == cpu::x64::sse41) && (jcp_.layout == InterpolateLayoutType::block)) { - // vmm is xmm here - add(reg_src, vector_step * jcp_.src_data_size); - add(reg_dst, vector_step * jcp_.dst_data_size); - - uni_vpxor(vmm_val, vmm_val, vmm_val); - - cubic_c_gathered_matrix(false); - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); - add(reg_oc_off, vector_step * sizeof(float)); // second vector_step for one blk - } - store(vmm_val, reg_dst, vector_step); - - sub(reg_src, vector_step * jcp_.src_data_size); - sub(reg_dst, vector_step * jcp_.dst_data_size); - } - if (jcp_.layout == InterpolateLayoutType::by_channel) { - int dst_stride = vector_step * jcp_.dst_data_size; - int src_stride = vector_step * jcp_.src_data_size; - add(reg_dst, dst_stride); - add(reg_src, src_stride); - sub(reg_work_amount, vector_step); // work_amount is c - } else { - int dst_stride = blk * jcp_.OW * jcp_.OH * jcp_.dst_data_size; - int src_stride = blk * jcp_.IW * jcp_.IH * jcp_.src_data_size; - add(reg_dst, dst_stride); - add(reg_src, src_stride); - sub(reg_work_amount, 1); // work_amount = div_up(c, blk), no tails - } - - jmp(main_loop_label, T_NEAR); - } - L(main_loop_end_label); - - // only for by_channel layout for tails. 
- L(tail_loop_label); - { - cmp(reg_work_amount, 1); - jl(tail_loop_end_label, T_NEAR); - - // store final computed value - uni_vpxor(vmm_val, vmm_val, vmm_val); - - cubic_c_gathered_matrix(true); - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value - add(reg_oc_off, scalar_step * sizeof(float)); - } - store(vmm_val, reg_dst, scalar_step); - - int dst_stride = scalar_step * jcp_.dst_data_size; - int src_stride = scalar_step * jcp_.src_data_size; - add(reg_dst, dst_stride); - add(reg_src, src_stride); - sub(reg_work_amount, scalar_step); // work_amount is c - - jmp(tail_loop_label, T_NEAR); - } - L(tail_loop_end_label); - } - - inline void cubic_c_gathered_matrix(bool is_scalar) { - // y0: (x0 * weightX0 + x1 * weightX1 + x2 * weightX2 + x3 * weightX3) * weightY0 - cubic_c_gathered_line(0, vmm_weightY0, is_scalar); - // y1 - cubic_c_gathered_line(4, vmm_weightY1, is_scalar); - // y2 - cubic_c_gathered_line(8, vmm_weightY2, is_scalar); - // y3 - cubic_c_gathered_line(12, vmm_weightY3, is_scalar); - } - - inline void cubic_c_gathered_line(int index_start, Vmm vmm_weight, bool is_scalar) { - uni_vpxor(vmm_dstX, vmm_dstX, vmm_dstX); - cubic_c_gathered_pixel(index_start, vmm_weightX0, is_scalar); - cubic_c_gathered_pixel(index_start + 1, vmm_weightX1, is_scalar); - cubic_c_gathered_pixel(index_start + 2, vmm_weightX2, is_scalar); - cubic_c_gathered_pixel(index_start + 3, vmm_weightX3, is_scalar); - uni_vfmadd231ps(vmm_val, vmm_dstX, vmm_weight); - } - - inline void cubic_c_gathered_pixel(int i, Vmm vmm_weight, bool is_scalar) { - mov(reg_src_aux, reg_src); - mov(reg_index_offset, dword[reg_index + i * jcp_.indices_size]); - add(reg_src_aux, reg_index_offset); - int step = is_scalar ? 1 : vlen / sizeof(float); - load(reg_src_aux, vmm_src, step); - uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weight); - } - - void cubic_planar() { - mov(reg_table, l_table_constant); - // src_ptr[2] for oh sequence, src_ptr[3] for ow sequence - mov(reg_tbl_y, ptr[reg_params + GET_OFF(src_ptr[0]) + 2 * sizeof(size_t)]); - mov(reg_tbl_x, ptr[reg_params + GET_OFF(src_ptr[0]) + 3 * sizeof(size_t)]); - uni_vmovdqu(vmm_one, cubic_planar_table_val(0)); - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - - mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); - // index_OW - mov(reg_index, ptr[reg_params + GET_OFF(index)]); - // index_OH from src_ptr[1] - Xbyak::Reg64 reg_index_y = reg_src_aux; - mov(reg_index_y, ptr[reg_params + GET_OFF(src_ptr[0]) + sizeof(size_t)]); - // weight_OW - Xbyak::Reg64 reg_weight_x = reg_src_aux1; - mov(reg_weight_x, ptr[reg_params + GET_OFF(weight_ptr[0])]); - // weight_OH - Xbyak::Reg64 reg_weight_y = reg_src_aux2; - mov(reg_weight_y, ptr[reg_params + GET_OFF(weight_ptr[0]) + sizeof(size_t)]); - mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - - int grid_len = 4; - - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 - // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 - // for 3th step(8): 16 17 18 19 20 21 22 23 - // y: 0 0 0 0 1 1 1 1 - // x: 16 17 18 19 0 1 2 3 - - Xbyak::Label main_loop_label; - Xbyak::Label main_loop_end_label; - Xbyak::Label tail_loop_label; - Xbyak::Label tail_loop_end_label; - L(main_loop_label); - { - cmp(reg_work_amount, vector_step); - jl(main_loop_end_label, T_NEAR); - - // vmm_tbl_y: (0 0 0 0 1 1 1 1 * index_size) --> (0 0 0 0 4 4 4 4) - uni_vmovdqu(vmm_tbl_y, ptr[reg_tbl_y]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - // vmm_index_in_y: 0 
0 0 0 2 2 2 2 - vpgatherdd(vmm_index_in_y, ptr[reg_index_y + vmm_tbl_y], vmm_mask); - - // use vmm_val temporally for value in reg_tbl_x: 16 17 18 19 0 1 2 3 - uni_vmovdqu(vmm_val, ptr[reg_tbl_x]); - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - // e.g. vmm_index_in_x: 32 34 36 38 0 2 4 6, now save src index. - vpgatherdd(vmm_index_in_x, ptr[reg_index + vmm_val], vmm_mask); - - // build weightX used in y0-y3 - // weight format: w0_0 w1_0 w2_0 w3_0 w0_1 w1_1 w2_1 w3_1 ... - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_weightX0, ptr[reg_weight_x + vmm_val * grid_len], vmm_mask); // 4 in vmm_val for weight_size, another 4 for grid_len - - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - // shift weight_size then gather second weight - vgatherdps(vmm_weightX1, ptr[reg_weight_x + sizeof(float) + (vmm_val * grid_len)], vmm_mask); - - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_weightX2, ptr[reg_weight_x + 2 * sizeof(float) + (vmm_val * grid_len)], vmm_mask); - - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_weightX3, ptr[reg_weight_x + 3 * sizeof(float) + (vmm_val * grid_len)], vmm_mask); - // vmm_val is now relieved and used for dst_value - - uni_vpxor(vmm_val, vmm_val, vmm_val); - // y0 - vpsubd(vmm_index_y_itr, vmm_index_in_y, vmm_one); - // crop to [0, IH - 1] - vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); - vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); - - // weight y0 - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_weightY, ptr[reg_weight_y + (vmm_tbl_y * grid_len)], vmm_mask); - cubic_planar_line(false); - - // y1 - // crop to [0, IH - 1] - vpminsd(vmm_index_y_itr, vmm_index_in_y, cubic_planar_table_val(1)); - vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); - // weight y1: shift weight_size - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_weightY, ptr[reg_weight_y + sizeof(float) + (vmm_tbl_y * grid_len)], vmm_mask); - cubic_planar_line(false); - - // y2 - vpaddd(vmm_index_y_itr, vmm_index_in_y, vmm_one); - // crop to [0, IH - 1] - vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); - vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); - // weight y2 - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_weightY, ptr[reg_weight_y + 2 * sizeof(float) + (vmm_tbl_y * grid_len)], vmm_mask); - cubic_planar_line(false); - - // y3 - vpaddd(vmm_index_y_itr, vmm_index_in_y, vmm_one); - vpaddd(vmm_index_y_itr, vmm_index_y_itr, vmm_one); - // crop to [0, IH - 1] - vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); - vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); - // weight y3 - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_weightY, ptr[reg_weight_y + 3 * sizeof(float) + (vmm_tbl_y * grid_len)], vmm_mask); - cubic_planar_line(false); - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, true); // oc_off is broadcast and always the same value for this channel - } - store(vmm_val, reg_dst, vector_step); - - add(reg_tbl_y, vector_step * sizeof(int)); // sizeof(int): sequence by dd() - add(reg_tbl_x, vector_step * sizeof(int)); - add(reg_dst, vector_step * jcp_.dst_data_size); - - sub(reg_work_amount, vector_step); - - jmp(main_loop_label, T_NEAR); - } - L(main_loop_end_label); - - L(tail_loop_label); - { - cmp(reg_work_amount, 1); - jl(tail_loop_end_label, T_NEAR); - - // get idx for input - uni_vmovss(Xmm(vmm_tbl_y.getIdx()), ptr[reg_tbl_y]); - gather_i32_indices(vmm_index_in_y, reg_index_y, 0, vmm_tbl_y, 1, 
Precision::I32, true); - - uni_vmovss(Xmm(vmm_val.getIdx()), ptr[reg_tbl_x]); - gather_i32_indices(vmm_index_in_x, reg_index, 0, vmm_val, 1, Precision::I32, true); - // gather weightX by input idx, used in y0-y3 - gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, Precision::FP32, true); - gather_i32_indices(vmm_weightX1, reg_weight_x, sizeof(float), vmm_val, grid_len, Precision::FP32, true); - gather_i32_indices(vmm_weightX2, reg_weight_x, 2 * sizeof(float), vmm_val, grid_len, Precision::FP32, true); - gather_i32_indices(vmm_weightX3, reg_weight_x, 3 * sizeof(float), vmm_val, grid_len, Precision::FP32, true); - // vmm_val is now relieved and used for dst_value - - uni_vpxor(vmm_val, vmm_val, vmm_val); - // y0 - vpsubd(vmm_index_y_itr, vmm_index_in_y, vmm_one); - // crop to [0, IH - 1] - vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); - vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); - - gather_i32_indices(vmm_weightY, reg_weight_y, 0, vmm_tbl_y, grid_len, Precision::FP32, true); - cubic_planar_line(true); - - // y1 - // crop to [0, IH - 1] - vpminsd(vmm_index_y_itr, vmm_index_in_y, cubic_planar_table_val(1)); - vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); - // weight y1: shift weight_size - gather_i32_indices(vmm_weightY, reg_weight_y, sizeof(float), vmm_tbl_y, grid_len, Precision::FP32, true); - cubic_planar_line(true); - - // y2 - vpaddd(vmm_index_y_itr, vmm_index_in_y, vmm_one); - // crop to [0, IH - 1] - vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); - vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); - // weight y2 - gather_i32_indices(vmm_weightY, reg_weight_y, 2 * sizeof(float), vmm_tbl_y, grid_len, Precision::FP32, true); - cubic_planar_line(true); - - // y3 - vpaddd(vmm_index_y_itr, vmm_index_in_y, vmm_one); - vpaddd(vmm_index_y_itr, vmm_index_y_itr, vmm_one); - // crop to [0, IH - 1] - vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); - vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); - // weight y3 - gather_i32_indices(vmm_weightY, reg_weight_y, 3 * sizeof(float), vmm_tbl_y, grid_len, Precision::FP32, true); - cubic_planar_line(true); - - if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, true); // oc_off is broadcast and always the same value for this channel - } - store(vmm_val, reg_dst, scalar_step); - - add(reg_tbl_y, scalar_step * sizeof(int)); // sizeof(int): sequence with dd() - add(reg_tbl_x, scalar_step * sizeof(int)); - add(reg_dst, scalar_step * jcp_.dst_data_size); - - sub(reg_work_amount, scalar_step); - - jmp(tail_loop_label, T_NEAR); - } - L(tail_loop_end_label); - } - - inline void cubic_planar_line(bool is_scalar) { - uni_vpxor(vmm_dstX, vmm_dstX, vmm_dstX); - cubic_planar_pixel(0, is_scalar); - cubic_planar_pixel(1, is_scalar); - cubic_planar_pixel(2, is_scalar); - cubic_planar_pixel(3, is_scalar); - uni_vfmadd231ps(vmm_val, vmm_dstX, vmm_weightY); - } - - inline void cubic_planar_pixel(int itr, bool is_scalar) { - // vmm_index_in_x have index for src - if (itr == 0) { - vpsubd(vmm_index_x_itr, vmm_index_in_x, vmm_one); - } else if (itr == 1) { - vpaddd(vmm_index_x_itr, vmm_index_in_x, vmm_zero); - } else if (itr == 2) { - vpaddd(vmm_index_x_itr, vmm_index_in_x, vmm_one); - } else if (itr == 3) { - vpaddd(vmm_index_x_itr, vmm_index_in_x, vmm_one); - vpaddd(vmm_index_x_itr, vmm_index_x_itr, vmm_one); - } - - // crop to [0, IW - 1] - vpminsd(vmm_index_x_itr, vmm_index_x_itr, cubic_planar_table_val(2)); - vpmaxsd(vmm_index_x_itr, vmm_index_x_itr, 
vmm_zero); - - // value - // index is: ptr[reg_src + (vmm_index_y_itr * jcp_.IW + vmm_index_x_itr) * jcp_.src_data_size] - uni_vmovdqu(vmm_mask, cubic_planar_table_val(2)); - vpaddd(vmm_mask, vmm_mask, vmm_one); // (IW - 1) + 1 = IW - uni_vpmulld(vmm_mask, vmm_mask, vmm_index_y_itr); - uni_vpaddd(vmm_index_x_itr, vmm_index_x_itr, vmm_mask); - gather_i32_indices(vmm_src, reg_src, 0, vmm_index_x_itr, jcp_.src_data_size, Precision::FP32, is_scalar); - - if (itr == 0) { - uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX0); - } else if (itr == 1) { - uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX1); - } else if (itr == 2) { - uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX2); - } else if (itr == 3) { - uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX3); - } - } - - inline void prepare_cubic_planar_table() { - auto broadcast_int = [&](int val) { - for (size_t d = 0; d < vlen / sizeof(int); ++d) { - dd(val); - } - }; - - align(64); - L(l_table_constant); - broadcast_int(vals_for_cubic_planar.int_one); - broadcast_int(jcp_.IH - 1); - broadcast_int(jcp_.IW - 1); - dd(vals_for_cubic_planar.mask_gather_avx512); - } - - struct vals_for_cubic_planar_type { - int int_one = 0x00000001; - int mask_gather_avx512 = 0x0000ffff; // 00000000000000001111111111111111 - } vals_for_cubic_planar; - - inline Xbyak::Address cubic_planar_table_val(int index) { - return ptr[reg_table + index * vlen]; - } - - // always gather to Vmm, compute with Vmm, store with Xmm if scalar_step - inline void gather_i32_indices(Vmm vmm_src, const Xbyak::Reg64 &base, int offset, Vmm vmm_indices, int scale, - Precision src_prc, bool is_scalar) { - Xbyak::Address table_idx = ptr[base + offset + vmm_indices * scale]; - if ((isa == cpu::x64::avx512_core) && !is_scalar) { - // [0-15] bit of int to mask - kmovw(k_mask, cubic_planar_table_val(3)); - if (src_prc == Precision::FP32) { - vgatherdps(vmm_src | k_mask, table_idx); // dword index, packed single data - } else if (src_prc == Precision::I32) { - vpgatherdd(vmm_src | k_mask, table_idx); // dword index, dword data - } - } else if ((isa == cpu::x64::avx2) && !is_scalar) { - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - if (src_prc == Precision::FP32) { - vgatherdps(vmm_src, table_idx, vmm_mask); - } else if (src_prc == Precision::I32) { - vpgatherdd(vmm_src, table_idx, vmm_mask); - } - } else { - const int gpr_size = 8; - sub(rsp, gpr_size); - // move content in register to content in address(ptr[]) - mov(ptr[rsp], reg_tmp_64); - - // replace index with value in stack - sub(rsp, vlen); - uni_vmovdqu(ptr[rsp], vmm_indices); - - int repeats = is_scalar ? 1 : vlen / sizeof(float); - for (size_t i = 0; i < repeats; ++i) { - mov(reg_tmp_64.cvt32(), ptr[rsp + i * sizeof(int)]); // sizeof(int) index_size - table_idx = ptr[base + offset + reg_tmp_64 * scale]; // scale: sizeof(float) value_size - mov(reg_tmp_64.cvt32(), table_idx); - mov(ptr[rsp + i * sizeof(int)], reg_tmp_64.cvt32()); - } - - uni_vmovups(vmm_src, ptr[rsp]); - add(rsp, vlen); - // restore GPR state - mov(reg_tmp_64, ptr[rsp]); - add(rsp, gpr_size); - } - } - - // is_broadcast for broadcasting param for depth_wise and quantize(channel-sensitive post-ops), for fusion with plain layout. 
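// Illustrative sketch (not part of the patch): the per-element semantics of
// gather_i32_indices above. Without a hardware gather (SSE4.1 path) the kernel
// spills the index vector to the stack and performs the equivalent of this
// scalar loop; the AVX2/AVX-512 paths do the same via vgatherdps/vpgatherdd.
// Names are hypothetical.
#include <cstdint>
#include <cstring>

// dst[i] = 32-bit value at (base + offset + indices[i] * scale).
inline void gather_f32_ref(float* dst, const uint8_t* base, int offset,
                           const int32_t* indices, int scale, int count) {
    for (int i = 0; i < count; ++i)
        std::memcpy(&dst[i], base + offset + indices[i] * scale, sizeof(float));
}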
- void apply_post_ops(Precision dst_prc, bool is_broadcast) { - const auto &p = attr_.post_ops_; - int eltwise_inj_idx = 0; - int depthwise_inj_idx = 0; - int quantization_inj_idx = 0; - int post_ops_data_offset = 0; - for (int i = 0; i < p.len(); i++) { - auto& post_op = p.entry_[i]; - if (post_op.is_eltwise()) { - eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1); - eltwise_inj_idx++; - } else if (post_op.is_depthwise()) { - mov(reg_d_weights, ptr[reg_post_ops_data + post_ops_data_offset]); - add(reg_d_weights, reg_oc_off); - - // weight and bias is padded. scalar as vector. - depthwise_injectors[depthwise_inj_idx]->compute_vector_range( - vmm_val.getIdx(), vmm_val.getIdx() + 1, reg_d_weights, reg_d_weights, is_broadcast); - - depthwise_inj_idx++; - post_ops_data_offset += depthwise_injectors[depthwise_inj_idx]->memoryStep(); - } else if (post_op.is_quantization()) { - bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; - bool do_rounding = do_dequantization || dst_prc == Precision::FP32 || i != p.len() - 1; - - int s_idx = vmm_val.getIdx(); - - quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0, 0, is_broadcast); - - quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, 0, is_broadcast); - - if (do_dequantization) { - quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0, 0, is_broadcast); - } - - post_ops_data_offset += quantization_injectors[quantization_inj_idx]->memoryStep(); - quantization_inj_idx++; - } - } - } -}; - namespace { struct InterpolateKey { - Interpolate::InterpolateAttrs nodeAttrs; + InterpolateAttrs nodeAttrs; VectorDims srcDims; VectorDims dstDims; - std::vector dataScales; dnnl::primitive_attr attr; size_t hash() const; @@ -1398,7 +63,7 @@ size_t InterpolateKey::hash() const { seed = get_vector_hash(seed, srcDims); seed = get_vector_hash(seed, dstDims); - seed = get_vector_hash(seed, dataScales); + seed = get_vector_hash(seed, nodeAttrs.dataScales); seed = hash_combine(seed, get_attr_hash(*attr.get())); return seed; @@ -1432,7 +97,7 @@ bool InterpolateKey::operator==(const InterpolateKey &rhs) const { return false; if (dstDims != rhs.dstDims) return false; - if (dataScales != rhs.dataScales) + if (nodeAttrs.dataScales != rhs.nodeAttrs.dataScales) return false; if (!(*attr.get() == *rhs.attr.get())) return false; @@ -1442,41 +107,6 @@ bool InterpolateKey::operator==(const InterpolateKey &rhs) const { } // namespace -// shapeND: n c d h w -// blockND: ncdhw cdhw dhw hw w 1 -// index : 0 1 2 3 4 5 -inline SizeVector getBlockND(const SizeVector& shape) { - int shapeRank = shape.size(); - SizeVector blockND(shapeRank + 1, 1); - for (int i = shapeRank - 1; i >= 0; i--) { - blockND[i] = shape[i] * blockND[i+1]; - } - return blockND; -} -// w/hw/ncw/nchw/ncdhw to ncdhw -inline SizeVector to5Dim(SizeVector casesDim) { - size_t caseSize = casesDim.size(); - SizeVector dim5(5, 1lu); - dim5[4] = casesDim[caseSize - 1]; - if (caseSize > 1) { - dim5[3] = casesDim[caseSize - 2]; - } - if (caseSize > 
2) { - dim5[0] = casesDim[0]; - } - if (caseSize > 3) { - dim5[1] = casesDim[1]; - } - if (caseSize > 4) { - dim5[2] = casesDim[2]; - } - if (caseSize == 3) { // nhw -> ncw - dim5[1] = dim5[3]; - dim5[3] = 1lu; - } - return dim5; -} - using ngInterpMode = ngraph::opset4::Interpolate::InterpolateMode; using ngInterpCoordTransf = ngraph::opset4::Interpolate::CoordinateTransformMode; using ngInterpNearMode = ngraph::opset4::Interpolate::NearestMode; @@ -1702,18 +332,18 @@ void Interpolate::getSupportedDescriptors() { // get pad for (int i = 0; i < interpAttrs.padBegin.size(); i++) { if (interpAttrs.padBegin[i] != 0) { - hasPad = true; + interpAttrs.hasPad = true; break; } } for (int i = 0; i < interpAttrs.padEnd.size(); i++) { if (interpAttrs.padEnd[i] != 0) { - hasPad = true; + interpAttrs.hasPad = true; break; } } //correct pad - if (hasPad) { + if (interpAttrs.hasPad) { auto correctPad = [&](std::vector pad, int rank) { int padLen = pad.size(); if (padLen == rank) { @@ -1778,7 +408,22 @@ void Interpolate::initSupportedPrimitiveDescriptors() { config.inConfs[AXES_ID].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(axesType, getInputShapeAtPort(AXES_ID))); config.outConfs[0].setMemDesc(creatorsMap.at(dataFormat)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0))); - supportedPrimitiveDescriptors.push_back({config, implDetail}); +//#if defined(OPENVINO_ARCH_X86_64) +// supportedPrimitiveDescriptors.push_back({config, impl_type}); +//#else + std::vector srcMemoryDescs; + for (int i = 0; i < config.inConfs.size(); i++) { + srcMemoryDescs.push_back(config.inConfs[i].getMemDesc()); + } + std::vector dstMemoryDescs; + for (int i = 0; i < config.outConfs.size(); i++) { + dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()); + } + + auto factory = std::make_shared(interpAttrs, srcMemoryDescs, dstMemoryDescs, + std::make_shared(context, getPrimitivesPriority())); + supportedPrimitiveDescriptors.push_back({config, implDetail, factory}); +//#endif }; const auto &dataMinDims = getInputShapeAtPort(DATA_ID).getMinDims(); @@ -1857,7 +502,7 @@ bool Interpolate::needPrepareParams() const { return (inputShapesModified() || lastOutputDims != getChildEdgesAtPort(0)[0]->getMemory().getStaticDims()); } -void Interpolate::prepareParams() { +void Interpolate::prepareParams() {\ if (!shapesDefined()) { IE_THROW() << "Can't prepare params for Interpolate node with name: " << getName() << ", because input/output dims aren't defined"; } @@ -1892,38 +537,45 @@ void Interpolate::prepareParams() { scales.assign(scalesData, scalesData + scalesMem.getStaticDims()[0]); } - std::vector dataScales = getScales(getPaddedInputShape(srcDims, interpAttrs.padBegin, interpAttrs.padEnd), dstDims); + auto dataScales = getScales(getPaddedInputShape(srcDims, interpAttrs.padBegin, interpAttrs.padEnd), dstDims); + interpAttrs.dataScales = dataScales; if (getOutputShapeAtPort(0).getRank() > 2 && (dataScales[0] != 1.f || dataScales[1] != 1.f)) { IE_THROW() << "Interpolate layer only supports resize on spatial dimensions(depth, height and width)"; } - - InterpolateKey key = {interpAttrs, srcDims, dstDims, dataScales, dnnl::primitive_attr()}; + InterpolateKey key = {interpAttrs, srcDims, dstDims, dnnl::primitive_attr()}; setPostOps(key.attr, dstDims); - - auto buildExecutor = [&](const InterpolateKey& key) -> std::shared_ptr { - std::shared_ptr executor; - if ((key.nodeAttrs.mode == InterpolateMode::nearest || key.nodeAttrs.mode == InterpolateMode::linear_onnx || - key.nodeAttrs.mode == InterpolateMode::cubic) && - 
((key.nodeAttrs.layout != InterpolateLayoutType::planar && mayiuse(cpu::x64::sse41)) || - (mayiuse(cpu::x64::avx2) && key.nodeAttrs.inPrc == Precision::FP32))) { - executor = std::make_shared(key.nodeAttrs, - key.srcDims, - key.dstDims, - key.dataScales, - key.attr); - } else { - executor = std::make_shared(key.nodeAttrs, - key.srcDims, - key.dstDims, - key.dataScales); - } - return executor; - }; - - auto cache = context->getParamsCache(); - auto result = cache->getOrCreate(key, buildExecutor); - execPtr = result.first; - +//#if defined(OPENVINO_ARCH_X86_64) +// auto buildExecutor = [&](const InterpolateKey& key) -> std::shared_ptr { +// std::shared_ptr executor; +// if ((key.nodeAttrs.mode == InterpolateMode::nearest || key.nodeAttrs.mode == InterpolateMode::linear_onnx || +// key.nodeAttrs.mode == InterpolateMode::cubic) && +// ((key.nodeAttrs.layout != InterpolateLayoutType::planar && mayiuse(cpu::x64::sse41)) || +// (mayiuse(cpu::x64::avx2) && key.nodeAttrs.inPrc == Precision::FP32))) { +// executor = std::make_shared(key.nodeAttrs, +// key.srcDims, +// key.dstDims, +// key.nodeAttrs.dataScales, +// key.attr); +// } +// return executor; +// }; +// +// auto cache = context->getParamsCache(); +// auto result = cache->getOrCreate(key, buildExecutor); +// execPtr = result.first; +//#else + std::vector srcMemoryDescs; + for (int i = 0; i < getOriginalInputsNumber(); i++) { + srcMemoryDescs.push_back(getParentEdgeAt(i)->getMemoryPtr()->getDescPtr()); + } + std::vector dstMemoryDescs; + for (int i = 0; i < getOriginalOutputsNumber(); i++) { + dstMemoryDescs.push_back(getChildEdgeAt(i)->getMemoryPtr()->getDescPtr()); + } + auto selectedPD = getSelectedPrimitiveDescriptor(); + execPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(interpAttrs, srcMemoryDescs, dstMemoryDescs, key.attr); + selectedPD->setImplementationType(execPtr->getImplType()); +//#endif lastOutputDims = dstDims; } @@ -1954,10 +606,6 @@ void Interpolate::createPrimitive() { } } -inline int clipCoord(int pos, int length) { - return std::max(static_cast(0), std::min(pos, length - 1)); -} - static inline float triangleCoeff(float x) { return (std::max)(0.0f, 1 - std::abs(x)); } @@ -1985,17 +633,6 @@ void Interpolate::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims) attr.set_post_ops(ops); } -SizeVector Interpolate::getPaddedInputShape(const VectorDims &srcDims, - const std::vector &padBegin, - const std::vector &padEnd) { - SizeVector paddedShape; - int dataRank = srcDims.size(); - for (int i = 0; i < dataRank; i++) { - paddedShape.push_back(srcDims[i] + padBegin[i] + padEnd[i]); - } - return paddedShape; -} - // get scales of data rank size // if "scale" version: set scales with input scales, 1.f for other dims not in axis // if "size" version: scales = shape[target] / shape[input].pad, 1.f for other dims not in axis @@ -2016,1240 +653,9 @@ void Interpolate::execute(dnnl::stream strm) { if (!execPtr) { IE_THROW() << "Can't execute Interpolate node. 
Primitive didn't created"; } - auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); auto &srcMemPtr = getParentEdgeAt(DATA_ID)->getMemoryPtr(); - - uint8_t *dst_data = reinterpret_cast(dstMemPtr->GetPtr()); - const uint8_t *src_data_origin = reinterpret_cast(srcMemPtr->GetData()); - - const auto &srcDim = srcMemPtr->getStaticDims(); - const auto &dstDim = dstMemPtr->getStaticDims(); - size_t dimSize = srcDim.size(); - auto srcDimPad = execPtr->getSrcDimPad5d(); - - const auto srcDim5d = to5Dim(srcDim); - const auto srcDimPad5d = to5Dim(srcDimPad); - const auto dstDim5d = to5Dim(dstDim); - const auto srcDataSize = srcMemPtr->getDesc().getPrecision().size(); - - const uint8_t *src_data = nullptr; - std::vector srcPadded; - if (hasPad) { - int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0; - int padB1 = (dimSize > 2) ? interpAttrs.padBegin[1] : 0; - int padB2 = (dimSize == 5) ? interpAttrs.padBegin[dimSize - 3] : 0; - int padB3 = interpAttrs.padBegin[dimSize - 2]; - int padB4 = interpAttrs.padBegin[dimSize - 1]; - - SizeVector inShapeBlock = getBlockND(srcDim5d); - SizeVector inShapePadBlock = getBlockND(srcDimPad5d); - - if (interpAttrs.layout == InterpolateLayoutType::planar) { - srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); - parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) { - const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + - inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize; - cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); - }); - src_data = src_data_pad; - } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) { - srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); - parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) { - const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + - (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) + - inShapePadBlock[4] * (h + padB3) + inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize; - cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); - }); - src_data = src_data_pad; - } else if (interpAttrs.layout == InterpolateLayoutType::block) { - size_t blkSize = mayiuse(cpu::x64::avx512_core) ? 
16 : 8; - size_t CB = div_up(srcDimPad5d[1], blkSize); - size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize; - srcPadded.resize(eltsTotal * srcDataSize, 0x0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); - if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) { - IE_THROW() << "Interpolate layer with name '" << getName() << - "' does not support padding on batch and channel dimensions"; - } - parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) { - const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (h * srcDim5d[4] * blkSize) * srcDataSize - + (w * blkSize) * srcDataSize; - uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize - + ((w + padB4) * blkSize) * srcDataSize; - cpu_memcpy(srcPad, src, blkSize * srcDataSize); - }); - src_data = src_data_pad; - } - } else { - src_data = src_data_origin; - } - - execPtr->exec(src_data, dst_data, postOpsDataPtrs.data()); -} - -// for ndhwc and nCdhw8c[16c] -// input may be f32/bf16/int8, fused->output varies -void Interpolate::InterpolateJitExecutor::NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { - int *index_d = static_cast(&indexTable[0]); - int *index_h = static_cast(&indexTable[OD]); - int *index_w = static_cast(&indexTable[OD + OH]); - - bool is_nhwc = (configured_for_layout == by_channel); - - for (int b = 0; b < B; b++) { - if (is_nhwc) { - const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * C * b) * srcDataSize; - uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * C * b) * dstDataSize; - std::vector index_w_kernel(OW); - for (int ox = 0; ox < OW; ox++) { - index_w_kernel[ox] = index_w[ox] * C * srcDataSize; - } - parallel_for2d(OD, OH, [&](size_t d, size_t h) { - // kernel for C * OW - uint8_t *out_ptr_dh = out_ptr + (C * OW * OH * d + C * OW * h) * dstDataSize; - const uint8_t *in_ptr_dh = in_ptr + (C * IW * IH * index_d[d] + C * IW * index_h[h]) * srcDataSize; - auto arg = jit_interpolate_call_args(); - arg.dst = out_ptr_dh; - arg.src_ptr[0] = in_ptr_dh; - arg.index = static_cast(&(index_w_kernel[0])); - arg.work_amount = C; - arg.oc_off = 0; - arg.post_op_data = post_ops_data_; - (*interpolateKernel)(&arg); - }); - } else { // for blk - int blk_size = mayiuse(cpu::x64::avx512_core) ? 
16 : 8; - int CB = div_up(C, blk_size); - const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * CB * blk_size * b) * srcDataSize; - uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * CB * blk_size * b) * dstDataSize; - std::vector index_w_kernel(OW); - for (int ox = 0; ox < OW; ox++) { - index_w_kernel[ox] = index_w[ox] * blk_size * srcDataSize; - } - parallel_for2d(CB, OD, [&](size_t cb, size_t d) { - uint8_t *out_ptr_cbd = out_ptr + (blk_size * OW * OH * OD * cb + blk_size * OW * OH * d) * dstDataSize; - const uint8_t *in_ptr_cbd = in_ptr + (blk_size * IW * IH * ID * cb + blk_size * IW * IH * index_d[d]) * srcDataSize; - auto arg = jit_interpolate_call_args(); - for (int h = 0; h < OH; h++) { // kernel for blk_size * OW - arg.dst = out_ptr_cbd + blk_size * OW * h * dstDataSize; - arg.src_ptr[0] = in_ptr_cbd + blk_size * IW * index_h[h] * srcDataSize; - arg.index = static_cast(&(index_w_kernel[0])); - arg.work_amount = static_cast(OW); - arg.oc_off = cb * blk_size * sizeof(float); - arg.post_op_data = post_ops_data_; - (*interpolateKernel)(&arg); - } - }); - } - } // batch end -} - -void Interpolate::InterpolateJitExecutor::NNPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { - int *index_d = static_cast(&indexTable[0]); - int *index_h = static_cast(&indexTable[OD]); - int *index_w = static_cast(&indexTable[OD + OH]); - - std::vector index_kernel(OH + OW); - // index_h * IW * srcDataSize to reduce and simplify redundant compute - for (int oh = 0; oh < OH; oh++) { - index_kernel[oh] = index_h[oh] * IW * srcDataSize; - } - // index_w * srcDataSize - for (int ow = 0; ow < OW; ow++) { - index_kernel[OH + ow] = index_w[ow] * srcDataSize; - } - - parallel_for3d(B, C, OD, [&](size_t b, size_t c, size_t od) { - const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]) * srcDataSize; - uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od) * dstDataSize; - - auto arg = jit_interpolate_call_args(); - arg.src_ptr[0] = in_ptr; - arg.dst = out_ptr; - arg.index = static_cast(&index_kernel[0]); // need index_h and index_w in kernel, it's in continous memory so one param - arg.oc_off = static_cast(c * sizeof(float)); - // work_amount is OH(out loop) and OW(inner loop), can get in kernel from jcp. - arg.post_op_data = post_ops_data_; - (*interpolateKernel)(&arg); - }); -} - -void Interpolate::InterpolateJitExecutor::linearOnnxPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, int B, int C, - int ID, int IH, int IW, int OD, int OH, int OW) { - // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7 - // weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 - int *index = static_cast(&indexTable[0]); - int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 
4 : 2); - int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16); - float *weight = reinterpret_cast(&indexTable[scratchLen]); - - parallel_for2d(B, C, [&](size_t b, size_t c) { - uint8_t *out_ptr_nc = out_ptr_ + (OH * OW * OD * C * b + OH * OW * OD * c) * dstDataSize; - const uint8_t *in_ptr_nc = in_ptr_ + (IH * IW * ID * C * b + IH * IW * ID * c) * srcDataSize; - auto arg = jit_interpolate_call_args(); - arg.src_ptr[0] = in_ptr_nc; - arg.index = static_cast(&index[0]); - arg.weight_ptr[0] = static_cast(&weight[0]); - arg.dst = out_ptr_nc; - arg.work_amount = OW * OH * OD; - arg.oc_off = static_cast(c * sizeof(float)); - arg.post_op_data = post_ops_data_; - (*interpolateKernel)(&arg); - }); -} - -void Interpolate::InterpolateJitExecutor::linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { - // left:OW right:OW Top:OH Bottom:OH Front:OD End:OD - std::vector indexPtr(MAX_INPUT_INTERPOLATE, 0); - std::vector weightPtr(MAX_INPUT_INTERPOLATE, 0); - size_t scratchLen = rnd_up(OW + OW + OH + OH + OD + OD, 16); - indexPtr[0] = static_cast(&indexTable[0]); - indexPtr[1] = static_cast(&indexTable[OW]); - indexPtr[2] = static_cast(&indexTable[2 * OW]); - indexPtr[3] = static_cast(&indexTable[2 * OW + OH]); - indexPtr[4] = static_cast(&indexTable[2 * OW + 2 * OH]); - indexPtr[5] = static_cast(&indexTable[2 * OW + 2 * OH + OD]); - - weightPtr[0] = reinterpret_cast(&indexTable[scratchLen]); - weightPtr[1] = reinterpret_cast(&indexTable[scratchLen + OW]); - weightPtr[2] = reinterpret_cast(&indexTable[scratchLen + 2 * OW]); - weightPtr[3] = reinterpret_cast(&indexTable[scratchLen + 2 * OW + OH]); - weightPtr[4] = reinterpret_cast(&indexTable[scratchLen + 2 * OW + 2 * OH]); - weightPtr[5] = reinterpret_cast(&indexTable[scratchLen + 2 * OW + 2 * OH + OD]); - - bool isByChannel = (configured_for_layout == by_channel) ? true : false; - - int blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8; - int CB = isByChannel ? 1 : div_up(C, blkSize); - int CGatherLen = isByChannel ? C : blkSize; - int workAmount = isByChannel ? 
C : CB; - // n_CB(1)_d_h_w_8[16](c), () for by-channel - int C0 = OW * CGatherLen; - int C1 = OH * C0; - int C2 = OD * C1; - int C3 = CB * C2; - int I0 = IW * CGatherLen; - int I1 = IH * I0; - int I2 = ID * I1; - int I3 = CB * I2; - parallel_for3d(B, OD, OH, [&](size_t b, size_t d, size_t h) { - uint8_t *out_ptr_ndh = out_ptr_ + (C3 * b + C1 * d + C0 * h) * dstDataSize; - - const uint8_t *in_ptr_n = in_ptr_ + (I3 * b) * srcDataSize; - const uint8_t *in_ptr_nf = in_ptr_n + (indexPtr[4][d] * I1) * srcDataSize; - const uint8_t *in_ptr_nft = in_ptr_nf + (indexPtr[2][h] * I0) * srcDataSize; - const uint8_t *in_ptr_nfb = in_ptr_nf + (indexPtr[3][h] * I0) * srcDataSize; - const uint8_t *in_ptr_ne = in_ptr_n + (indexPtr[5][d] * I1) * srcDataSize; - const uint8_t *in_ptr_net = in_ptr_ne + (indexPtr[2][h] * I0) * srcDataSize; - const uint8_t *in_ptr_neb = in_ptr_ne + (indexPtr[3][h] * I0) * srcDataSize; - auto arg = jit_interpolate_call_args(); - for (int w = 0; w < OW; ++w) { - uint8_t *out_ptr_ndhw = out_ptr_ndh + CGatherLen * w * dstDataSize; - - arg.src_ptr[0] = in_ptr_nft + (indexPtr[0][w] * CGatherLen) * srcDataSize; - arg.src_ptr[1] = in_ptr_nft + (indexPtr[1][w] * CGatherLen) * srcDataSize; - arg.src_ptr[2] = in_ptr_nfb + (indexPtr[0][w] * CGatherLen) * srcDataSize; - arg.src_ptr[3] = in_ptr_nfb + (indexPtr[1][w] * CGatherLen) * srcDataSize; - arg.src_ptr[4] = in_ptr_net + (indexPtr[0][w] * CGatherLen) * srcDataSize; - arg.src_ptr[5] = in_ptr_net + (indexPtr[1][w] * CGatherLen) * srcDataSize; - arg.src_ptr[6] = in_ptr_neb + (indexPtr[0][w] * CGatherLen) * srcDataSize; - arg.src_ptr[7] = in_ptr_neb + (indexPtr[1][w] * CGatherLen) * srcDataSize; - arg.weight_ptr[0] = static_cast(&weightPtr[0][w]); - arg.weight_ptr[1] = static_cast(&weightPtr[1][w]); - arg.weight_ptr[2] = static_cast(&weightPtr[2][h]); - arg.weight_ptr[3] = static_cast(&weightPtr[3][h]); - arg.weight_ptr[4] = static_cast(&weightPtr[4][d]); - arg.weight_ptr[5] = static_cast(&weightPtr[5][d]); - arg.dst = out_ptr_ndhw; - arg.work_amount = workAmount; - arg.oc_off = 0; - arg.post_op_data = post_ops_data_; - (*interpolateKernel)(&arg); - } - }); -} - -void Interpolate::InterpolateJitExecutor::cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW) { - const int idxNum = 1; - int *xOrigin = static_cast(&indexTable[0]); - float *xFactor = reinterpret_cast(&indexTable[OW]); - int *yOrigin = static_cast(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW]); - float *yFactor = reinterpret_cast(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); - - int blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8; - int CB = div_up(C, blkSize); - int CSize = configured_for_layout == InterpolateLayoutType::by_channel ? C : blkSize * CB; - int CGatherLen = configured_for_layout == InterpolateLayoutType::by_channel ? C : blkSize; - int workAmount = configured_for_layout == InterpolateLayoutType::by_channel ? 
C : CB; - - parallel_for3d(B, OH, OW, [&](size_t b, size_t h, size_t w) { - uint8_t *out_ptr_nhw = out_ptr_ + (OH * OW * CSize * b + OW * CGatherLen * h + CGatherLen * w) * dstDataSize; - const uint8_t *in_ptr_n = in_ptr_ + (IH * IW * CSize * b) * srcDataSize; - - std::vector kernelIndex(CUBIC_GRID_LEN * CUBIC_GRID_LEN); // 16 address offset to src(batch) or src(CB) - int iy = yOrigin[h]; - int ix = xOrigin[w]; - for (int y = iy - 1, i = 0; y <= iy + 2; y++, i++) { - int yInRange = std::max(0, std::min(y, IH - 1)); - yInRange = yInRange * CGatherLen * IW * srcDataSize; - for (int x = ix - 1, j = 0; x <= ix + 2; x++, j++) { - int xInRange = std::max(0, std::min(x, IW - 1)); - xInRange = yInRange + xInRange * CGatherLen * srcDataSize; - kernelIndex[i * CUBIC_GRID_LEN + j] = xInRange; - } - } - auto arg = jit_interpolate_call_args(); - arg.dst = out_ptr_nhw; - arg.src_ptr[0] = in_ptr_n; - arg.index = static_cast(&kernelIndex[0]); - // 0 for weight_W, 1 for weight_H - arg.weight_ptr[0] = static_cast(&xFactor[w * CUBIC_GRID_LEN]); - arg.weight_ptr[1] = static_cast(&yFactor[h * CUBIC_GRID_LEN]); - - // for by channel, src + step, dst + step, process next step on continuous memory - // for blk, src + IW*IH*blkSize, dst + OW*OH*blkSize, process the blkSize on next CB - arg.work_amount = workAmount; - arg.oc_off = 0; - arg.post_op_data = post_ops_data_; - (*interpolateKernel)(&arg); - }); -} - -void Interpolate::InterpolateJitExecutor::cubicPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW) { - int tblAdvance = 0; - int *xOrigin = static_cast(&indexTable[tblAdvance]); - tblAdvance += OW; - float *xFactor = reinterpret_cast(&indexTable[tblAdvance]); - tblAdvance += CUBIC_GRID_LEN * OW; - int *yOrigin = static_cast(&indexTable[tblAdvance]); - tblAdvance += OH; - float *yFactor = reinterpret_cast(&indexTable[tblAdvance]); - - tblAdvance += CUBIC_GRID_LEN * OH; - int *sequenceOH = static_cast(&indexTable[tblAdvance]); - tblAdvance += OW * OH; - int *sequenceOW = static_cast(&indexTable[tblAdvance]); - - parallel_for2d(B, C, [&](size_t n, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * C * n + IW * IH * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * C * n + OW * OH * c) * dstDataSize; - - auto arg = jit_interpolate_call_args(); - arg.dst = out_ptr_nc; - arg.src_ptr[0] = in_ptr_nc; - arg.index = xOrigin; - arg.src_ptr[1] = yOrigin; - arg.src_ptr[2] = static_cast(&sequenceOH[0]); - arg.src_ptr[3] = static_cast(&sequenceOW[0]); - arg.weight_ptr[0] = xFactor; - arg.weight_ptr[1] = yFactor; - arg.work_amount = static_cast(OW * OH); - arg.oc_off = static_cast(c * sizeof(float)); - arg.post_op_data = post_ops_data_; - (*interpolateKernel)(&arg); - }); -} - -// ===================================================================================================================== -// index layout: -// d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1 -void Interpolate::InterpolateExecutor::buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) { - const int dimSize = dataRank; - float fz = (dimSize == 5) ? 
dataScales[dimSize - 3] : 1.f; - float fy = dataScales[dimSize - 2]; - float fx = dataScales[dimSize - 1]; - size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; - size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; - - indexTable.resize(OD + OH + OW); - bool isDDownsample = (fz < 1) ? true : false; - bool isHDownsample = (fy < 1) ? true : false; - bool isWDownsample = (fx < 1) ? true : false; - for (int oz = 0; oz < OD; oz++) { - float iz = coordTransToInput(oz, fz, ID, OD); - indexTable[oz] = nearestRound(iz, isDDownsample, nearestMode); - indexTable[oz] = clipCoord(indexTable[oz], ID); - } - for (int oy = 0; oy < OH; oy++) { - float iy = coordTransToInput(oy, fy, IH, OH); - indexTable[OD + oy] = nearestRound(iy, isHDownsample, nearestMode); - indexTable[OD + oy] = clipCoord(indexTable[OD + oy], IH); - } - for (int ox = 0; ox < OW; ox++) { - float ix = coordTransToInput(ox, fx, IW, OW); - indexTable[OD + OH + ox] = nearestRound(ix, isWDownsample, nearestMode); - indexTable[OD + OH + ox] = clipCoord(indexTable[OD + OH + ox], IW); - } -} - -// scale is float(outShape) / float(inShape) -// strictly consistent with onnx calc manner(div scale, not multiply inverse), given this is done offline -// the slight precison diff can produce obvious wrong value due to "nearest round" behavior for NN mode -float Interpolate::InterpolateExecutor::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const { - if (scale == 1.0f || (inShape == outShape)) { - return outCoord; - } - switch (coordTransMode) { - case InterpolateCoordTransMode::half_pixel: { - return (outCoord + 0.5f) / scale - 0.5f; - break; - } - case InterpolateCoordTransMode::pytorch_half_pixel: { - if (outShape > 1) - return (outCoord + 0.5f) / scale - 0.5f; - else - return 0; - break; - } - case InterpolateCoordTransMode::asymmetric: { - return static_cast(outCoord) / scale; - break; - } - case InterpolateCoordTransMode::tf_half_pixel_for_nn: { - return (outCoord + 0.5f) / scale; - break; - } - case InterpolateCoordTransMode::align_corners: { - if (outShape > 1) - return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); - else - return 0; - break; - } - default: { - IE_THROW() << "errorPrefix" << " does not support specified coordinate transformation mode"; - break; - } - } -} - -int Interpolate::InterpolateExecutor::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const { - switch (nearestMode) { - case InterpolateNearestMode::round_prefer_floor: { - if (originCoord == (static_cast(originCoord) + 0.5f)) - return static_cast(std::floor(originCoord)); - else - return static_cast(std::round(originCoord)); - break; - } - case InterpolateNearestMode::round_prefer_ceil: { - return static_cast(std::round(originCoord)); - break; - } - case InterpolateNearestMode::floor: { - return static_cast(std::floor(originCoord)); - break; - } - case InterpolateNearestMode::ceil: { - return static_cast(std::ceil(originCoord)); - break; - } - case InterpolateNearestMode::simple: { - if (isDownsample) - return static_cast(std::ceil(originCoord)); - else - return static_cast(originCoord); - } - default: { - IE_THROW() << "errorPrefix" << " does not support specified nearest round mode"; - break; - } - } -} - -void Interpolate::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, int inShape, int outShape, - int& index0, int& index1, float& weight0, float& weight1) { - float inCoord = coordTransToInput(outCoord, scale, inShape, outShape); - inCoord = 
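// Illustrative sketch (not part of the patch): how coordTransToInput maps an
// output coordinate back onto the input grid and how linearOnnxCF splits it
// into two neighbours plus weights. Only the half_pixel and align_corners
// branches are shown; scale is float(outShape) / float(inShape), as stated in
// the comment above. Names are hypothetical.
#include <algorithm>
#include <cmath>

inline float to_input_half_pixel(int outCoord, float scale) {
    return (outCoord + 0.5f) / scale - 0.5f;
}

inline float to_input_align_corners(int outCoord, int inShape, int outShape) {
    return outShape > 1 ? outCoord * float(inShape - 1) / float(outShape - 1) : 0.f;
}

// Two nearest input indices for one axis and their linear interpolation weights.
inline void linear_neighbours_ref(float inCoord, int inShape,
                                  int& index0, int& index1, float& weight0, float& weight1) {
    inCoord = std::max(0.0f, std::min(inCoord, float(inShape - 1)));
    index0 = std::min(static_cast<int>(inCoord), inShape - 1);
    index1 = std::min(index0 + 1, inShape - 1);
    weight1 = std::fabs(inCoord - index0);   // weight applied to the index1 sample
    weight0 = std::fabs(inCoord - index1);   // weight applied to the index0 sample
    if (index0 == index1) {
        weight0 = 0.5f;
        weight1 = 0.5f;
    }
}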
std::max(0.0f, std::min(inCoord, static_cast(inShape - 1))); - index0 = std::min(static_cast(inCoord), inShape - 1); - index1 = std::min(index0 + 1, inShape - 1); - - weight1 = std::fabs(inCoord - index0); - weight0 = std::fabs(inCoord - index1); - if (index0 == index1) { - weight0 = 0.5f; - weight1 = 0.5f; - } -} - -void Interpolate::InterpolateExecutor::buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout) { - int dimSize = dataRank; - float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f; - float fy = (spatialDimSize > 1) ? dataScales[dimSize - 2] : 1.f; - float fx = dataScales[dimSize - 1]; - int ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; - int OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; - - std::vector indexPtr(MAX_INPUT_INTERPOLATE, 0); - std::vector weightPtr(MAX_INPUT_INTERPOLATE, 0); - if (layout == InterpolateLayoutType::planar) { - // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, - // EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7 - // weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 - int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 4 : 2); - int idxType = 2; - int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16); - indexTable.resize(idxType * scratchLen); - - indexPtr[0] = static_cast(&indexTable[0]); - indexPtr[1] = static_cast(&indexTable[OW * OH * OD]); - weightPtr[0] = reinterpret_cast(&indexTable[scratchLen]); - weightPtr[1] = reinterpret_cast(&indexTable[scratchLen + OW * OH * OD]); - if (spatialDimSize > 1) { - indexPtr[2] = static_cast(&indexTable[2 * OW * OH * OD]); - indexPtr[3] = static_cast(&indexTable[3 * OW * OH * OD]); - weightPtr[2] = reinterpret_cast(&indexTable[scratchLen + 2 * OW * OH * OD]); - weightPtr[3] = reinterpret_cast(&indexTable[scratchLen + 3 * OW * OH * OD]); - } - if (spatialDimSize > 2) { - indexPtr[4] = static_cast(&indexTable[4 * OW * OH * OD]); - indexPtr[5] = static_cast(&indexTable[5 * OW * OH * OD]); - indexPtr[6] = static_cast(&indexTable[6 * OW * OH * OD]); - indexPtr[7] = static_cast(&indexTable[7 * OW * OH * OD]); - weightPtr[4] = reinterpret_cast(&indexTable[scratchLen + 4 * OW * OH * OD]); - weightPtr[5] = reinterpret_cast(&indexTable[scratchLen + 5 * OW * OH * OD]); - } - int scale = mayiuse(cpu::x64::sse41) ? 
srcDataSize : 1; - - for (int oz = 0; oz < OD; oz++) { - int izF, izE; - float weightF, weightE; - linearOnnxCF(oz, fz, ID, OD, izF, izE, weightF, weightE); - int idxOz = oz * OH * OW; - for (int oy = 0; oy < OH; oy++) { - int iyT, iyB; - float weightT, weightB; - linearOnnxCF(oy, fy, IH, OH, iyT, iyB, weightT, weightB); - int idxOzOy = idxOz + oy * OW; - for (int ox = 0; ox < OW; ox++) { - int ixL, ixR; - float weightL, weightR; - linearOnnxCF(ox, fx, IW, OW, ixL, ixR, weightL, weightR); - - int idxOzOyOx = idxOzOy + ox; - indexPtr[0][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixL) * scale; - indexPtr[1][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixR) * scale; - weightPtr[0][idxOzOyOx] = weightL; - weightPtr[1][idxOzOyOx] = weightR; - if (spatialDimSize > 1) { - indexPtr[2][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixL) * scale; - indexPtr[3][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixR) * scale; - weightPtr[2][idxOzOyOx] = weightT; - weightPtr[3][idxOzOyOx] = weightB; - } - if (spatialDimSize > 2) { - indexPtr[4][idxOzOyOx] = (izE * IH * IW + iyT * IW + ixL) * scale; - indexPtr[5][idxOzOyOx] = (izE * IH * IW + iyT * IW + ixR) * scale; - indexPtr[6][idxOzOyOx] = (izE * IH * IW + iyB * IW + ixL) * scale; - indexPtr[7][idxOzOyOx] = (izE * IH * IW + iyB * IW + ixR) * scale; - weightPtr[4][idxOzOyOx] = weightF; - weightPtr[5][idxOzOyOx] = weightE; - } - } - } - } - } else { - // index: left:OW right:OW Top:OH Bottom:OH, Front:OD, End:OD - // weight:same as index - size_t scratchLen = rnd_up(OW + OW + OH + OH + OD + OD, 16); - int idxType = 2; - indexTable.resize(idxType * scratchLen); - indexPtr[0] = static_cast(&indexTable[0]); - indexPtr[1] = static_cast(&indexTable[OW]); - indexPtr[2] = static_cast(&indexTable[2 * OW]); - indexPtr[3] = static_cast(&indexTable[2 * OW + OH]); - indexPtr[4] = static_cast(&indexTable[2 * OW + 2 * OH]); - indexPtr[5] = static_cast(&indexTable[2 * OW + 2 * OH + OD]); - - weightPtr[0] = reinterpret_cast(&indexTable[scratchLen]); - weightPtr[1] = reinterpret_cast(&indexTable[scratchLen + OW]); - weightPtr[2] = reinterpret_cast(&indexTable[scratchLen + 2 * OW]); - weightPtr[3] = reinterpret_cast(&indexTable[scratchLen + 2 * OW + OH]); - weightPtr[4] = reinterpret_cast(&indexTable[scratchLen + 2 * OW + 2 * OH]); - weightPtr[5] = reinterpret_cast(&indexTable[scratchLen + 2 * OW + 2 * OH + OD]); - - for (int ox = 0; ox < OW; ox++) { - linearOnnxCF(ox, fx, IW, OW, indexPtr[0][ox], indexPtr[1][ox], weightPtr[0][ox], weightPtr[1][ox]); - } - for (int oy = 0; oy < OH; oy++) { - linearOnnxCF(oy, fy, IH, OH, indexPtr[2][oy], indexPtr[3][oy], weightPtr[2][oy], weightPtr[3][oy]); - } - for (int oz = 0; oz < OD; oz++) { - linearOnnxCF(oz, fz, ID, OD, indexPtr[4][oz], indexPtr[5][oz], weightPtr[4][oz], weightPtr[5][oz]); - } - } -} - -// table layout: -// wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw -// | | -// wh0.....wh_diameter ih0.....ih_diameter -void Interpolate::InterpolateExecutor::buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, - const std::vector& dataScales, int kernel_width, bool antialias) { - int dimSize = dataRank; - float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f; - float fy = dataScales[dimSize - 2]; - float fx = dataScales[dimSize - 1]; - size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; - size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; - - if (!(IW == OW && IH == OH && ID == OD)) { - float ax = antialias ? 
fx : 1.0f; - float ay = antialias ? fy : 1.0f; - float az = antialias ? fz : 1.0f; - - int rx = (fx > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / ax)); - int ry = (fy > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / ay)); - int rz = (fz > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / az)); - - int diaOD = 2 * rz + 1; - int diaOH = 2 * ry + 1; - int diaOW = 2 * rx + 1; - int sizeOD = OD * diaOD; - int sizeOH = OH * diaOH; - int sizeOW = OW * diaOW; - indexTable.resize((sizeOD + sizeOH + sizeOW) * 2); - float *weightTable = reinterpret_cast(&indexTable[0]); - float *weightOD = static_cast(&weightTable[0]); - float *weightOH = static_cast(&weightTable[sizeOD]); - float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); - - int *idxTable = static_cast(&indexTable[sizeOD + sizeOH + sizeOW]); - int *idxOD = static_cast(&idxTable[0]); - int *idxOH = static_cast(&idxTable[sizeOD]); - int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); - - for (int oz = 0; oz < OD; oz++) { - float iz = coordTransToInput(oz, fz, ID, OD); - int iz_r = static_cast(std::round(iz)); - for (int r = iz_r - rz, i = 0; r <= iz_r + rz; r++, i++) { - idxOD[oz * diaOD + i] = r; - if (r < 0 || r >= static_cast(ID)) { - weightOD[oz * diaOD + i] = 0.f; - } else { - float dz = iz - r; - weightOD[oz * diaOD + i] = az * triangleCoeff(az * dz); - } - } - } - for (int oy = 0; oy < OH; oy++) { - float iy = coordTransToInput(oy, fy, IH, OH); - int iy_r = static_cast(std::round(iy)); - for (int r = iy_r - ry, i = 0; r <= iy_r + ry; r++, i++) { - idxOH[oy * diaOH + i] = r; - if (r < 0 || r >= static_cast(IH)) { - weightOH[oy * diaOH + i] = 0.f; - } else { - float dy = iy - r; - weightOH[oy * diaOH + i] = ay * triangleCoeff(ay * dy); - } - } - } - for (int ox = 0; ox < OW; ox++) { - float ix = coordTransToInput(ox, fx, IW, OW); - int ix_r = static_cast(std::round(ix)); - for (int r = ix_r - rx, i = 0; r <= ix_r + rx; r++, i++) { - idxOW[ox * diaOW + i] = r; - if (r < 0 || r >= static_cast(IW)) { - weightOW[ox * diaOW + i] = 0.f; - } else { - float dx = ix - r; - weightOW[ox * diaOW + i] = ax * triangleCoeff(ax * dx); - } - } - } - } -} - -std::vector Interpolate::InterpolateExecutor::getCubicCoeffs(float mantissa, float a) { - float m = std::fabs(mantissa); - std::vector coeffs(4, 0.f); - - coeffs[0] = a * (m - 1.0) * (m - 1.0) * m; - coeffs[1] = ((a + 2.0) * m - (a + 3.0)) * m * m + 1.0; - coeffs[2] = (((-a - 2.0) * m + (2.0 * a + 3.0)) * m - a) * m; - coeffs[3] = -a * m * m * (m - 1.0); - return coeffs; -} - -// table layout: -// OW OW OW OW OW OH OH OH OH OH -// x_idx x_weight0 x_weight1 x_weight2 x_weight3 y_idx y_weight0 y_weight1 y_weight2 y_weight3 -void Interpolate::InterpolateExecutor::buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, - float cubicCoeff, InterpolateLayoutType layout) { - int dimSize = dataRank; - float fy = dataScales[dimSize - 2]; - float fx = dataScales[dimSize - 1]; - int IH = srcDimPad5d[3], IW = srcDimPad5d[4]; - int OH = dstDim5d[3], OW = dstDim5d[4]; - - // idxNum for index, CUBIC_GRID_LEN for weight - const int idxNum = 1; - size_t idxWeightSize = (CUBIC_GRID_LEN + idxNum) * OW + (CUBIC_GRID_LEN + idxNum) * OH; - if (layout != InterpolateLayoutType::planar) { - indexTable.resize(idxWeightSize); - } else { - size_t sequenceSize = 2 * OH * OW; - indexTable.resize(idxWeightSize + sequenceSize); - } - - int tblAdvance = 0; - int *xOrigin = static_cast(&indexTable[tblAdvance]); - tblAdvance += OW; - float *xFactor = 
reinterpret_cast(&indexTable[tblAdvance]); - for (int ox = 0; ox < OW; ox++) { - float ix = coordTransToInput(ox, fx, IW, OW); - int ix_r = static_cast(std::floor(ix)); - xOrigin[ox] = ix_r; - float m = ix - ix_r; - std::vector coffes = getCubicCoeffs(m, cubicCoeff); - xFactor[CUBIC_GRID_LEN * ox] = coffes[0]; - xFactor[CUBIC_GRID_LEN * ox + 1] = coffes[1]; - xFactor[CUBIC_GRID_LEN * ox + 2] = coffes[2]; - xFactor[CUBIC_GRID_LEN * ox + 3] = coffes[3]; - } - - tblAdvance += CUBIC_GRID_LEN * OW; - int *yOrigin = static_cast(&indexTable[tblAdvance]); - tblAdvance += OH; - float *yFactor = reinterpret_cast(&indexTable[tblAdvance]); - for (int oy = 0; oy < OH; oy++) { - float iy = coordTransToInput(oy, fy, IH, OH); - int iy_r = static_cast(std::floor(iy)); - yOrigin[oy] = iy_r; - float m = iy - iy_r; - std::vector coffes = getCubicCoeffs(m, cubicCoeff); - yFactor[CUBIC_GRID_LEN * oy] = coffes[0]; - yFactor[CUBIC_GRID_LEN * oy + 1] = coffes[1]; - yFactor[CUBIC_GRID_LEN * oy + 2] = coffes[2]; - yFactor[CUBIC_GRID_LEN * oy + 3] = coffes[3]; - } - - if (layout == InterpolateLayoutType::planar) { - tblAdvance += CUBIC_GRID_LEN * OH; - int *sequenceOH = static_cast(&indexTable[tblAdvance]); - tblAdvance += OH * OW; - int *sequenceOW = static_cast(&indexTable[tblAdvance]); - for (int h = 0; h < OH; ++h) { - int offset = h * OW; - for (int w = 0; w < OW; ++w) { - sequenceOH[offset + w] = h * sizeof(int); - sequenceOW[offset + w] = w * sizeof(int); - } - } - } -} - -void Interpolate::InterpolateRefExecutor::NNRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - int OD, int OH, int OW) { - int *index_d = static_cast(&indexTable[0]); - int *index_h = static_cast(&indexTable[OD]); - int *index_w = static_cast(&indexTable[OD + OH]); - - const float *in_ptr_f32 = reinterpret_cast(in_ptr_); - float *out_ptr_f32 = reinterpret_cast(out_ptr_); - - parallel_for3d(B, C, OD, [&](size_t b, size_t c, size_t od) { - const float *in_ptr = in_ptr_f32 + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]); - float *out_ptr = out_ptr_f32 + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od); - for (int oh = 0; oh < OH; oh++) { - const float *in_ptr_h = in_ptr + (IW * index_h[oh]); - float *out_ptr_h = out_ptr + (OW * oh); - for (int ow = 0; ow < OW; ow++) { - out_ptr_h[ow] = in_ptr_h[index_w[ow]]; - } - } - }); -} - -void Interpolate::InterpolateRefExecutor::linearOnnxRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - int OD, int OH, int OW) { - std::vector indexPtr(MAX_INPUT_INTERPOLATE, 0); - std::vector weightPtr(MAX_INPUT_INTERPOLATE, 0); - // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, - // EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7 - // weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 - - int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 
4 : 2); - int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16); - - indexPtr[0] = static_cast(&indexTable[0]); - indexPtr[1] = static_cast(&indexTable[OW * OH * OD]); - weightPtr[0] = reinterpret_cast(&indexTable[scratchLen]); - weightPtr[1] = reinterpret_cast(&indexTable[scratchLen + OW * OH * OD]); - if (spatialDimSize > 1) { - indexPtr[2] = static_cast(&indexTable[2 * OW * OH * OD]); - indexPtr[3] = static_cast(&indexTable[3 * OW * OH * OD]); - weightPtr[2] = reinterpret_cast(&indexTable[scratchLen + 2 * OW * OH * OD]); - weightPtr[3] = reinterpret_cast(&indexTable[scratchLen + 3 * OW * OH * OD]); - } - if (spatialDimSize > 2) { - indexPtr[4] = static_cast(&indexTable[4 * OW * OH * OD]); - indexPtr[5] = static_cast(&indexTable[5 * OW * OH * OD]); - indexPtr[6] = static_cast(&indexTable[6 * OW * OH * OD]); - indexPtr[7] = static_cast(&indexTable[7 * OW * OH * OD]); - weightPtr[4] = reinterpret_cast(&indexTable[scratchLen + 4 * OW * OH * OD]); - weightPtr[5] = reinterpret_cast(&indexTable[scratchLen + 5 * OW * OH * OD]); - } - - const float *in_ptr_f32 = reinterpret_cast(in_ptr_); - float *out_ptr_f32 = reinterpret_cast(out_ptr_); - - parallel_for2d(B, C, [&](size_t b, size_t c) { - float *out_ptr_nc = out_ptr_f32 + (OD * OH * OW * C * b + OD * OH * OW * c); - const float *in_ptr_nc = in_ptr_f32 + (ID * IH * IW * C * b + ID * IH * IW * c); - // do not combined 1d/2d to 3d unified process to get rid of invalid computing. - switch (spatialDimSize) { - case 1: - for (int i = 0; i < OW; i++) { - float src0 = in_ptr_nc[indexPtr[0][i]]; - float src1 = in_ptr_nc[indexPtr[1][i]]; - - out_ptr_nc[i] = src0 * weightPtr[0][i] + - src1 * weightPtr[1][i]; - } - break; - case 2: - for (int i = 0; i < OH * OW; i++) { - float src00 = in_ptr_nc[indexPtr[0][i]]; - float src01 = in_ptr_nc[indexPtr[1][i]]; - float src10 = in_ptr_nc[indexPtr[2][i]]; - float src11 = in_ptr_nc[indexPtr[3][i]]; - - out_ptr_nc[i] = src00 * weightPtr[2][i] * weightPtr[0][i] + - src01 * weightPtr[2][i] * weightPtr[1][i] + - src10 * weightPtr[3][i] * weightPtr[0][i] + - src11 * weightPtr[3][i] * weightPtr[1][i]; - } - break; - case 3: - for (int i = 0; i < OD * OH * OW; i++) { - float src000 = in_ptr_nc[indexPtr[0][i]]; - float src001 = in_ptr_nc[indexPtr[1][i]]; - float src010 = in_ptr_nc[indexPtr[2][i]]; - float src011 = in_ptr_nc[indexPtr[3][i]]; - float src100 = in_ptr_nc[indexPtr[4][i]]; - float src101 = in_ptr_nc[indexPtr[5][i]]; - float src110 = in_ptr_nc[indexPtr[6][i]]; - float src111 = in_ptr_nc[indexPtr[7][i]]; - - // float dstValue = - // weightPtr[4][i] * weightPtr[2][i] * weightPtr[0][i] * src000 + - // weightPtr[4][i] * weightPtr[2][i] * weightPtr[1][i] * src001 + - // weightPtr[4][i] * weightPtr[3][i] * weightPtr[0][i] * src010 + - // weightPtr[4][i] * weightPtr[3][i] * weightPtr[1][i] * src011 + - // weightPtr[5][i] * weightPtr[2][i] * weightPtr[0][i] * src100 + - // weightPtr[5][i] * weightPtr[2][i] * weightPtr[1][i] * src101 + - // weightPtr[5][i] * weightPtr[3][i] * weightPtr[0][i] * src110 + - // weightPtr[5][i] * weightPtr[3][i] * weightPtr[1][i] * src111; - - out_ptr_nc[i] = - weightPtr[4][i] * (weightPtr[2][i] * (weightPtr[0][i] * src000 + - weightPtr[1][i] * src001) + - weightPtr[3][i] * (weightPtr[0][i] * src010 + - weightPtr[1][i] * src011)) + - weightPtr[5][i] * (weightPtr[2][i] * (weightPtr[0][i] * src100 + - weightPtr[1][i] * src101) + - weightPtr[3][i] * (weightPtr[0][i] * src110 + - weightPtr[1][i] * src111)); - } - break; - default: - break; - } - }); -} - -void 
Interpolate::InterpolateRefExecutor::cubicRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW) { - const int idxNum = 1; - int *xOrigin = static_cast(&indexTable[0]); - float *xFactor = reinterpret_cast(&indexTable[OW]); - int *yOrigin = static_cast(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW]); - float *yFactor = reinterpret_cast(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); - - const float *in_ptr_f32 = reinterpret_cast(in_ptr_); - float *out_ptr_f32 = reinterpret_cast(out_ptr_); - - parallel_for4d(B, C, OH, OW, [&](size_t n, size_t c, size_t oy, size_t ox) { - const float *in_ptr_nc = in_ptr_f32 + (IW * IH * C * n + IW * IH * c); - float *out_ptr_nc = out_ptr_f32 + (OW * OH * C * n + OW * OH * c); - - int iy = yOrigin[oy]; - int ix = xOrigin[ox]; - - float retY = 0.f; - for (int y = iy - 1, i = 0; y <= iy + 2; y++, i++) { - int yInRange = std::max(0, std::min(y, IH - 1)); - const float *in_ptr_nch = in_ptr_nc + IW * yInRange; - float retX = 0.f; - for (int x = ix - 1, j = 0; x <= ix + 2; x++, j++) { - int xInRange = std::max(0, std::min(x, IW - 1)); - retX += xFactor[ox * CUBIC_GRID_LEN + j] * in_ptr_nch[xInRange]; - } - retY += yFactor[oy * CUBIC_GRID_LEN + i] * retX; - } - out_ptr_nc[oy * OW + ox] = retY; - }); -} - -float Interpolate::InterpolateRefExecutor::getValue(const uint8_t *base, size_t offset, InferenceEngine::Precision prec) { - const uint8_t *baseOffset = base + offset; - switch (prec) { - case Precision::U8: { - return static_cast(*baseOffset); - break; - } - case Precision::I8: { - const int8_t *valuePtr = reinterpret_cast(baseOffset); - return static_cast(*valuePtr); - break; - } - case Precision::BF16: { - const uint16_t *valuePtr = reinterpret_cast(baseOffset); - return bfloat16_t::from_bits(*valuePtr); - break; - } - case Precision::FP32: { - const float *valuePtr = reinterpret_cast(baseOffset); - return *valuePtr; - break; - } - default: { - IE_THROW() << "Interpolate layer does not support precision: " << prec; - break; - } - } -} - -void Interpolate::InterpolateRefExecutor::setValue(uint8_t *base, size_t offset, float value, InferenceEngine::Precision prec) { - uint8_t *baseOffset = base + offset; - switch (prec) { - case Precision::U8: { - uint8_t data = static_cast(value < 0 ? 
0 : value); - cpu_memcpy(baseOffset, &data, 1); - break; - } - case Precision::I8: { - int8_t data = static_cast(value); - cpu_memcpy(baseOffset, &data, 1); - break; - } - case Precision::BF16: { - uint16_t data = bfloat16_t(value).to_bits(); - cpu_memcpy(baseOffset, &data, 2); - break; - } - case Precision::FP32: { - cpu_memcpy(baseOffset, &value, sizeof(float)); - break; - } - default: { - IE_THROW() << "Interpolate layer does not support precision: " << prec; - break; - } - } -} - -void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias) { - if (IW == OW && IH == OH && ID == OD) { - size_t spatialDimSize = IW * IH * ID; - // TODO: enable when fusing into interp with linear mode will support - if (/*fusedWith.empty() &&*/ inputPrec == outputPrec) { - size_t size = B * C * spatialDimSize * srcDataSize; - cpu_memcpy(out_ptr_, in_ptr_, size); - } else { - parallel_for2d(B, C, [&](size_t b, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * dstDataSize; - for (size_t i = 0; i < spatialDimSize; i++) { - float dstValue = getValue(in_ptr_nc, i * srcDataSize, inputPrec); - setValue(out_ptr_nc, i * dstDataSize, dstValue, outputPrec); - } - }); - } - return; - } - - float ax = antialias ? fx : 1.0f; - float ay = antialias ? fy : 1.0f; - float az = antialias ? fz : 1.0f; - - int rx = (fx > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / ax)); - int ry = (fy > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / ay)); - int rz = (fz > 1.0f) ? 2 : static_cast(ceil(static_cast(kernel_width) / az)); - - int diaOD = 2 * rz + 1; - int diaOH = 2 * ry + 1; - int diaOW = 2 * rx + 1; - int sizeOD = OD * diaOD; - int sizeOH = OH * diaOH; - int sizeOW = OW * diaOW; - - float *weightTable = reinterpret_cast(&indexTable[0]); - float *weightOD = static_cast(&weightTable[0]); - float *weightOH = static_cast(&weightTable[sizeOD]); - float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); - - int *idxTable = static_cast(&indexTable[sizeOD + sizeOH + sizeOW]); - int *idxOD = static_cast(&idxTable[0]); - int *idxOH = static_cast(&idxTable[sizeOD]); - int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); - - parallel_for2d(B, C, [&](size_t b, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c) * dstDataSize; - for (size_t oz = 0; oz < OD; oz++) { - uint8_t *out_ptr_ncd = out_ptr_nc + (OW * OH * oz) * dstDataSize; - for (size_t oy = 0; oy < OH; oy++) { - uint8_t *out_ptr_ncdh = out_ptr_ncd + (OW * oy) * dstDataSize; - for (size_t ox = 0; ox < OW; ox++) { - float sum = 0.f; - float wsum = 0.f; - - // this comment explains the original algo. 
- // for (int z = iz_r - rz; z <= iz_r + rz; z++) { - // for (int y = iy_r - ry; y <= iy_r + ry; y++) { - // for (int x = ix_r - rx; x <= ix_r + rx; x++) { - // bool is_continue = z < 0 || - // y < 0 || - // x < 0 || - // z >= static_cast(ID) || - // y >= static_cast(IH) || - // x >= static_cast(IW); - // if (is_continue) - // continue; - - // float dx = ix - x; - // float dy = iy - y; - // float dz = iz - z; - - // float w = ax * triangleCoeff(ax * dx) * - // ay * triangleCoeff(ay * dy) * - // az * triangleCoeff(az * dz); - - // sum += w * getValue(in_ptr_nc, (z * IH * IW + y * IW + x) * srcDataSize, inputPrec); - // wsum += w; - // } - // } - //} - - for (int iz = 0; iz < diaOD; iz++) { - if (weightOD[oz * diaOD + iz] == 0.f) - continue; - for (int iy = 0; iy < diaOH; iy++) { - if (weightOH[oy * diaOH + iy] == 0.f) { - continue; - } - for (int ix = 0; ix < diaOW; ix++) { - if (weightOW[ox * diaOW + ix] == 0.f) { - continue; - } - float w = weightOD[oz * diaOD + iz] * weightOH[oy * diaOH + iy] * weightOW[ox * diaOW + ix]; - float value = getValue(in_ptr_nc, - (idxOD[oz * diaOD + iz] * IH * IW + idxOH[oy * diaOH + iy] * IW + idxOW[ox * diaOW + ix]) * srcDataSize, inputPrec); - - sum += w * value; - wsum += w; - } - } - } - - if (!wsum) { - setValue(out_ptr_ncdh, ox * dstDataSize, 0.f, outputPrec); - } else { - float dst_value = sum / wsum; - setValue(out_ptr_ncdh, ox * dstDataSize, dst_value, outputPrec); - } - } - } - } - }); -} - -Interpolate::InterpolateExecutor::InterpolateExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales) : - mode(interpAttrs.mode), coordTransMode(interpAttrs.coordTransMode), configured_for_layout(interpAttrs.layout), - inputPrec(interpAttrs.inPrc), outputPrec(interpAttrs.outPrc) { - srcDimPad5d = to5Dim(getPaddedInputShape(srcDims, interpAttrs.padBegin, interpAttrs.padEnd)); - dstDim5d = to5Dim(dstDims); - srcDataSize = interpAttrs.inPrc.size(); - dstDataSize = interpAttrs.outPrc.size(); - dataRank = srcDims.size(); - spatialDimSize = getSpatialDimsNum(dataRank); - - switch (mode) { - case InterpolateMode::nearest: { - buildTblNN(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout, interpAttrs.nearestMode); - break; - } - case InterpolateMode::linear_onnx: { - buildTblLinearOnnx(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout); - break; - } - case InterpolateMode::linear: { - static constexpr int LINEAR_KERNEL = 2; - buildTblLinear(srcDimPad5d, dstDim5d, dataScales, LINEAR_KERNEL, interpAttrs.antialias); - break; - } - case InterpolateMode::cubic: { - buildTblCubic(srcDimPad5d, dstDim5d, dataScales, interpAttrs.cubeCoeff, interpAttrs.layout); - break; - } - default: { - IE_THROW() << "Interpolate executor does not support interpolate mode: " << mode; - break; - } - } -} - -Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales, - const dnnl::primitive_attr &attr) : - InterpolateExecutor(interpAttrs, srcDims, dstDims, dataScales) { - auto jcp = jit_interpolate_config_params(); - jcp.mode = mode; - jcp.src_prc = interpAttrs.inPrc; - jcp.dst_prc = interpAttrs.outPrc; - jcp.src_data_size = jcp.src_prc.size(); - jcp.dst_data_size = jcp.dst_prc.size(); - jcp.indices_size = sizeof(int); - jcp.C = dstDim5d[1]; - jcp.OW = dstDim5d[4]; - jcp.OH = dstDim5d[3]; - jcp.OD = dstDim5d[2]; - jcp.IW = srcDimPad5d[4]; - jcp.IH = srcDimPad5d[3]; - jcp.ID = 
srcDimPad5d[2]; - jcp.spatial_dim_size = getSpatialDimsNum(srcDims.size()); - jcp.layout = interpAttrs.layout; - if (jcp.layout != InterpolateLayoutType::planar) { - if (mayiuse(cpu::x64::avx512_core)) { - interpolateKernel.reset(new jit_uni_interpolate_kernel_f32(jcp, *attr.get())); - } else if (mayiuse(cpu::x64::avx2)) { - interpolateKernel.reset(new jit_uni_interpolate_kernel_f32(jcp, *attr.get())); - } else if (mayiuse(cpu::x64::sse41)) { - interpolateKernel.reset(new jit_uni_interpolate_kernel_f32(jcp, *attr.get())); - } - } else if (mayiuse(cpu::x64::avx2) && interpAttrs.inPrc == InferenceEngine::Precision::FP32) { - // gather ISA(for planar JIT kernel) for avx2 and fp32 - interpolateKernel.reset(new jit_uni_interpolate_kernel_f32(jcp, *attr.get())); - } else { - IE_THROW() << "Can't create InterpolateJitExecutor"; - } - if (interpolateKernel) { - interpolateKernel->create_ker(); - } else { - IE_THROW() << "Can't compile InterpolateJitExecutor"; - } -} - -void Interpolate::InterpolateJitExecutor::exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) { - size_t N = srcDimPad5d[0], C = srcDimPad5d[1], ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; - size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; - - if (!interpolateKernel) { - IE_THROW() << "Can't execute, kernel for Interpolate node is not compiled"; - } - switch (mode) { - case InterpolateMode::nearest: { - if (configured_for_layout == InterpolateLayoutType::planar) { - NNPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } else { - NNCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } - break; - } - case InterpolateMode::linear_onnx: { - if (configured_for_layout == InterpolateLayoutType::planar) { - linearOnnxPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } else { - linearOnnxCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } - break; - } - case InterpolateMode::cubic: { - if (configured_for_layout == InterpolateLayoutType::planar) { - cubicPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); - } else { - cubicCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); - } - break; - } - default: { - IE_THROW() << "InterpolateJitExecutor has unsupported interpolate mode: " << mode; - } - } -} - -void Interpolate::InterpolateRefExecutor::exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) { - size_t N = srcDimPad5d[0], C = srcDimPad5d[1], ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; - size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; - - switch (mode) { - case InterpolateMode::nearest: { - NNRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); - break; - } - case InterpolateMode::linear_onnx: { - linearOnnxRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); - break; - } - case InterpolateMode::cubic: { - cubicRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); - break; - } - case InterpolateMode::linear: { - float fz = (dataRank == 5) ? 
dataScales[dataRank - 3] : 1.f; - float fy = dataScales[dataRank - 2]; - float fx = dataScales[dataRank - 1]; - - bool isDownsample = (fx < 1.f) || (fy < 1.f) || (fz < 1.f); - int kernel_width = 2; - linearInterpolation(in_ptr_, out_ptr_, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, isDownsample && antialias); - break; - } - default: { - IE_THROW() << "Interpolate layer has unsupported interpolate mode: " << mode; - } - } -} - -size_t Interpolate::getSpatialDimsNum(const Dim rank) { - switch (rank) { - case 1: - case 3: - return 1; - case 2: - case 4: - return 2; - case 5: - return 3; - default: - IE_THROW() << "Can't define number spatial"; - } + execPtr->exec({srcMemPtr}, {dstMemPtr}, postOpsDataPtrs.data()); } bool Interpolate::canFuse(const NodePtr& node) const { @@ -3266,4 +672,4 @@ bool Interpolate::created() const { } // namespace node } // namespace intel_cpu -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.h b/src/plugins/intel_cpu/src/nodes/interpolate.h index 95f07df5c71edd..03ea5427615bee 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.h +++ b/src/plugins/intel_cpu/src/nodes/interpolate.h @@ -4,8 +4,8 @@ #pragma once -#include -#include +#include "executors/interpolate.hpp" +#include "executors/interpolate_list.hpp" #include #include #include @@ -18,81 +18,6 @@ namespace ov { namespace intel_cpu { namespace node { -enum InterpolateLayoutType { - planar, - block, - by_channel -}; - -enum InterpolateMode { - nearest, - linear, - linear_onnx, - cubic -}; - -enum InterpolateCoordTransMode { - half_pixel, - pytorch_half_pixel, - asymmetric, - tf_half_pixel_for_nn, - align_corners -}; - -enum class InterpolateNearestMode { - round_prefer_floor, - round_prefer_ceil, - floor, - ceil, - simple -}; - -enum class InterpolateShapeCalcMode { - sizes, - scales -}; - -struct jit_interpolate_config_params { - InterpolateLayoutType layout; - InterpolateMode mode; - InferenceEngine::Precision src_prc; - InferenceEngine::Precision dst_prc; - int src_data_size; - int dst_data_size; - int indices_size; - int spatial_dim_size; - int C, ID, IH, IW, OD, OH, OW; -}; - -struct jit_interpolate_call_args { - const void *src_ptr[MAX_INPUT_INTERPOLATE]; - const void *weight_ptr[MAX_INPUT_INTERPOLATE]; - const int *index; - void *dst; - size_t work_amount; - size_t oc_off; - //ptr to array of post op inputs pointers (flat list) - const void* post_op_data; -}; - -struct jit_uni_interpolate_kernel { - void (*ker_)(const jit_interpolate_call_args *); - - void operator()(const jit_interpolate_call_args *args) { - assert(ker_); - ker_(args); - } - - explicit jit_uni_interpolate_kernel(jit_interpolate_config_params jcp, const dnnl_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {} - virtual ~jit_uni_interpolate_kernel() {} - - virtual void create_ker() = 0; - - jit_interpolate_config_params jcp_; - const dnnl_primitive_attr &attr_; -}; - - class Interpolate : public Node { public: static constexpr size_t DATA_ID = 0; @@ -121,128 +46,13 @@ class Interpolate : public Node { bool needPrepareParams() const override; void prepareParams() override; - struct InterpolateAttrs { - InterpolateMode mode = InterpolateMode::nearest; - InterpolateCoordTransMode coordTransMode = InterpolateCoordTransMode::half_pixel; - InterpolateNearestMode nearestMode = InterpolateNearestMode::round_prefer_floor; - bool antialias = false; - float cubeCoeff = -0.75; - std::vector padBegin; - std::vector padEnd; - 
InferenceEngine::Precision inPrc; - InferenceEngine::Precision outPrc; - InterpolateLayoutType layout; - }; - private: InterpolateAttrs interpAttrs; - - class InterpolateExecutor { - public: - InterpolateExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales); - - virtual void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) = 0; - virtual ~InterpolateExecutor() = default; - VectorDims getSrcDimPad5d() const { return srcDimPad5d; } - - private: - void buildTblNN(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, - InterpolateLayoutType layout, InterpolateNearestMode nearestMode); - void buildTblLinearOnnx(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, - InterpolateLayoutType layout); - void buildTblLinear(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, int kernel_width, - bool antialias); - void buildTblCubic(const SizeVector& srcDimPad5d, const SizeVector& dstDim5d, const std::vector& dataScales, float cubicCoeff, - InterpolateLayoutType layout); - - float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const; - int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const; - void linearOnnxCF(int outCoord, float scale, int inShape, int outShape, int& index0, int& index1, float& weight0, float& weight1); - std::vector getCubicCoeffs(float mantissa, float a); - - protected: - InterpolateMode mode; - InterpolateCoordTransMode coordTransMode; - InterpolateLayoutType configured_for_layout; - VectorDims srcDimPad5d, dstDim5d; - InferenceEngine::Precision inputPrec, outputPrec; - size_t srcDataSize, dstDataSize; - int spatialDimSize; - size_t dataRank; - std::vector indexTable; - }; std::shared_ptr execPtr = nullptr; - class InterpolateJitExecutor : public InterpolateExecutor { - public: - InterpolateJitExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales, - const dnnl::primitive_attr &attr); - - void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override; - - private: - // nearest neighbor - void NNPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - void NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - - // onnx linear - void linearOnnxPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - void linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - - // cubic - void cubicPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW); - void cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW); - - private: - std::shared_ptr interpolateKernel = nullptr; - }; - - class InterpolateRefExecutor : public InterpolateExecutor { - public: - InterpolateRefExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &_dataScales) 
: - InterpolateExecutor(interpAttrs, srcDims, dstDims, _dataScales), - antialias(interpAttrs.antialias), dataScales(_dataScales) {} - - void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override; - - private: - void NNRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - void linearOnnxRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - - void cubicRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW); - void linearInterpolation(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias); - - static float getValue(const uint8_t *base, size_t offset, InferenceEngine::Precision prec); - static void setValue(uint8_t *base, size_t offset, float value, InferenceEngine::Precision prec); - - private: - bool antialias; - std::vector dataScales; - }; - void setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims); - static SizeVector getPaddedInputShape(const VectorDims &srcDims, const std::vector &padBegin, const std::vector &padEnd); std::vector getScales(const VectorDims &srcDimPad, const VectorDims &dstDim); - static size_t getSpatialDimsNum(const Dim rank); - - bool hasPad = false; InterpolateShapeCalcMode shapeCalcMode; bool isAxesSpecified = false; @@ -263,4 +73,4 @@ class Interpolate : public Node { } // namespace node } // namespace intel_cpu -} // namespace ov +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp index 2976e31b6e8725..a5cfe2e9740fda 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp @@ -33,6 +33,7 @@ namespace ov { namespace intel_cpu { namespace node { +#if defined(OPENVINO_ARCH_X86_64) template struct jit_uni_nms_kernel_f32 : public jit_uni_nms_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_nms_kernel_f32) @@ -551,6 +552,7 @@ struct jit_uni_nms_kernel_f32 : public jit_uni_nms_kernel, public jit_generator dw(0x0001); } }; +#endif bool NonMaxSuppression::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { @@ -701,6 +703,7 @@ bool NonMaxSuppression::isExecutable() const { } void NonMaxSuppression::createJitKernel() { +#if defined(OPENVINO_ARCH_X86_64) auto jcp = jit_nms_config_params(); jcp.box_encode_type = boxEncodingType; jcp.is_soft_suppressed_by_iou = isSoftSuppressedByIOU; @@ -715,6 +718,7 @@ void NonMaxSuppression::createJitKernel() { if (nms_kernel) nms_kernel->create_ker(); +#endif } void NonMaxSuppression::executeDynamicImpl(dnnl::stream strm) { diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h index a0666f66b63331..2599fa3843ff06 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h @@ -148,7 +148,7 @@ class NonMaxSuppression : public Node { void checkOutput(const Shape& shape, const std::vector& precList, const std::string& name, const size_t port); void createJitKernel(); - std::shared_ptr nms_kernel; + std::shared_ptr nms_kernel = nullptr; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/roi_pooling.cpp 
b/src/plugins/intel_cpu/src/nodes/roi_pooling.cpp
index 3b58e699fd1b6c..b082635f2a3b4d 100644
--- a/src/plugins/intel_cpu/src/nodes/roi_pooling.cpp
+++ b/src/plugins/intel_cpu/src/nodes/roi_pooling.cpp
@@ -36,6 +36,7 @@ namespace ov {
 namespace intel_cpu {
 namespace node {
 
+#if defined(OPENVINO_ARCH_X86_64)
 template <cpu_isa_t isa>
 struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, public jit_generator {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_roi_pooling_kernel_f32);
@@ -308,6 +309,7 @@ struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, publi
         L(exit_label);
     }
 };
+#endif
 
 namespace {
 struct RoiPoolingKey {
@@ -532,6 +534,7 @@ template <typename T>
 class ROIPooling::ROIPoolingJitExecutor : public ROIPooling::ROIPoolingExecutor {
 public:
     ROIPoolingJitExecutor(const jit_roi_pooling_params &jpp) {
+#if defined(OPENVINO_ARCH_X86_64)
         if (mayiuse(cpu::x64::avx512_core)) {
             roi_pooling_kernel.reset(new jit_uni_roi_pooling_kernel_f32<cpu::x64::avx512_core>(jpp));
         } else if (mayiuse(cpu::x64::avx2)) {
@@ -544,6 +547,7 @@ class ROIPooling::ROIPoolingJitExecutor : public ROIPooling::ROIPoolingExecutor
 
         if (roi_pooling_kernel)
             roi_pooling_kernel->create_ker();
+#endif
     }
 
     void exec(
@@ -891,10 +895,12 @@ std::pair<float, float> ROIPooling::ROIPoolingExecutor::getXYForBilinearMode(
 template <typename T>
 std::shared_ptr<ROIPooling::ROIPoolingExecutor> ROIPooling::ROIPoolingExecutor::makeExecutor(
     const jit_roi_pooling_params& jpp) {
+#if defined(OPENVINO_ARCH_X86_64)
     if (mayiuse(cpu::x64::sse41))
         return std::make_shared<ROIPoolingJitExecutor<T>>(jpp);
-    else
-        return std::make_shared<ROIPoolingRefExecutor<T>>(jpp);
+#endif
+
+    return std::make_shared<ROIPoolingRefExecutor<T>>(jpp);
 }
 
 bool ROIPooling::created() const {
diff --git a/src/plugins/intel_cpu/src/nodes/topk.cpp b/src/plugins/intel_cpu/src/nodes/topk.cpp
index 77e45cf162e770..d94502c3b6400c 100644
--- a/src/plugins/intel_cpu/src/nodes/topk.cpp
+++ b/src/plugins/intel_cpu/src/nodes/topk.cpp
@@ -32,6 +32,7 @@ namespace ov {
 namespace intel_cpu {
 namespace node {
 
+#if defined(OPENVINO_ARCH_X86_64)
 #define GET_OFF(field) offsetof(jit_topk_call_args, field)
 
 #define vmm_mask    Vmm(0)
@@ -1787,6 +1788,7 @@ struct jit_uni_topk_kernel_f32 : public jit_uni_topk_kernel, public jit_generato
         }
     }
 };
+#endif
 
 bool TopK::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept {
     try {
@@ -1898,7 +1900,11 @@ void TopK::initSupportedPrimitiveDescriptors() {
         impl_type = impl_desc_type::ref;
     }
 
+#if defined(OPENVINO_ARCH_X86_64)
     jit_mode = mayiuse(cpu::x64::sse41);
+#else
+    jit_mode = false;
+#endif
 
     static const Precision supportedPrecision[] = {
         Precision::FP32,
@@ -2110,7 +2116,7 @@ void TopK::createPrimitive() {
             calc_bitonic_idx(top_k, jcp.bitonic_k_idx_cnt, false);
         }
     }
-
+#if defined(OPENVINO_ARCH_X86_64)
     if (mayiuse(cpu::x64::avx512_core)) {
         topk_kernel.reset(new jit_uni_topk_kernel_f32<cpu::x64::avx512_core>(jcp));
     } else if (mayiuse(cpu::x64::avx2)) {
@@ -2121,6 +2127,7 @@
 
         if (topk_kernel)
             topk_kernel->create_ker();
+#endif
     }
 }
diff --git a/src/plugins/intel_cpu/src/nodes/topk.h b/src/plugins/intel_cpu/src/nodes/topk.h
index 29060026e70fd2..cb2d202012a627 100644
--- a/src/plugins/intel_cpu/src/nodes/topk.h
+++ b/src/plugins/intel_cpu/src/nodes/topk.h
@@ -144,7 +144,7 @@ class TopK : public Node {
     std::vector vec_process_ptr;
     std::vector vec_process_idx_ptr;
 
-    std::shared_ptr<jit_uni_topk_kernel> topk_kernel;
+    std::shared_ptr<jit_uni_topk_kernel> topk_kernel = nullptr;
 
     std::string errorPrefix;
 };
diff --git a/src/plugins/intel_cpu/src/nodes_factory.cpp b/src/plugins/intel_cpu/src/nodes_factory.cpp
index 44d99374531fb6..3944751bf31515 100644
--- a/src/plugins/intel_cpu/src/nodes_factory.cpp
+++ b/src/plugins/intel_cpu/src/nodes_factory.cpp
@@ -172,9 +172,14 @@ Node::NodesFactory::NodesFactory()
     INTEL_CPU_NODE(Eye, Type::Eye);
     INTEL_CPU_NODE(Unique, Type::Unique);
     INTEL_CPU_NODE(Ngram, Type::Ngram);
+    INTEL_CPU_NODE(Interpolate, Type::Interpolate);
     INTEL_CPU_NODE(Reduce, Type::Reduce);
-#if defined(OPENVINO_ARCH_X86_64)
     INTEL_CPU_NODE(Gather, Type::Gather);
+    INTEL_CPU_NODE(NonMaxSuppression, Type::NonMaxSuppression);
+    INTEL_CPU_NODE(ROIPooling, Type::ROIPooling);
+    INTEL_CPU_NODE(TopK, Type::TopK);
+    INTEL_CPU_NODE(Proposal, Type::Proposal);
+#if defined(OPENVINO_ARCH_X86_64)
     INTEL_CPU_NODE(GridSample, Type::GridSample);
     INTEL_CPU_NODE(DeformableConvolution, Type::DeformableConvolution);
     INTEL_CPU_NODE(DepthToSpace, Type::DepthToSpace);
@@ -183,17 +188,12 @@ Node::NodesFactory::NodesFactory()
     INTEL_CPU_NODE(ColorConvert, Type::ColorConvert);
     INTEL_CPU_NODE(NormalizeL2, Type::NormalizeL2);
     INTEL_CPU_NODE(BinaryConvolution, Type::BinaryConvolution);
-    INTEL_CPU_NODE(NonMaxSuppression, Type::NonMaxSuppression);
-    INTEL_CPU_NODE(Interpolate, Type::Interpolate);
-    INTEL_CPU_NODE(ROIPooling, Type::ROIPooling);
     INTEL_CPU_NODE(ROIAlign, Type::ROIAlign);
     INTEL_CPU_NODE(RegionYolo, Type::RegionYolo);
-    INTEL_CPU_NODE(TopK, Type::TopK);
     INTEL_CPU_NODE(Interaction, Type::Interaction);
     INTEL_CPU_NODE(MHA, Type::MHA);
     INTEL_CPU_NODE(ExtractImagePatches, Type::ExtractImagePatches);
     INTEL_CPU_NODE(FakeQuantize, Type::FakeQuantize);
-    INTEL_CPU_NODE(Proposal, Type::Proposal);
     INTEL_CPU_NODE(ShuffleChannels, Type::ShuffleChannels);
     INTEL_CPU_NODE(SpaceToDepth, Type::SpaceToDepth);
     INTEL_CPU_NODE(Snippet, Type::Subgraph);
diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp
index 6eef5f7eb8f78c..8f33b058b9a64c 100644
--- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp
+++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.cpp
@@ -36,6 +36,7 @@ impl_desc_type parse_impl_name(std::string impl_desc_name) {
     SEARCH_WORD(any);
     SEARCH_WORD(_1x1);
     SEARCH_WORD(_dw);
+    SEARCH_WORD_2(dw, _dw);
     SEARCH_WORD(reorder);
     SEARCH_WORD(sparse);
     SEARCH_WORD(acl);
@@ -113,6 +114,7 @@ const char* impl_type_to_string(impl_desc_type type) {
     CASE(brgemm_avx512_amx);
     CASE(brgemm_sparse_avx512_amx);
     CASE(acl);
+    CASE(dw_acl);
     CASE(gemm_acl);
     CASE(winograd_acl);
diff --git a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h
index 3257cad37cc904..d91b6c6e139b0e 100644
--- a/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h
+++ b/src/plugins/intel_cpu/src/onednn/iml_type_mapper.h
@@ -94,6 +94,7 @@ enum impl_desc_type {
     brgemm_avx512_amx = brgemm | avx512 | amx,
     brgemm_sparse_avx512_amx = brgemm | sparse | avx512 | amx,
 
+    dw_acl = _dw | acl,
     gemm_acl = gemm | acl,
     winograd_acl = winograd | acl,
 };
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.cpp
new file mode 100644
index 00000000000000..ad7d17682feae5
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.cpp
@@ -0,0 +1,33 @@
+// Copyright (C) 2020-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+
+#include "mish_decomposition.hpp"
+
+#include
+#include
+
+ov::intel_cpu::MishDecomposition::MishDecomposition() {
+    auto mish = ngraph::pattern::wrap_type<opset4::Mish>();
+
+    ngraph::matcher_pass_callback callback = [](ngraph::pattern::Matcher& m) {
+        auto mish = std::dynamic_pointer_cast<opset4::Mish>(m.get_match_root());
+        if (!mish) {
+            return false;
+        }
+
+        auto exp = std::make_shared<opset4::Exp>(mish->input_value(0));
+        auto add = std::make_shared<opset4::Add>(exp, opset4::Constant::create(mish->get_output_element_type(0), ngraph::Shape{}, {1.0f}));
+        auto log = std::make_shared<opset4::Log>(add);
+        auto tanh = std::make_shared<opset4::Tanh>(log);
+        auto mul = std::make_shared<opset4::Multiply>(mish->input_value(0), tanh);
+
+        mul->set_friendly_name(mish->get_friendly_name());
+        ngraph::copy_runtime_info(mish, {exp, add, log, tanh, mul});
+        ngraph::replace_node(mish, mul);
+        return true;
+    };
+
+    auto m = std::make_shared<ngraph::pattern::Matcher>(mish, "MishDecomposition");
+    register_matcher(m, callback);
+}
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.hpp
new file mode 100644
index 00000000000000..b56870679edaaf
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/arm/pass/mish_decomposition.hpp
@@ -0,0 +1,19 @@
+// Copyright (C) 2020-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include
+#include
+
+namespace ov {
+namespace intel_cpu {
+
+class MishDecomposition: public ngraph::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("MishDecomposition", "0");
+    MishDecomposition();
+};
+
+}   // namespace intel_cpu
+}   // namespace ov
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp
index e4715d9ce6289d..6cd95233577aa6 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp
@@ -36,7 +36,7 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr &nGraphF
     CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks);
     CPU_REGISTER_PASS_COMMON(manager, ConvertTileToSeqTiles);
     CPU_REGISTER_PASS_COMMON(manager, FullyConnectedBiasFusion);
-    CPU_REGISTER_PASS_COMMON(manager, ConvertToPowerStatic);
+    CPU_REGISTER_PASS_X64(manager, ConvertToPowerStatic);
     CPU_REGISTER_PASS_COMMON(manager, ConvertToLeakyRelu);
     CPU_REGISTER_PASS_COMMON(manager, ConvertToSwishCPU);
     CPU_REGISTER_PASS_COMMON(manager, OptimizeSequenceTransposes);
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 4073bba3bcbb8a..f654f1462c1c1e 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -94,6 +94,7 @@
 #include "transformations/cpu_opset/x64/pass/mha_fusion.hpp"
 #include "transformations/cpu_opset/x64/pass/convert_to_interaction.hpp"
 #include "transformations/cpu_opset/arm/pass/convert_reduce_multi_axis.hpp"
+#include "transformations/cpu_opset/arm/pass/mish_decomposition.hpp"
 #include "transformations/cpu_opset/common/pass/convert_fq_rnn_to_quantized_rnn.hpp"
 #include "transformations/cpu_opset/common/pass/move_eltwise_up_data_movement.hpp"
 #include "transformations/cpu_opset/common/pass/swap_convert_transpose.hpp"
@@ -251,6 +252,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis
     CPU_REGISTER_PASS_X64(manager, ConvertToInteraction);
     CPU_REGISTER_PASS_X64(manager, ConvertInteractionInt8);
     CPU_REGISTER_PASS_ARM(manager, ConvertReduceMultiAxis);
+    CPU_REGISTER_PASS_ARM(manager, MishDecomposition);
// SpaceToDepth/ DepthToSpace node implementation supports only equal input/output tensors with rank <= 5 CPU_SET_CALLBACK_COMMON(manager, @@ -386,9 +388,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertGELU); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertShuffleChannels3); CPU_DISABLE_PASS_COMMON(manager, ov::pass::Gelu7Downgrade); - CPU_DISABLE_PASS_COMMON(manager, ov::pass::HSwishDecomposition); CPU_DISABLE_PASS_COMMON(manager, ov::pass::SoftPlusDecomposition); - CPU_DISABLE_PASS_COMMON(manager, ov::pass::HSigmoidDecomposition); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertMod); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertShuffleChannels3); CPU_DISABLE_PASS_COMMON(manager, ov::pass::WeightsDequantizeToFakeQuantize); @@ -407,6 +407,8 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_DISABLE_PASS_COMMON(manager, ov::pass::UniqueDecomposition); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertTopK3); CPU_DISABLE_PASS_COMMON(manager, ov::pass::ConvertTopK11ToTopK3); + CPU_DISABLE_PASS_COMMON(manager, ov::pass::HSwishDecomposition); + CPU_DISABLE_PASS_X64(manager, ov::pass::HSigmoidDecomposition); CPU_DISABLE_PASS_X64(manager, ov::pass::ReduceL1Decomposition); CPU_DISABLE_PASS_X64(manager, ov::pass::ReduceL2Decomposition); diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/interpolate.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/interpolate.cpp index 21a96de9592c94..b728e310ebdf71 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/interpolate.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/interpolate.cpp @@ -328,8 +328,10 @@ const std::vector cubeCoefs = { const std::vector interpolateFusingParamsSet{ emptyFusingSpec, +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) fusingSwish, fusingFakeQuantizePerTensorRelu, +#endif }; std::vector> filterAdditionalConfig() { @@ -458,6 +460,7 @@ const std::vector shapeParams4D_fixed_C = { } }; +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) INSTANTIATE_TEST_SUITE_P(smoke_InterpolateNN_Layout_PerChannelFuse_Test, InterpolateLayerCPUTest, ::testing::Combine( interpolateCasesNN_Smoke, @@ -477,6 +480,7 @@ INSTANTIATE_TEST_SUITE_P(InterpolateNN_Layout_PerChannelFuse_Test, InterpolateLa ::testing::Values(fusingFakeQuantizePerChannelRelu), ::testing::ValuesIn(filterAdditionalConfig())), InterpolateLayerCPUTest::getTestCaseName); +#endif const auto interpolateCasesLinearOnnx_Smoke = ::testing::Combine( ::testing::Values(ngraph::op::v4::Interpolate::InterpolateMode::LINEAR_ONNX),