diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 6799cde1fa5a55..0206d06c86c4d4 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -474,37 +474,35 @@ void Deconvolution::getSupportedDescriptors() { config.outConfs.resize(getOriginalOutputsNumber()); auto& creatorsMap = BlockedDescCreator::getCommonCreators(); - for (size_t i = 0; i < getParentEdges().size(); ++i) { - auto checkDesc = [&](LayoutType format) -> bool { - NodeConfig config; - config.inConfs.resize(getParentEdges().size()); - config.outConfs.resize(getOriginalOutputsNumber()); - - for (size_t i = 0; i < getParentEdges().size(); ++i) { - config.inConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); - } + auto checkDesc = [&](LayoutType format) -> bool { + NodeConfig config; + config.inConfs.resize(getParentEdges().size()); + config.outConfs.resize(getOriginalOutputsNumber()); - for (size_t i = 0; i < getChildEdges().size(); ++i) { - config.outConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); - } + for (size_t i = 0; i < getParentEdges().size(); ++i) { + config.inConfs[i].setMemDesc( + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); + } - std::vector srcMemoryDescs; - srcMemoryDescs.push_back(config.inConfs[0].getMemDesc()->cloneWithNewDims(inDims)); - for (size_t i = 1; i < config.inConfs.size(); i++) { - srcMemoryDescs.push_back(config.inConfs[i].getMemDesc()->clone()); - } - std::vector dstMemoryDescs; - dstMemoryDescs.push_back(config.outConfs[0].getMemDesc()->cloneWithNewDims(outDims)); - for (size_t i = 1; i < config.outConfs.size(); i++) { - dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()->clone()); - } + for (size_t i = 0; i < getChildEdges().size(); ++i) { + config.outConfs[i].setMemDesc( + creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); + } - return AclDeconvExecutorBuilder::customIsSupported(deconvAttrs, srcMemoryDescs, dstMemoryDescs); - }; - useACL = checkDesc(LayoutType::ncsp); - } + std::vector srcMemoryDescs; + srcMemoryDescs.push_back(config.inConfs[0].getMemDesc()->cloneWithNewDims(inDims)); + for (size_t i = 1; i < config.inConfs.size(); i++) { + srcMemoryDescs.push_back(config.inConfs[i].getMemDesc()->clone()); + } + std::vector dstMemoryDescs; + dstMemoryDescs.push_back(config.outConfs[0].getMemDesc()->cloneWithNewDims(outDims)); + for (size_t i = 1; i < config.outConfs.size(); i++) { + dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()->clone()); + } + + return AclDeconvExecutorBuilder::customIsSupported(deconvAttrs, srcMemoryDescs, dstMemoryDescs); + }; + useACL = checkDesc(LayoutType::nspc) || checkDesc(LayoutType::ncsp); if (useACL) return; #endif @@ -875,11 +873,11 @@ void Deconvolution::prepareParams() { if (useACL) { std::vector srcMemoryDescs; for (size_t i = 0; i < getOriginalInputsNumber(); i++) { - srcMemoryDescs.push_back(getParentEdgesAtPort(i).front()->getMemory().getDescWithType()); + srcMemoryDescs.push_back(getParentEdgesAtPort(i).front()->getMemory().getDescPtr()); } std::vector dstMemoryDescs; for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { - dstMemoryDescs.push_back(getChildEdgesAtPort(i).front()->getMemory().getDescWithType()); + dstMemoryDescs.push_back(getChildEdgesAtPort(i).front()->getMemory().getDescPtr()); } execPtrDeconv = selected_pd->getExecutorFactoryAs()->makeExecutor(deconvAttrs, srcMemoryDescs, @@ -1253,6 +1251,7 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::gemm_acl, factory); }; + pushDesc(LayoutType::nspc); pushDesc(LayoutType::ncsp); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp index 692d9d8f370d49..69ce589f9667fd 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp @@ -15,26 +15,49 @@ ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs, const std::vector& dstDescs) { auto srcDims = srcDescs[0]->getShape().getDims(); auto weiDims = srcDescs[1]->getShape().getDims(); +// std::cout << (srcDescs[0]->hasLayoutType(LayoutType::nspc) ? "nhwc" : "nchw") << std::endl; +// std::cout << weiDims[0] << " | " << weiDims[1] << " | " << weiDims[2] << " | " << weiDims[3] << " | " << std::endl; // swap input and output channels dimensions to be align with ACL // weights tensor shape is changed because ACL expects [O, I, H, W] tensor while OV uses [I, O, H, W] tensor std::swap(weiDims[0], weiDims[1]); +// std::cout << weiDims[0] << " | " << weiDims[1] << " | " << weiDims[2] << " | " << weiDims[3] << " | " << std::endl; auto dstDims = dstDescs[0]->getShape().getDims(); - VectorDims biasDims; - TensorInfo biasTensorInfo; - + arm_compute::TensorShape srcVecDims = shapeCast(srcDims); + arm_compute::TensorShape weiVecDims = shapeCast(weiDims); + arm_compute::TensorShape dstVecDims = shapeCast(dstDims); + arm_compute::TensorShape biasVecDims; if (deconvAttrs.withBiasesParam) { - biasDims = srcDescs[2]->getShape().getStaticDims(); - biasTensorInfo = TensorInfo(shapeCast(biasDims), 1, - precisionToAclDataType(srcDescs[2]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2])); + biasVecDims = shapeCast(srcDescs[2]->getShape().getDims()); + } + if (srcDescs[0]->hasLayoutType(LayoutType::nspc)) { + auto dim_size = srcDescs[0]->getShape().getDims().size(); + auto mover = [&dim_size](TensorShape &_shape) { + if (dim_size > 4) { std::swap(_shape[2], _shape[3]); } + if (dim_size > 3) { std::swap(_shape[1], _shape[2]); } + if (dim_size > 2) { std::swap(_shape[0], _shape[1]); } + }; + mover(srcVecDims); + mover(weiVecDims); + mover(dstVecDims); + if (deconvAttrs.withBiasesParam) { + mover(biasVecDims); + } } - TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1, + std::cout << weiVecDims[0] << " | " << weiVecDims[1] << " | " << weiVecDims[2] << " | " << weiVecDims[3] << " | " << std::endl; + std::cout << weiVecDims[0] << " ======================== " << std::endl; + TensorInfo srcTensorInfo = TensorInfo(srcVecDims, 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0])); - TensorInfo weiTensorInfo = TensorInfo(shapeCast(weiDims), 1, + TensorInfo weiTensorInfo = TensorInfo(weiVecDims, 1, precisionToAclDataType(srcDescs[1]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[1])); - TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1, + TensorInfo dstTensorInfo = TensorInfo(dstVecDims, 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0])); + TensorInfo biasTensorInfo; + if (deconvAttrs.withBiasesParam) { + biasTensorInfo = TensorInfo(biasVecDims, 1, + precisionToAclDataType(srcDescs[2]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2])); + } unsigned int pad_l = (deconvAttrs.paddingL.size() > 1) ? static_cast(deconvAttrs.paddingL.at(1)) : static_cast(deconvAttrs.paddingL.at(0)); @@ -114,9 +137,37 @@ static void transpose_to_1023(const MemoryCPtr& srcMemPtr, std::vector& d }); } +//static void transpose_to_0231(VectorDims new_dims, std::vector& dst_data) { +// const unsigned long DIM0 = new_dims[0]; +// const unsigned long DIM1 = new_dims[1]; +// const unsigned long DIM2 = new_dims[2]; +// const unsigned long DIM3 = new_dims[3]; +// +// parallel_for3d(DIM0, DIM1, DIM2, [&](unsigned long dim0, unsigned long dim1, unsigned long dim2) { +// for (int dim3 = 0; dim3 < DIM3; ++dim3) { +// unsigned long src_off = dim0 * DIM1 * DIM2 * DIM3 + +// dim1 * DIM2 * DIM3 + +// dim2 * DIM3 + +// dim3; +// unsigned long dst_off = dim0 * DIM2 * DIM3 * DIM1 + +// dim2 * DIM3 * DIM1 + +// dim3 * DIM1 + +// dim1; +// +// std::swap(dst_data[dst_off], dst_data[src_off]); +// } +// }); +//} + void AclDeconvExecutor::exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) { // TODO: Remove transpose from exec transpose_to_1023(src[1], weiBuffer); +// std::cout << src[1]->getShape().getDims()[0] << " * " << +// src[1]->getShape().getDims()[1] << " * " << +// src[1]->getShape().getDims()[2] << " * " << +// src[1]->getShape().getDims()[3] << " * " << std::endl; +// VectorDims vec_dims = {12, 6, 3, 3}; +// transpose_to_0231(vec_dims, weiBuffer); srcTensor.allocator()->import_memory(src[0]->getData()); dstTensor.allocator()->import_memory(dst[0]->getData()); @@ -193,28 +244,17 @@ bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, TensorInfo dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo; PadStrideInfo deconv_info = aclDeconvTensorInfo.deconv_info; - unsigned int kernel_x = (deconvAttrs.kernel.size() > 1) ? deconvAttrs.kernel.at(1) : deconvAttrs.kernel.at(0); - unsigned int kernel_y = deconvAttrs.kernel.at(0); - // After stride=8 up-sampling in ACL Deconvolution layer slower than reference - if (deconv_info.stride().first >= 8 || deconv_info.stride().second >= 8) return false; + if (deconv_info.stride().first >= 8 || deconv_info.stride().second >= 8) { + DEBUG_LOG("AclDeconvExecutor does not support strides > 8:"); + return false; + } unsigned int dilation_x = (deconvAttrs.dilation.size() > 1) ? deconvAttrs.dilation.at(1) : deconvAttrs.dilation.at(0); unsigned int dilation_y = deconvAttrs.dilation.at(0); if (!one_of(dilation_x, static_cast(0), static_cast(1)) || !one_of(dilation_y, static_cast(0), static_cast(1))) return false; - size_t in_h = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[2] : srcDescs[0]->getShape().getDims()[1]; - size_t in_w = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[3] : srcDescs[0]->getShape().getDims()[2]; - - // Validate function has bug (https://github.com/ARM-software/ComputeLibrary/issues/1061) with error exception. - // We copy deconvolution_output_dimensions function for get correct validation - // TODO: remove after fix - if (validate_deconvolution_output_dimensions(in_w, in_h, kernel_x, kernel_y, deconv_info)) { - DEBUG_LOG("NEDeconvolutionLayer arm_compute::deconvolution_output_dimensions failed"); - return false; - } - arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo, &weiTensorInfo, deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr, @@ -228,21 +268,5 @@ bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs, return true; } -bool AclDeconvExecutorBuilder::validate_deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, - unsigned int kernel_width, - unsigned int kernel_height, - const PadStrideInfo &pad_stride_info) { - const unsigned int pad_left = pad_stride_info.pad_left(); - const unsigned int pad_top = pad_stride_info.pad_top(); - const unsigned int pad_right = pad_stride_info.pad_right(); - const unsigned int pad_bottom = pad_stride_info.pad_bottom(); - const unsigned int stride_x = pad_stride_info.stride().first; - const unsigned int stride_y = pad_stride_info.stride().second; - - if (!((in_width < 1 || in_height < 1) || - (((in_width - 1) * stride_x + kernel_width) < (pad_left + pad_right)) || - (((in_height - 1) * stride_y + kernel_height) < (pad_top + pad_bottom)))) { return false; } - return true; -} } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/convolution_backprop_data.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/convolution_backprop_data.cpp index c98b70f8fc1328..0b30f8c68b96fa 100755 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/convolution_backprop_data.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/convolution_backprop_data.cpp @@ -66,7 +66,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Deconv_2D_Planar_FP32, ::testing::ValuesIn(Planar_2D_inputs_smoke), ::testing::Values(ElementType::f32), ::testing::ValuesIn(fusingParamsSet), - ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl})), + ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl, conv_gemm_acl_2D_nspc})), ::testing::Values(CPUTestUtils::empty_plugin_config)), DeconvolutionLayerCPUTest::getTestCaseName); @@ -76,7 +76,7 @@ INSTANTIATE_TEST_SUITE_P(nightly_Deconv_2D_Planar_FP32, ::testing::ValuesIn(Planar_2D_inputs_nightly), ::testing::Values(ElementType::f32), ::testing::ValuesIn(fusingParamsSet), - ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl})), + ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl, conv_gemm_acl_2D_nspc})), ::testing::Values(CPUTestUtils::empty_plugin_config)), DeconvolutionLayerCPUTest::getTestCaseName); @@ -109,7 +109,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Deconv_2D_AutoPadding_FP32, ::testing::ValuesIn(inputs_2D_AutoPadding), ::testing::Values(ElementType::f32), ::testing::Values(emptyFusingSpec), - ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl, conv_avx512_2D})), + ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl, conv_gemm_acl_2D_nspc, conv_avx512_2D})), ::testing::Values(CPUTestUtils::empty_plugin_config)), DeconvolutionLayerCPUTest::getTestCaseName);