new fix of dynamic shapes
allnes committed Jan 12, 2024
1 parent 4ca8f15 commit 45facf2
Showing 3 changed files with 96 additions and 73 deletions.
59 changes: 29 additions & 30 deletions src/plugins/intel_cpu/src/nodes/deconv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,37 +474,35 @@ void Deconvolution::getSupportedDescriptors() {
config.outConfs.resize(getOriginalOutputsNumber());

auto& creatorsMap = BlockedDescCreator::getCommonCreators();
for (size_t i = 0; i < getParentEdges().size(); ++i) {
auto checkDesc = [&](LayoutType format) -> bool {
NodeConfig config;
config.inConfs.resize(getParentEdges().size());
config.outConfs.resize(getOriginalOutputsNumber());

for (size_t i = 0; i < getParentEdges().size(); ++i) {
config.inConfs[i].setMemDesc(
creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i)));
}
auto checkDesc = [&](LayoutType format) -> bool {
NodeConfig config;
config.inConfs.resize(getParentEdges().size());
config.outConfs.resize(getOriginalOutputsNumber());

for (size_t i = 0; i < getChildEdges().size(); ++i) {
config.outConfs[i].setMemDesc(
creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i)));
}
for (size_t i = 0; i < getParentEdges().size(); ++i) {
config.inConfs[i].setMemDesc(
creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i)));
}

std::vector<MemoryDescPtr> srcMemoryDescs;
srcMemoryDescs.push_back(config.inConfs[0].getMemDesc()->cloneWithNewDims(inDims));
for (size_t i = 1; i < config.inConfs.size(); i++) {
srcMemoryDescs.push_back(config.inConfs[i].getMemDesc()->clone());
}
std::vector<MemoryDescPtr> dstMemoryDescs;
dstMemoryDescs.push_back(config.outConfs[0].getMemDesc()->cloneWithNewDims(outDims));
for (size_t i = 1; i < config.outConfs.size(); i++) {
dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()->clone());
}
for (size_t i = 0; i < getChildEdges().size(); ++i) {
config.outConfs[i].setMemDesc(
creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i)));
}

return AclDeconvExecutorBuilder::customIsSupported(deconvAttrs, srcMemoryDescs, dstMemoryDescs);
};
useACL = checkDesc(LayoutType::ncsp);
}
std::vector<MemoryDescPtr> srcMemoryDescs;
srcMemoryDescs.push_back(config.inConfs[0].getMemDesc()->cloneWithNewDims(inDims));
for (size_t i = 1; i < config.inConfs.size(); i++) {
srcMemoryDescs.push_back(config.inConfs[i].getMemDesc()->clone());
}
std::vector<MemoryDescPtr> dstMemoryDescs;
dstMemoryDescs.push_back(config.outConfs[0].getMemDesc()->cloneWithNewDims(outDims));
for (size_t i = 1; i < config.outConfs.size(); i++) {
dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()->clone());
}

return AclDeconvExecutorBuilder::customIsSupported(deconvAttrs, srcMemoryDescs, dstMemoryDescs);
};
useACL = checkDesc(LayoutType::nspc) || checkDesc(LayoutType::ncsp);
if (useACL) return;
#endif
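The hunk above is the heart of the fix: the support probe is hoisted out of the per-edge loop into a single checkDesc lambda that is parameterized by layout, tried channels-last (nspc) first and planar (ncsp) second, while cloneWithNewDims substitutes the concrete runtime dims into the otherwise dynamic descriptors before asking ACL whether it can handle them. A minimal standalone sketch of that probing pattern follows; the names (chooseLayout, isSupported) are illustrative stand-ins, not the OpenVINO API.

#include <functional>
#include <initializer_list>
#include <iostream>

enum class LayoutType { ncsp, nspc };  // planar (NCHW-like) vs channels-last (NHWC-like)

// Return the first layout the backend accepts, probing in preference order.
bool chooseLayout(const std::function<bool(LayoutType)>& isSupported,
                  std::initializer_list<LayoutType> preference,
                  LayoutType& chosen) {
    for (LayoutType fmt : preference) {
        if (isSupported(fmt)) {
            chosen = fmt;
            return true;
        }
    }
    return false;
}

int main() {
    // Stand-in for AclDeconvExecutorBuilder::customIsSupported: pretend the
    // backend only accepts channels-last tensors.
    auto isSupported = [](LayoutType fmt) { return fmt == LayoutType::nspc; };

    LayoutType chosen = LayoutType::ncsp;
    if (chooseLayout(isSupported, {LayoutType::nspc, LayoutType::ncsp}, chosen))
        std::cout << (chosen == LayoutType::nspc ? "nspc" : "ncsp") << "\n";
}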

@@ -875,11 +873,11 @@ void Deconvolution::prepareParams() {
     if (useACL) {
         std::vector<MemoryDescPtr> srcMemoryDescs;
         for (size_t i = 0; i < getOriginalInputsNumber(); i++) {
-            srcMemoryDescs.push_back(getParentEdgesAtPort(i).front()->getMemory().getDescWithType<DnnlMemoryDesc>());
+            srcMemoryDescs.push_back(getParentEdgesAtPort(i).front()->getMemory().getDescPtr());
         }
         std::vector<MemoryDescPtr> dstMemoryDescs;
         for (size_t i = 0; i < getOriginalOutputsNumber(); i++) {
-            dstMemoryDescs.push_back(getChildEdgesAtPort(i).front()->getMemory().getDescWithType<DnnlMemoryDesc>());
+            dstMemoryDescs.push_back(getChildEdgesAtPort(i).front()->getMemory().getDescPtr());
         }
 
         execPtrDeconv = selected_pd->getExecutorFactoryAs<DeconvExecutorFactory>()->makeExecutor(deconvAttrs, srcMemoryDescs,
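Both loops now hand the executor factory the node's actual descriptors via getDescPtr() instead of forcing a DnnlMemoryDesc view, presumably because on the ACL path the memory is not oneDNN-described and, with dynamic shapes, the concrete descriptor type is only known at runtime. A hedged sketch of the failure mode the downcast invites, with stand-in types rather than the real OpenVINO classes:

#include <memory>
#include <vector>

struct MemoryDesc { virtual ~MemoryDesc() = default; };
struct DnnlMemoryDesc : MemoryDesc {};  // oneDNN-backed descriptor (stand-in)
struct AclMemoryDesc : MemoryDesc {};   // ACL-backed descriptor (stand-in)

using MemoryDescPtr = std::shared_ptr<MemoryDesc>;

int main() {
    std::vector<MemoryDescPtr> descs = {std::make_shared<AclMemoryDesc>()};
    // Forcing a backend-specific type breaks when the memory belongs to
    // another backend: the cast yields nullptr here.
    auto asDnnl = std::dynamic_pointer_cast<DnnlMemoryDesc>(descs[0]);
    // The base-class pointer is always usable, whatever the backend.
    MemoryDescPtr generic = descs[0];
    return (asDnnl == nullptr && generic != nullptr) ? 0 : 1;
}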
@@ -1253,6 +1251,7 @@ void Deconvolution::initSupportedPrimitiveDescriptors() {
 
         supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::gemm_acl, factory);
     };
+    pushDesc(LayoutType::nspc);
     pushDesc(LayoutType::ncsp);
 }
104 changes: 64 additions & 40 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp

@@ -15,26 +15,49 @@ ACLDeconvTensorInfo getACLDeconvTensorInfo(const DeconvAttrs& deconvAttrs,
                                            const std::vector<MemoryDescPtr>& dstDescs) {
     auto srcDims = srcDescs[0]->getShape().getDims();
     auto weiDims = srcDescs[1]->getShape().getDims();
+    // std::cout << (srcDescs[0]->hasLayoutType(LayoutType::nspc) ? "nhwc" : "nchw") << std::endl;
+    // std::cout << weiDims[0] << " | " << weiDims[1] << " | " << weiDims[2] << " | " << weiDims[3] << " | " << std::endl;
-    // swap input and output channels dimensions to be align with ACL
+    // the weights tensor shape is changed because ACL expects an [O, I, H, W] tensor while OV uses [I, O, H, W]
     std::swap(weiDims[0], weiDims[1]);
+    // std::cout << weiDims[0] << " | " << weiDims[1] << " | " << weiDims[2] << " | " << weiDims[3] << " | " << std::endl;
     auto dstDims = dstDescs[0]->getShape().getDims();
 
-    VectorDims biasDims;
-    TensorInfo biasTensorInfo;
-
+    arm_compute::TensorShape srcVecDims = shapeCast(srcDims);
+    arm_compute::TensorShape weiVecDims = shapeCast(weiDims);
+    arm_compute::TensorShape dstVecDims = shapeCast(dstDims);
+    arm_compute::TensorShape biasVecDims;
     if (deconvAttrs.withBiasesParam) {
-        biasDims = srcDescs[2]->getShape().getStaticDims();
-        biasTensorInfo = TensorInfo(shapeCast(biasDims), 1,
-            precisionToAclDataType(srcDescs[2]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2]));
+        biasVecDims = shapeCast(srcDescs[2]->getShape().getDims());
     }
+    if (srcDescs[0]->hasLayoutType(LayoutType::nspc)) {
+        auto dim_size = srcDescs[0]->getShape().getDims().size();
+        auto mover = [&dim_size](TensorShape &_shape) {
+            if (dim_size > 4) { std::swap(_shape[2], _shape[3]); }
+            if (dim_size > 3) { std::swap(_shape[1], _shape[2]); }
+            if (dim_size > 2) { std::swap(_shape[0], _shape[1]); }
+        };
+        mover(srcVecDims);
+        mover(weiVecDims);
+        mover(dstVecDims);
+        if (deconvAttrs.withBiasesParam) {
+            mover(biasVecDims);
+        }
+    }
 
-    TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
+    std::cout << weiVecDims[0] << " | " << weiVecDims[1] << " | " << weiVecDims[2] << " | " << weiVecDims[3] << " | " << std::endl;
+    std::cout << weiVecDims[0] << " ======================== " << std::endl;
+    TensorInfo srcTensorInfo = TensorInfo(srcVecDims, 1,
         precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
-    TensorInfo weiTensorInfo = TensorInfo(shapeCast(weiDims), 1,
+    TensorInfo weiTensorInfo = TensorInfo(weiVecDims, 1,
         precisionToAclDataType(srcDescs[1]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[1]));
-    TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
+    TensorInfo dstTensorInfo = TensorInfo(dstVecDims, 1,
         precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));
+    TensorInfo biasTensorInfo;
+    if (deconvAttrs.withBiasesParam) {
+        biasTensorInfo = TensorInfo(biasVecDims, 1,
+            precisionToAclDataType(srcDescs[2]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2]));
+    }
 
     unsigned int pad_l =
         (deconvAttrs.paddingL.size() > 1) ? static_cast<unsigned int>(deconvAttrs.paddingL.at(1)) : static_cast<unsigned int>(deconvAttrs.paddingL.at(0));
@@ -114,9 +137,37 @@ static void transpose_to_1023(const MemoryCPtr& srcMemPtr, std::vector<float>& dst_data) {
     });
 }
 
+//static void transpose_to_0231(VectorDims new_dims, std::vector<float>& dst_data) {
+//    const unsigned long DIM0 = new_dims[0];
+//    const unsigned long DIM1 = new_dims[1];
+//    const unsigned long DIM2 = new_dims[2];
+//    const unsigned long DIM3 = new_dims[3];
+//
+//    parallel_for3d(DIM0, DIM1, DIM2, [&](unsigned long dim0, unsigned long dim1, unsigned long dim2) {
+//        for (int dim3 = 0; dim3 < DIM3; ++dim3) {
+//            unsigned long src_off = dim0 * DIM1 * DIM2 * DIM3 +
+//                                    dim1 * DIM2 * DIM3 +
+//                                    dim2 * DIM3 +
+//                                    dim3;
+//            unsigned long dst_off = dim0 * DIM2 * DIM3 * DIM1 +
+//                                    dim2 * DIM3 * DIM1 +
+//                                    dim3 * DIM1 +
+//                                    dim1;
+//
+//            std::swap(dst_data[dst_off], dst_data[src_off]);
+//        }
+//    });
+//}
+
 void AclDeconvExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
     // TODO: Remove transpose from exec
     transpose_to_1023(src[1], weiBuffer);
+    // std::cout << src[1]->getShape().getDims()[0] << " * " <<
+    //              src[1]->getShape().getDims()[1] << " * " <<
+    //              src[1]->getShape().getDims()[2] << " * " <<
+    //              src[1]->getShape().getDims()[3] << " * " << std::endl;
+    // VectorDims vec_dims = {12, 6, 3, 3};
+    // transpose_to_0231(vec_dims, weiBuffer);
 
     srcTensor.allocator()->import_memory(src[0]->getData());
     dstTensor.allocator()->import_memory(dst[0]->getData());
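transpose_to_1023 re-lays the weight tensor at execution time: the name encodes the output axis order {1, 0, 2, 3}, so the first two dimensions are swapped and an OV [I, O, H, W] weight block becomes the [O, I, H, W] block ACL expects (the TODO above notes the copy should eventually move out of exec). A minimal single-threaded rendition follows, with plain vectors in place of the real helper's MemoryCPtr input and parallel_for3d loop; the function name and the tiny example values are illustrative.

#include <cstddef>
#include <vector>

// Permute axes {1, 0, 2, 3} of a dense D0 x D1 x D2 x D3 block.
std::vector<float> transpose_1023(const std::vector<float>& src,
                                  std::size_t D0, std::size_t D1,
                                  std::size_t D2, std::size_t D3) {
    std::vector<float> dst(src.size());
    for (std::size_t d0 = 0; d0 < D0; ++d0)
        for (std::size_t d1 = 0; d1 < D1; ++d1)
            for (std::size_t d2 = 0; d2 < D2; ++d2)
                for (std::size_t d3 = 0; d3 < D3; ++d3) {
                    const std::size_t src_off = ((d0 * D1 + d1) * D2 + d2) * D3 + d3;
                    const std::size_t dst_off = ((d1 * D0 + d0) * D2 + d2) * D3 + d3;
                    dst[dst_off] = src[src_off];
                }
    return dst;
}

int main() {
    // A 2x3x1x1 block, values 0..5 laid out as [I=2][O=3]; after the
    // transpose, element (i, o) lands at offset o * 2 + i.
    std::vector<float> w = {0, 1, 2, 3, 4, 5};
    std::vector<float> t = transpose_1023(w, 2, 3, 1, 1);  // -> {0, 3, 1, 4, 2, 5}
    return t[1] == 3.0f ? 0 : 1;
}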
@@ -193,28 +244,17 @@ bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs,
     TensorInfo dstTensorInfo = aclDeconvTensorInfo.dstTensorInfo;
     PadStrideInfo deconv_info = aclDeconvTensorInfo.deconv_info;
 
-    unsigned int kernel_x = (deconvAttrs.kernel.size() > 1) ? deconvAttrs.kernel.at(1) : deconvAttrs.kernel.at(0);
-    unsigned int kernel_y = deconvAttrs.kernel.at(0);
-
     // With stride >= 8, up-sampling in the ACL Deconvolution layer is slower than the reference implementation
-    if (deconv_info.stride().first >= 8 || deconv_info.stride().second >= 8) return false;
+    if (deconv_info.stride().first >= 8 || deconv_info.stride().second >= 8) {
+        DEBUG_LOG("AclDeconvExecutor does not support strides >= 8");
+        return false;
+    }
 
     unsigned int dilation_x = (deconvAttrs.dilation.size() > 1) ? deconvAttrs.dilation.at(1) : deconvAttrs.dilation.at(0);
     unsigned int dilation_y = deconvAttrs.dilation.at(0);
     if (!one_of(dilation_x, static_cast<unsigned int>(0), static_cast<unsigned int>(1)) ||
         !one_of(dilation_y, static_cast<unsigned int>(0), static_cast<unsigned int>(1))) return false;
 
-    size_t in_h = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[2] : srcDescs[0]->getShape().getDims()[1];
-    size_t in_w = srcDescs[0]->hasLayoutType(LayoutType::ncsp) ? srcDescs[0]->getShape().getDims()[3] : srcDescs[0]->getShape().getDims()[2];
-
-    // The Validate function has a bug (https://github.com/ARM-software/ComputeLibrary/issues/1061) with an error exception.
-    // We copy the deconvolution_output_dimensions function to get correct validation
-    // TODO: remove after fix
-    if (validate_deconvolution_output_dimensions(in_w, in_h, kernel_x, kernel_y, deconv_info)) {
-        DEBUG_LOG("NEDeconvolutionLayer arm_compute::deconvolution_output_dimensions failed");
-        return false;
-    }
-
     arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo,
                                                                              &weiTensorInfo,
                                                                              deconvAttrs.withBiasesParam ? &biasTensorInfo : nullptr,
@@ -228,21 +268,5 @@ bool AclDeconvExecutorBuilder::customIsSupported(const DeconvAttrs &deconvAttrs,
     return true;
 }
 
-bool AclDeconvExecutorBuilder::validate_deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
-                                                                        unsigned int kernel_width,
-                                                                        unsigned int kernel_height,
-                                                                        const PadStrideInfo &pad_stride_info) {
-    const unsigned int pad_left   = pad_stride_info.pad_left();
-    const unsigned int pad_top    = pad_stride_info.pad_top();
-    const unsigned int pad_right  = pad_stride_info.pad_right();
-    const unsigned int pad_bottom = pad_stride_info.pad_bottom();
-    const unsigned int stride_x   = pad_stride_info.stride().first;
-    const unsigned int stride_y   = pad_stride_info.stride().second;
-
-    if (!((in_width < 1 || in_height < 1) ||
-          (((in_width - 1) * stride_x + kernel_width) < (pad_left + pad_right)) ||
-          (((in_height - 1) * stride_y + kernel_height) < (pad_top + pad_bottom)))) { return false; }
-    return true;
-}
 } // namespace intel_cpu
 } // namespace ov
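With the copied validate_deconvolution_output_dimensions workaround deleted (it existed only to dodge the ComputeLibrary issue cited above), customIsSupported reduces to a common capability-gating pattern: cheap attribute checks first, then the library's own validator as the final word. A hedged sketch of that shape, with stand-in types instead of arm_compute; the thresholds mirror the diff, but backendValidate is a placeholder for NEDeconvolutionLayer::validate, not its real logic.

#include <cstdio>

struct Status { bool ok; const char* msg; };  // stand-in for arm_compute::Status

// Placeholder for arm_compute::NEDeconvolutionLayer::validate(...).
static Status backendValidate(unsigned stride_x, unsigned stride_y) {
    return {stride_x < 8 && stride_y < 8, "unsupported stride"};
}

static bool isSupported(unsigned stride_x, unsigned stride_y, unsigned dilation) {
    // Heuristic cut-off from the diff: past stride 8 the ACL path is slower
    // than the reference kernel, so reject it before calling the library.
    if (stride_x >= 8 || stride_y >= 8) return false;
    // Mirrors the one_of(dilation, 0, 1) check kept in the diff.
    if (dilation > 1) return false;
    const Status s = backendValidate(stride_x, stride_y);
    if (!s.ok) {
        std::fprintf(stderr, "validate failed: %s\n", s.msg);
        return false;
    }
    return true;
}

int main() { return isSupported(2, 2, 1) ? 0 : 1; }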
6 changes: 3 additions & 3 deletions in the DeconvolutionLayerCPUTest suite

@@ -66,7 +66,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Deconv_2D_Planar_FP32,
                          ::testing::ValuesIn(Planar_2D_inputs_smoke),
                          ::testing::Values(ElementType::f32),
                          ::testing::ValuesIn(fusingParamsSet),
-                         ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl})),
+                         ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl, conv_gemm_acl_2D_nspc})),
                          ::testing::Values(CPUTestUtils::empty_plugin_config)),
                          DeconvolutionLayerCPUTest::getTestCaseName);

@@ -76,7 +76,7 @@ INSTANTIATE_TEST_SUITE_P(nightly_Deconv_2D_Planar_FP32,
                          ::testing::ValuesIn(Planar_2D_inputs_nightly),
                          ::testing::Values(ElementType::f32),
                          ::testing::ValuesIn(fusingParamsSet),
-                         ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl})),
+                         ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl, conv_gemm_acl_2D_nspc})),
                          ::testing::Values(CPUTestUtils::empty_plugin_config)),
                          DeconvolutionLayerCPUTest::getTestCaseName);

@@ -109,7 +109,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Deconv_2D_AutoPadding_FP32,
                          ::testing::ValuesIn(inputs_2D_AutoPadding),
                          ::testing::Values(ElementType::f32),
                          ::testing::Values(emptyFusingSpec),
-                         ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl, conv_avx512_2D})),
+                         ::testing::ValuesIn(filterCPUInfo({conv_gemm_2D, conv_gemm_2D_acl, conv_gemm_acl_2D_nspc, conv_avx512_2D})),
                          ::testing::Values(CPUTestUtils::empty_plugin_config)),
                          DeconvolutionLayerCPUTest::getTestCaseName);
