From c205f4b797642ef4bb27520e9a3771f02dd5eff6 Mon Sep 17 00:00:00 2001
From: co63oc <co63@163.com>
Date: Sun, 14 Apr 2024 07:44:00 +0800
Subject: [PATCH] Fix

---
 paddle/fluid/operators/fused/CMakeLists.txt   |   5 -
 .../fused/fusion_conv_inception_op.cc         | 130 -------
 .../fused/fusion_conv_inception_op.cu         | 350 ------------------
 3 files changed, 485 deletions(-)
 delete mode 100644 paddle/fluid/operators/fused/fusion_conv_inception_op.cc
 delete mode 100644 paddle/fluid/operators/fused/fusion_conv_inception_op.cu
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index ced20a0108a52..713ad1931ce23 100755
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -6,7 +6,6 @@ endif()
 register_operators(
   EXCLUDES
   fused_bn_activation_op
-  fusion_conv_inception_op
   yolo_box_head_op
   yolo_box_post_op
   fusion_group_op
@@ -39,11 +38,7 @@ if(WITH_GPU OR WITH_ROCM)
     op_library(fused_bn_activation_op)
   endif()
   # HIP not support cudnnTransformTensor
-  # fusion_conv_inception_op needs cudnn 7 above
   # HIP not support cudnnConvolutionBiasActivationForward
-  if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100))
-    op_library(fusion_conv_inception_op)
-  endif()
   op_library(yolo_box_head_op)
   op_library(yolo_box_post_op)
   op_library(fused_gate_attention_op)
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc
deleted file mode 100644
index 6e1536e9934a2..0000000000000
--- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h"
-
-namespace paddle {
-namespace operators {
-
-class ConvInceptionFusionOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    // 1 x
-    auto in_dims = ctx->GetInputDim("Input");
-    // 4 filters
-    auto w_dims = ctx->GetInputsDim("Filter");
-
-    PADDLE_ENFORCE_EQ(
-        in_dims.size(),
-        4,
-        phi::errors::InvalidArgument("Conv intput should be 4-D tensor."));
-    PADDLE_ENFORCE_EQ(
-        w_dims.size(),
-        4,
-        phi::errors::InvalidArgument("There should be 4 filters."));
-    PADDLE_ENFORCE_EQ(w_dims[0][1],
-                      in_dims[1],
-                      phi::errors::InvalidArgument(
-                          "Invalid filter channel number %d, which should be "
-                          "equal to input channel number %d.",
-                          w_dims[0][1],
-                          in_dims[1]));
-    PADDLE_ENFORCE_EQ(w_dims[1][1],
-                      in_dims[1],
-                      phi::errors::InvalidArgument(
-                          "Invalid filter channel number %d, which should be "
-                          "equal to input channel number %d.",
-                          w_dims[1][1],
-                          in_dims[1]));
-
-    int n = in_dims[0];
-    // compute output channel
-    // 1st channel
-    int c = w_dims[0][0];
-    // add 2nd channel
-    c += (w_dims[1][0] - w_dims[2][1] * 2);
-    // add 3rd channel
-    c += (w_dims[2][0] - w_dims[3][1]);
-    // add 4-th channel
-    c += w_dims[3][0];
-
-    int h = in_dims[2];
-    int w = in_dims[3];
-
-    ctx->SetOutputDim("Output", {n, c, h, w});
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
-                          ctx.GetPlace());
-  }
-};
-
-class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker {
- protected:
-  void Make() override {
-    AddInput("Input", "(Tensor) NCHW layout.");
-    AddInput("Filter", "(vector<Tensor>) 4 aggregated filters").AsDuplicable();
-    AddInput("Bias", "(vector<Tensor>) it's length is equal to Filter")
-        .AsDuplicable();
-    AddOutput("Output",
-              "(Tensor) The output tensor of convolution operator. "
-              "The format of output tensor is also NCHW.");
-    AddOutput("TempOutput", "").AsDuplicable();
-    AddAttr<std::string>(
-        "pooling_type",
-        "(string), pooling type, can be \"max\" for max-pooling "
-        "and \"avg\" for average-pooling.")
-        .InEnum({"max", "avg"});
-    AddAttr<bool>(
-        "exclusive",
-        "(bool, default True) When true, will exclude the zero-padding in the "
-        "averaging calculating, otherwise, include the zero-padding. Note, it "
-        "is only used when pooling_type is avg. The default is True.")
-        .SetDefault(true);
-    AddAttr<std::string>(
-        "activation",
-        "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' "
-        "'relux' , 'tanh', 'band_pass'")
-        .SetDefault("relu");
-    AddAttr<int>("workspace_size_MB",
-                 "Only used in cudnn kernel. Need set use_cudnn to true."
-                 "workspace size for cudnn, in MB, "
-                 "workspace is a section of GPU memory which will be "
-                 "allocated/freed each time the operator runs, larger "
-                 "workspace size can increase performance but also requires "
-                 "better hardware. This size should be chosen carefully.")
-        .SetDefault(phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB());
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(
-    conv2d_inception_fusion,
-    ops::ConvInceptionFusionOp,
-    ops::ConvInceptionFusionOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
deleted file mode 100644
index 63f065e0fef49..0000000000000
--- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+++ /dev/null
@@ -1,350 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-#include "paddle/phi/kernels/gpudnn/conv_gpudnn_info.h"
-
-namespace paddle {
-namespace operators {
-
-#if CUDNN_VERSION >= 7100
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
-using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
-using ScopedActivationDescriptor = platform::ScopedActivationDescriptor;
-using DataLayout = platform::DataLayout;
-
-using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
-using PoolingMode = platform::PoolingMode;
-template <typename T>
-using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
-
-template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
-
-template <typename T, typename DeviceContext>
-class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-#if CUDNN_VERSION < 7100
-    PADDLE_THROW(phi::errors::Unimplemented(
-        "The conv2d_inception_fusion operator is not supported on GPU "
-        "when CUDNN version < 7.1.0"));
-#endif
-    auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
-    auto* input = ctx.Input<phi::DenseTensor>("Input");
-    auto filters = ctx.MultiInput<phi::DenseTensor>("Filter");
-    auto bias = ctx.MultiInput<phi::DenseTensor>("Bias");
-
-    auto* output = ctx.Output<phi::DenseTensor>("Output");
-    auto temp_outs = ctx.MultiOutput<phi::DenseTensor>("TempOutput");
-
-    const std::string pool_type = ctx.Attr<std::string>("pooling_type");
-    const std::string activation = ctx.Attr<std::string>("activation");
-    const bool exclusive = ctx.Attr<bool>("exclusive");
-
-    int64_t user_workspace_size =
-        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
-
-    const T* input_data = input->data<T>();
-    T* output_data = dev_ctx.Alloc<T>(output, output->numel() * sizeof(T));
-    temp_outs[0]->Resize(input->dims());
-    T* temp_data =
-        dev_ctx.Alloc<T>(temp_outs[0], temp_outs[0]->numel() * sizeof(T));
-
-    DataLayout layout = DataLayout::kNCHW;
-    std::vector<int> in_dim = common::vectorize<int>(input->dims());
-
-    // ------------------- cudnn descriptors ---------------------
-    PoolingMode pooling_mode;
-    if (pool_type == "max") {
-      pooling_mode = PoolingMode::kMaximum;
-    } else {
-      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
-                               : (PoolingMode::kAverageInclusive);
-    }
-    std::vector<int> k0x0 = {0, 0};
-    std::vector<int> k1x1 = {1, 1};
-    std::vector<int> k1x1_2 = {1, 1};
-    std::vector<int> k3x3 = {3, 3};
-    ScopedPoolingDescriptor pool_desc;
-    ScopedActivationDescriptor act_desc;
-    ScopedTensorDescriptor out_pool_desc;
-    ScopedTensorDescriptor input_desc;
-    cudnnPoolingDescriptor_t cudnn_pool_desc =
-        pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1);
-
-    cudnnTensorDescriptor_t cudnn_input_desc =
-        input_desc.descriptor<T>(layout, common::vectorize<int>(input->dims()));
-    cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor<T>(
-        layout, common::vectorize<int>(input->dims()));
-
-    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
-    cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4];
-    cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4];
-    cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4];
-    cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4];
-    cudnnConvolutionDescriptor_t* conv_desc =
-        new cudnnConvolutionDescriptor_t[4];
-    for (int i = 0; i < 4; ++i) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i]));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i]));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i]));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i]));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i]));
-    }
-
-    std::vector<std::vector<int>> filter_dims;
-    std::vector<std::vector<int>> bias_dims;
-    std::vector<std::vector<int>> in_dims;
-    std::vector<std::vector<int>> out_dims;
-    std::vector<std::vector<int>> in_strides;
-    std::vector<std::vector<int>> out_strides;
-    std::vector<std::vector<int>> bias_strides;
-
-    cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW;
-    int n = in_dim[0];
-    int h = in_dim[2];
-    int w = in_dim[3];
-    int oc = output->dims()[1];
-
-    cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE)
-                                       ? CUDNN_DATA_DOUBLE
-                                       : CUDNN_DATA_FLOAT;
-
-    for (int i = 0; i < 4; ++i) {
-      filter_dims.push_back(common::vectorize<int>(filters[i]->dims()));
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor(
-          filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data()));
-      bias_dims.push_back({1, filter_dims[i][0], 1, 1});
-      bias_strides.push_back({filter_dims[i][0], 1, 1, 1});
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          bias_desc[i],
-          cudnn_dtype,
-          4,
-          bias_dims[i].data(),
-          bias_strides[i].data()));
-      in_dims.push_back({n, filter_dims[i][1], h, w});
-      out_dims.push_back({n, filter_dims[i][0], h, w});
-      in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1});
-      out_strides.push_back({oc * h * w, h * w, w, 1});
-
-      if (i < 2) {
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnSetConvolutionNdDescriptor(
-                conv_desc[i],
-                2,
-                k0x0.data(),
-                k1x1.data(),
-                k1x1.data(),
-                CUDNN_CROSS_CORRELATION,
-                compute_type));
-      } else {
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnSetConvolutionNdDescriptor(
-                conv_desc[i],
-                2,
-                k1x1.data(),
-                k1x1.data(),
-                k1x1.data(),
-                CUDNN_CROSS_CORRELATION,
-                compute_type));
-      }
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
-          conv_desc[i], CUDNN_DEFAULT_MATH));
-#if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000
-      if (!phi::allow_tf32_cudnn) {
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnSetConvolutionMathType(conv_desc[i],
-                                                           CUDNN_FMA_MATH));
-      }
-#endif  // CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000
-    }
-    in_dims[2][1] *= 2;
-    in_strides[2][0] = oc * h * w;
-    out_strides[2][0] = filter_dims[2][0] * h * w;  // this out is continuous.
-    in_strides[3][0] = filter_dims[2][0] * h * w;
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2));
-
-    cudnnConvolutionFwdAlgo_t algo[4];
-    auto handle = dev_ctx.cudnn_handle();
-    size_t workspace_size_in_bytes = 0;  // final workspace to allocate.
-
-    size_t workspace_size_limit = 0;
-    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
-      int64_t max_user_size =
-          std::min(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
-                   user_workspace_size);
-      workspace_size_limit = max_user_size * 1024 * 1024;
-    }
-
-    for (int i = 0; i < 4; ++i) {
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-          in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data()));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnSetTensorNdDescriptor(out_desc[i],
-                                                        cudnn_dtype,
-                                                        4,
-                                                        out_dims[i].data(),
-                                                        out_strides[i].data()));
-
-      int perf_count;
-      int best_algo_idx = 0;
-      size_t tmp_size = 0;
-      std::unique_ptr<cudnnConvolutionFwdAlgoPerf_t[]> perf_results(
-          new cudnnConvolutionFwdAlgoPerf_t[phi::kNUM_CUDNN_FWD_ALGS]);
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
-              handle,
-              in_desc[i],
-              filter_desc[i],
-              conv_desc[i],
-              out_desc[i],
-              phi::kNUM_CUDNN_FWD_ALGS,
-              &perf_count,
-              perf_results.get()));
-      algo[i] = (perf_results.get())[best_algo_idx].algo;
-
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
-              handle,
-              in_desc[i],
-              filter_desc[i],
-              conv_desc[i],
-              out_desc[i],
-              algo[i],
-              &tmp_size));
-
-      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
-    }
-    cudnnActivationDescriptor_t cudnn_act_desc =
-        act_desc.descriptor<T>(activation);
-
-    int oc0 = filter_dims[0][0];
-    int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2;
-    int oc3 = filter_dims[3][0];
-    int oc2 = oc - oc0 - oc1 - oc3;
-
-    // branch1: pool + 1x1 conv
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnPoolingForward(handle,
-                                               cudnn_pool_desc,
-                                               &alpha,
-                                               cudnn_input_desc,
-                                               input_data,
-                                               &beta,
-                                               pool_out_desc,
-                                               temp_data));
-
-    std::vector<const void*> in_datas;
-    in_datas.push_back(static_cast<const void*>(temp_data));
-    in_datas.push_back(static_cast<const void*>(input_data));
-    in_datas.push_back(
-        static_cast<const void*>(output_data + (oc0 + oc1) * h * w));
-    temp_outs[1]->Resize(common::make_ddim(out_dims[2]));
-    T* temp2_data =
-        dev_ctx.Alloc<T>(temp_outs[1], temp_outs[1]->numel() * sizeof(T));
-    in_datas.push_back(static_cast<const void*>(temp2_data + oc2 * h * w));
-
-    std::vector<void*> out_datas;
-    out_datas.push_back(static_cast<void*>(output_data));
-    out_datas.push_back(static_cast<void*>(output_data + oc0 * h * w));
-    out_datas.push_back(static_cast<void*>(temp2_data));
-    out_datas.push_back(
-        static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));
-
-    for (int i = 0; i < 4; ++i) {
-      auto func = [&](void* cudnn_workspace) {
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnConvolutionBiasActivationForward(
-                handle,
-                &alpha,
-                in_desc[i],
-                in_datas[i],
-                filter_desc[i],
-                static_cast<const void*>(filters[i]->data<T>()),
-                conv_desc[i],
-                algo[i],
-                cudnn_workspace,
-                workspace_size_in_bytes,
-                &beta,
-                out_desc[i],
-                out_datas[i],
-                bias_desc[i],
-                static_cast<const void*>(bias[i]->data<T>()),
-                cudnn_act_desc,
-                out_desc[i],
-                out_datas[i]));
-      };
-      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-      workspace_handle.RunFunc(func, workspace_size_in_bytes);
-    }
-
-    cudnnTensorDescriptor_t x_desc;
-    cudnnTensorDescriptor_t y_desc;
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&x_desc));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnCreateTensorDescriptor(&y_desc));
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data()));
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
-        y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data()));
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnTransformTensor(
-        handle,
-        CudnnDataType<T>::kOne(),
-        x_desc,
-        static_cast<const void*>(out_datas[2]),
-        CudnnDataType<T>::kZero(),
-        y_desc,
-        static_cast<void*>(output_data + (oc0 + oc1) * h * w)));
-
-    for (int i = 0; i < 4; ++i) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i]));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i]));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i]));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i]));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i]));
-    }
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(x_desc));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        platform::dynload::cudnnDestroyTensorDescriptor(y_desc));
-  }
-};
-#endif
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-PD_REGISTER_STRUCT_KERNEL(conv2d_inception_fusion,
-                          GPU,
-                          ALL_LAYOUT,
-                          ops::CUDNNConvInceptionFusionOpKernel,
-                          float,
-                          double) {}