diff --git a/cinn/frontend/syntax.cc b/cinn/frontend/syntax.cc index adb725b661df5..7e2ac9196c30f 100644 --- a/cinn/frontend/syntax.cc +++ b/cinn/frontend/syntax.cc @@ -45,7 +45,7 @@ Variable Program::conv2d(const Variable& a, instr.SetAttr(iter.first, iter.second); } AppendInstruction(instr); - return instr.GetOutput(2); + return instr.GetOutput(0); } Variable Program::depthwise_conv2d(const Variable& a, @@ -57,7 +57,7 @@ Variable Program::depthwise_conv2d(const Variable& a, instr.SetAttr(iter.first, iter.second); } AppendInstruction(instr); - return instr.GetOutput(1); + return instr.GetOutput(0); } Variable Program::pool2d(const Variable& a, const std::unordered_map& attr_store) { @@ -67,7 +67,7 @@ Variable Program::pool2d(const Variable& a, const std::unordered_map inputs; std::vector cinn_inputs; std::vector> output_shapes; + LOG(INFO) << "GetOpFunc of op " << node->id(); for (auto& i : node->inlinks_in_order()) { std::string input_id = i->source()->as()->id(); auto in_shape = shape_dict.at(input_id); diff --git a/cinn/hlir/op/CMakeLists.txt b/cinn/hlir/op/CMakeLists.txt index 340a9922493ee..3a97d1f7ac9b9 100644 --- a/cinn/hlir/op/CMakeLists.txt +++ b/cinn/hlir/op/CMakeLists.txt @@ -11,4 +11,3 @@ foreach(cpp ${srcs}) endforeach() cc_test(test_op_broadcast SRCS op_broadcast_test.cc DEPS core) -cc_test(test_op_nn SRCS op_nn_test.cc DEPS core) diff --git a/cinn/hlir/op/nn.cc b/cinn/hlir/op/nn.cc index caefe8a4cc1ab..53213b81526dc 100644 --- a/cinn/hlir/op/nn.cc +++ b/cinn/hlir/op/nn.cc @@ -6,6 +6,7 @@ #include "cinn/hlir/pe/broadcast.h" #include "cinn/hlir/pe/elementwise.h" #include "cinn/ir/node.h" +#include "cinn/poly/stage.h" namespace cinn { namespace hlir { @@ -141,7 +142,6 @@ std::shared_ptr StrategyForConv2d(const framework::NodeAttr &attrs, stride[1], dilation[0], dilation[1], - output_shapes, UniqName("Conv2d_nchw_out")); } else if (data_format == "NHWC") { // A is input: [N, H, W, C], B is filter: [C_out, C_in/group, filter_h, filter_w] @@ -153,7 +153,6 @@ std::shared_ptr StrategyForConv2d(const framework::NodeAttr &attrs, stride[1], dilation[0], dilation[1], - output_shapes, UniqName("Conv2d_nhwc_out")); } else { LOG(FATAL) << "Only support NCHW and NHWC data layout\n"; @@ -177,8 +176,14 @@ std::shared_ptr StrategyForConv2d(const framework::NodeAttr &attrs, CHECK(!args.empty()) << "The input argument of conv2d schedule is empty! 
Please check.\n"; CINNValuePack arg_pack = args[0]; CHECK_EQ(arg_pack.size(), 4UL); - Expr A [[maybe_unused]] = arg_pack[0]; - *ret = arg_pack; + poly::StageMap stages = arg_pack[3]; + Expr input_pad = arg_pack[0]; + CHECK(input_pad.as_tensor()); + stages[input_pad.as_tensor_ref()]->ComputeInline(); + Expr weights_dilation = arg_pack[1]; + CHECK(weights_dilation.as_tensor()); + stages[weights_dilation.as_tensor_ref()]->ComputeInline(); + *ret = CINNValuePack{{arg_pack[2], CINNValue(stages)}}; }); auto strategy = std::make_shared(); @@ -220,30 +225,14 @@ std::vector InferShapeForConv2d(const std::vector &inputs_shap (inputs_shape[0][2] - ((inputs_shape[1][2] - 1) * dilation[0] + 1) + 2 * padding[0]) / stride[0] + 1; int out_shape_w = (inputs_shape[0][3] - ((inputs_shape[1][3] - 1) * dilation[1] + 1) + 2 * padding[1]) / stride[1] + 1; - res = {{inputs_shape[0][0], - inputs_shape[0][1], - inputs_shape[0][2] + 2 * padding[0], - inputs_shape[0][3] + 2 * padding[1]}, - {inputs_shape[1][0], - inputs_shape[1][1], - (inputs_shape[1][2] - 1) * dilation[0] + 1, - (inputs_shape[1][3] - 1) * dilation[1] + 1}, - {inputs_shape[0][0], inputs_shape[1][0], out_shape_h, out_shape_w}}; + res = {{inputs_shape[0][0], inputs_shape[1][0], out_shape_h, out_shape_w}}; } else if (data_format == "NHWC") { // A is input: [N, H, W, C], B is filter: [C_out, C_in/group, filter_h, filter_w] int out_shape_h = (inputs_shape[0][1] - ((inputs_shape[1][2] - 1) * dilation[0] + 1) + 2 * padding[0]) / stride[0] + 1; int out_shape_w = (inputs_shape[0][2] - ((inputs_shape[1][3] - 1) * dilation[1] + 1) + 2 * padding[1]) / stride[1] + 1; - res = {{inputs_shape[0][0], - inputs_shape[0][1] + 2 * padding[0], - inputs_shape[0][2] + 2 * padding[1], - inputs_shape[0][3]}, - {inputs_shape[1][0], - inputs_shape[1][1], - (inputs_shape[1][2] - 1) * dilation[0] + 1, - (inputs_shape[1][3] - 1) * dilation[1] + 1}, - {inputs_shape[0][0], out_shape_h, out_shape_w, inputs_shape[1][0]}}; + res = {{inputs_shape[0][0], out_shape_h, out_shape_w, inputs_shape[1][0]}}; } else { LOG(FATAL) << "Only support NCHW and NHWC data layout\n"; } @@ -252,7 +241,7 @@ std::vector InferShapeForConv2d(const std::vector &inputs_shap std::vector InferDtypeForConv2d(const std::vector &inputs_type, const framework::NodeAttr &attrs) { CHECK(!inputs_type.empty()) << "The input's type size is 0! 
Please check again."; - std::vector res{inputs_type[0], inputs_type[1], inputs_type[0]}; + std::vector res{inputs_type[0]}; return res; } @@ -293,7 +282,6 @@ std::shared_ptr StrategyForDepthwiseConv2d(const framework::NodeAttr padding[1], stride[0], stride[1], - output_shapes, UniqName("T_depthwise_conv2d_nchw_out")); } else if (data_format == "NHWC") { out = pe::Depthwise_Conv2d_NHWC(A.as_tensor_ref(), @@ -302,7 +290,6 @@ std::shared_ptr StrategyForDepthwiseConv2d(const framework::NodeAttr padding[1], stride[0], stride[1], - output_shapes, UniqName("T_depthwise_conv2d_nhwc_out")); } else { LOG(FATAL) << "Only support NCHW and NHWC data layout\n"; @@ -314,8 +301,9 @@ std::shared_ptr StrategyForDepthwiseConv2d(const framework::NodeAttr stages->InsertLazily(t); res.push_back(CINNValue(t)); } - CHECK_EQ(out.size(), 2U) << "The output tensor sizes of depthwise_conv op in depthwise_conv op should be 2\n"; - out[1]->InitReduction(stages, make_const(out[1]->type(), 0)); // res + CHECK(out.size() == 2U || out.size() == 1U) + << "The output tensor sizes of depthwise_conv op in depthwise_conv op should be 1 or 2\n"; + out.back()->InitReduction(stages, make_const(out.back()->type(), 0)); // res res.push_back(CINNValue(stages)); *ret = CINNValuePack{res}; }); @@ -323,9 +311,16 @@ std::shared_ptr StrategyForDepthwiseConv2d(const framework::NodeAttr framework::CINNSchedule depthwise_conv2d_schedule([](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of depthwise_conv schedule is empty! Please check.\n"; CINNValuePack arg_pack = args[0]; - CHECK_EQ(arg_pack.size(), 3UL); - Expr A [[maybe_unused]] = arg_pack[0]; - *ret = arg_pack; + CHECK(arg_pack.size() == 2UL || arg_pack.size() == 3UL); + if (arg_pack.size() == 3UL) { + poly::StageMap stages = arg_pack[2]; + Expr input_pad = arg_pack[0]; + CHECK(input_pad.as_tensor()); + stages[input_pad.as_tensor_ref()]->ComputeInline(); + *ret = CINNValuePack{{arg_pack[1], CINNValue(stages)}}; + } else { + *ret = arg_pack; + } }); auto strategy = std::make_shared(); @@ -362,20 +357,12 @@ std::vector InferShapeForDepthwiseConv2d(const std::vector &in // A is input: [N, C, H, W], and B is filter: [C_in, channel_multiplier, f_h, f_w] int out_shape_h = (inputs_shape[0][2] - inputs_shape[1][2] + 2 * padding[0]) / stride[0] + 1; int out_shape_w = (inputs_shape[0][3] - inputs_shape[1][3] + 2 * padding[1]) / stride[1] + 1; - res = {{inputs_shape[0][0], - inputs_shape[0][1], - inputs_shape[0][2] + 2 * padding[0], - inputs_shape[0][3] + 2 * padding[1]}, - {inputs_shape[0][0], inputs_shape[1][1] * inputs_shape[0][1], out_shape_h, out_shape_w}}; + res = {{inputs_shape[0][0], inputs_shape[1][1] * inputs_shape[0][1], out_shape_h, out_shape_w}}; } else if (data_format == "NHWC") { // A is input: [N, H, W, C], and B is filter: [C_in, channel_multiplier, f_h, f_w] int out_shape_h = (inputs_shape[0][1] - inputs_shape[1][1] + 2 * padding[0]) / stride[0] + 1; int out_shape_w = (inputs_shape[0][2] - inputs_shape[1][2] + 2 * padding[1]) / stride[1] + 1; - res = {{inputs_shape[0][0], - inputs_shape[0][1] + 2 * padding[0], - inputs_shape[0][2] + 2 * padding[1], - inputs_shape[0][3]}, - {inputs_shape[0][0], out_shape_h, out_shape_w, inputs_shape[1][1] * inputs_shape[0][3]}}; + res = {{inputs_shape[0][0], out_shape_h, out_shape_w, inputs_shape[1][1] * inputs_shape[0][3]}}; } else { LOG(FATAL) << "Only support NCHW and NHWC data layout\n"; } @@ -384,7 +371,7 @@ std::vector InferShapeForDepthwiseConv2d(const std::vector &in std::vector 
InferDtypeForDepthwiseConv2d(const std::vector &inputs_type, const framework::NodeAttr &attrs) { CHECK(!inputs_type.empty()) << "The input's type size is 0! Please check again."; - std::vector res{inputs_type[0], inputs_type[0]}; + std::vector res{inputs_type[0]}; return res; } @@ -507,9 +494,9 @@ std::shared_ptr StrategyForPool1d(const framework::NodeAttr &attrs, UniqName("T_Pool1d_out")); auto stages = CreateStages(out); - CHECK_EQ(out.size(), 2U) << "The size of pe::Pool1d's output should be 2."; + CHECK(out.size() == 1U || out.size() == 2U) << "The size of pe::Pool1d's output should be 1 or 2."; CHECK(!out_type.empty()) << "Output type of Pool1d is empty! Please check.\n"; - out[1]->InitReduction(stages, ir::Zero(out_type[0])); + out.back()->InitReduction(stages, ir::Zero(out_type[0])); std::vector res; for (auto &t : out) { res.push_back(CINNValue(Expr(t.get()))); @@ -521,9 +508,16 @@ std::shared_ptr StrategyForPool1d(const framework::NodeAttr &attrs, framework::CINNSchedule pool1d_schedule([](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of pool1d schedule is empty! Please check.\n"; CINNValuePack arg_pack = args[0]; - CHECK_EQ(arg_pack.size(), 3UL); - Expr A [[maybe_unused]] = arg_pack[0]; - *ret = arg_pack; + CHECK(arg_pack.size() == 2UL || arg_pack.size() == 3UL); + if (arg_pack.size() == 3UL) { + poly::StageMap stages = arg_pack[2]; + Expr input_pad = arg_pack[0]; + CHECK(input_pad.as_tensor()); + stages[input_pad.as_tensor_ref()]->ComputeInline(); + *ret = CINNValuePack{{arg_pack[1], CINNValue(stages)}}; + } else { + *ret = arg_pack; + } }); auto strategy = std::make_shared(); @@ -562,9 +556,8 @@ std::vector> InferShapeForPool1d(const std::vector output_shape0 = inputs_shape[0]; std::vector output_shape1 = inputs_shape[0]; - CHECK_EQ(output_shape0.size(), 3U); + CHECK_EQ(output_shape1.size(), 3U); int width_axis = -1; if (data_format == "NCW") { width_axis = 2; @@ -574,9 +567,7 @@ std::vector> InferShapeForPool1d(const std::vector> InferShapeForPool1d(const std::vector> res{output_shape0, output_shape1}; + std::vector> res{output_shape1}; return res; } @@ -643,9 +634,9 @@ std::shared_ptr StrategyForPool2d(const framework::NodeAttr &attrs, UniqName("T_Pool2d_out")); auto stages = CreateStages(out); - CHECK_EQ(out.size(), 2U) << "The size of pe::Pool2d's output should be 2."; + CHECK(out.size() == 1U || out.size() == 2U) << "The size of pe::Pool2d's output should be 1 or 2."; CHECK(!out_type.empty()) << "Output type of Pool2d is empty! Please check.\n"; - out[1]->InitReduction(stages, ir::Zero(out_type[0])); + out.back()->InitReduction(stages, ir::Zero(out_type[0])); std::vector res; for (auto &t : out) { res.push_back(CINNValue(Expr(t.get()))); @@ -657,9 +648,16 @@ std::shared_ptr StrategyForPool2d(const framework::NodeAttr &attrs, framework::CINNSchedule pool2d_schedule([](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of pool2d schedule is empty! 
Please check.\n"; CINNValuePack arg_pack = args[0]; - CHECK_EQ(arg_pack.size(), 3UL); - Expr A [[maybe_unused]] = arg_pack[0]; - *ret = arg_pack; + CHECK(arg_pack.size() == 2UL || arg_pack.size() == 3UL); + if (arg_pack.size() == 3UL) { + poly::StageMap stages = arg_pack[2]; + Expr input_pad = arg_pack[0]; + CHECK(input_pad.as_tensor()); + stages[input_pad.as_tensor_ref()]->ComputeInline(); + *ret = CINNValuePack{{arg_pack[1], CINNValue(stages)}}; + } else { + *ret = arg_pack; + } }); auto strategy = std::make_shared(); @@ -697,7 +695,6 @@ std::vector> InferShapeForPool2d(const std::vector output_shape0 = inputs_shape[0]; std::vector output_shape1 = inputs_shape[0]; CHECK_EQ(inputs_shape[0].size(), 4U) << "input_shape size for pool2d should be 4.\n"; int height_axis = -1; @@ -716,11 +713,7 @@ std::vector> InferShapeForPool2d(const std::vector> InferShapeForPool2d(const std::vector> res{output_shape0, output_shape1}; + std::vector> res{output_shape1}; return res; } @@ -794,9 +787,9 @@ std::shared_ptr StrategyForPool3d(const framework::NodeAttr &attrs, UniqName("T_Pool3d_out")); auto stages = CreateStages(out); - CHECK_EQ(out.size(), 2U) << "The size of pe::Pool3d's output should be 2."; + CHECK(out.size() == 1U || out.size() == 2U) << "The size of pe::Pool3d's output should be 1 or 2."; CHECK(!out_type.empty()) << "Output type of Pool3d is empty! Please check.\n"; - out[1]->InitReduction(stages, ir::Zero(out_type[0])); + out.back()->InitReduction(stages, ir::Zero(out_type[0])); std::vector res; for (auto &t : out) { res.push_back(CINNValue(Expr(t.get()))); @@ -808,9 +801,16 @@ std::shared_ptr StrategyForPool3d(const framework::NodeAttr &attrs, framework::CINNSchedule pool3d_schedule([](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of pool3d schedule is empty! Please check.\n"; CINNValuePack arg_pack = args[0]; - CHECK_EQ(arg_pack.size(), 3UL); - Expr A [[maybe_unused]] = arg_pack[0]; - *ret = arg_pack; + CHECK(arg_pack.size() == 2UL || arg_pack.size() == 3UL); + if (arg_pack.size() == 3UL) { + poly::StageMap stages = arg_pack[2]; + Expr input_pad = arg_pack[0]; + CHECK(input_pad.as_tensor()); + stages[input_pad.as_tensor_ref()]->ComputeInline(); + *ret = CINNValuePack{{arg_pack[1], CINNValue(stages)}}; + } else { + *ret = arg_pack; + } }); auto strategy = std::make_shared(); @@ -850,7 +850,6 @@ std::vector> InferShapeForPool3d(const std::vector output_shape0 = inputs_shape[0]; std::vector output_shape1 = inputs_shape[0]; CHECK_EQ(inputs_shape[0].size(), 6U) << "input_shape size for pool3d should be 6.\n"; int depth_axis = -1; @@ -868,13 +867,7 @@ std::vector> InferShapeForPool3d(const std::vector> InferShapeForPool3d(const std::vector> res{output_shape0, output_shape1}; + std::vector> res{output_shape1}; return res; } std::vector InferDtypeForPool(const std::vector &inputs_type, const framework::NodeAttr &attrs) { CHECK(!inputs_type.empty()) << "The input's type size is 0! 
Please check again."; - std::vector res{inputs_type[0], inputs_type[0]}; + std::vector res{inputs_type[0]}; return res; } @@ -1152,7 +1145,7 @@ CINN_REGISTER_HELPER(nn_ops) { CINN_REGISTER_OP(conv2d) .describe("Do a 2-D convolution with an NCHW/NHWC layout.") .set_num_inputs(2) // here we consider filter as another input - .set_num_outputs(3) + .set_num_outputs(1) .set_attr("CINNStrategy", cinn::hlir::op::StrategyForConv2d) .set_attr("infershape", std::function(cinn::hlir::op::InferShapeForConv2d)) .set_attr("inferdtype", std::function(cinn::hlir::op::InferDtypeForConv2d)) @@ -1161,7 +1154,7 @@ CINN_REGISTER_HELPER(nn_ops) { CINN_REGISTER_OP(depthwise_conv2d) .describe("Do a 2-D depthwise convolution with an NCHW/NHWC layout.") .set_num_inputs(2) // here we consider filter as another input - .set_num_outputs(2) + .set_num_outputs(1) .set_attr("CINNStrategy", cinn::hlir::op::StrategyForDepthwiseConv2d) .set_attr("infershape", std::function(cinn::hlir::op::InferShapeForDepthwiseConv2d)) .set_attr("inferdtype", std::function(cinn::hlir::op::InferDtypeForDepthwiseConv2d)) @@ -1179,7 +1172,7 @@ CINN_REGISTER_HELPER(nn_ops) { CINN_REGISTER_OP(pool1d) .describe("Do pooling on the width dimension of the input tensor.") .set_num_inputs(1) - .set_num_outputs(2) + .set_num_outputs(1) .set_attr("CINNStrategy", cinn::hlir::op::StrategyForPool1d) .set_attr("infershape", std::function(cinn::hlir::op::InferShapeForPool1d)) .set_attr("inferdtype", std::function(cinn::hlir::op::InferDtypeForPool)) @@ -1188,7 +1181,7 @@ CINN_REGISTER_HELPER(nn_ops) { CINN_REGISTER_OP(pool2d) .describe("Do pooling on the height and width dimension of the input tensor.") .set_num_inputs(1) - .set_num_outputs(2) + .set_num_outputs(1) .set_attr("CINNStrategy", cinn::hlir::op::StrategyForPool2d) .set_attr("infershape", std::function(cinn::hlir::op::InferShapeForPool2d)) .set_attr("inferdtype", std::function(cinn::hlir::op::InferDtypeForPool)) @@ -1197,7 +1190,7 @@ CINN_REGISTER_HELPER(nn_ops) { CINN_REGISTER_OP(pool3d) .describe("Do pooling on the depth, height and width dimension of the input tensor.") .set_num_inputs(1) - .set_num_outputs(2) + .set_num_outputs(1) .set_attr("CINNStrategy", cinn::hlir::op::StrategyForPool3d) .set_attr("infershape", std::function(cinn::hlir::op::InferShapeForPool3d)) .set_attr("inferdtype", std::function(cinn::hlir::op::InferDtypeForPool)) @@ -1230,14 +1223,5 @@ CINN_REGISTER_HELPER(nn_ops) { .set_attr("inferdtype", std::function(cinn::hlir::op::InferDtypeForSlice)) .set_support_level(4); - CINN_REGISTER_OP(depthwise_conv2d) - .describe("Do a 2-D depthwise convolution with an NCHW/NHWC layout.") - .set_num_inputs(2) // here we consider filter as another input - .set_num_outputs(2) - .set_attr("CINNStrategy", cinn::hlir::op::StrategyForDepthwiseConv2d) - .set_attr("infershape", std::function(cinn::hlir::op::InferShapeForDepthwiseConv2d)) - .set_attr("inferdtype", std::function(cinn::hlir::op::InferDtypeForDepthwiseConv2d)) - .set_support_level(4); - return true; } diff --git a/cinn/hlir/pe/nn.cc b/cinn/hlir/pe/nn.cc index f023c5ccf9b18..050de1e80be1b 100644 --- a/cinn/hlir/pe/nn.cc +++ b/cinn/hlir/pe/nn.cc @@ -45,37 +45,24 @@ std::vector Conv2d_NCHW(const ir::Tensor &input, int stride_w, int dilation_h, int dilation_w, - const std::vector> &output_shapes, const std::string &output_name) { CHECK_EQ(input->shape.size(), 4U) << "Input's dimension of Conv2d_NCHW op is not 4! Please check."; CHECK_EQ(weights->shape.size(), 4U) << "Weight's dimension of Conv2d_NCHW op is not 4! 
Please check."; std::vector output_shape; std::vector new_weights_shape; std::vector input_pad_shape; - if (output_shapes.size() == 3) { - // already computed by infer_shape - CHECK_EQ(output_shapes[0].size(), 4U) << "The size of output_shapes[0] of Conv2d op is not 4! Please check."; - CHECK_EQ(output_shapes[1].size(), 4U) << "The size of output_shapes[1] of Conv2d op is not 4! Please check."; - CHECK_EQ(output_shapes[2].size(), 4U) << "The size of output_shapes[2] of Conv2d op is not 4! Please check."; - output_shape = { - Expr(output_shapes[2][0]), Expr(output_shapes[2][1]), Expr(output_shapes[2][2]), Expr(output_shapes[2][3])}; - new_weights_shape = { - Expr(output_shapes[1][0]), Expr(output_shapes[1][1]), Expr(output_shapes[1][2]), Expr(output_shapes[1][3])}; - input_pad_shape = { - Expr(output_shapes[0][0]), Expr(output_shapes[0][1]), Expr(output_shapes[0][2]), Expr(output_shapes[0][3])}; - } else { - output_shape = { - input->shape[0], // B - weights->shape[0], // O - Expr((input->shape[2] - ((weights->shape[2] - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1), // H - Expr((input->shape[3] - ((weights->shape[3] - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + 1) // W - }; - new_weights_shape = {weights->shape[0], - weights->shape[1], - dilation_h * (weights->shape[2] - 1) + 1, - dilation_w * (weights->shape[3] - 1) + 1}; - input_pad_shape = {input->shape[0], input->shape[1], input->shape[2] + 2 * pad_h, input->shape[3] + 2 * pad_w}; - } + output_shape = { + input->shape[0], // B + weights->shape[0], // O + Expr((input->shape[2] - ((weights->shape[2] - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1), // H + Expr((input->shape[3] - ((weights->shape[3] - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + 1) // W + }; + new_weights_shape = {weights->shape[0], + weights->shape[1], + dilation_h * (weights->shape[2] - 1) + 1, + dilation_w * (weights->shape[3] - 1) + 1}; + input_pad_shape = {input->shape[0], input->shape[1], input->shape[2] + 2 * pad_h, input->shape[3] + 2 * pad_w}; + auto input_pad = Compute( input_pad_shape, [=](Expr nn, Expr cc, Expr yy, Expr xx) { @@ -123,38 +110,25 @@ std::vector Conv2d_NHWC(const ir::Tensor &input, int stride_w, int dilation_h, int dilation_w, - const std::vector> &output_shapes, const std::string &output_name) { CHECK_EQ(input->shape.size(), 4U) << "Input's dimension of Conv2d_NHWC op is not 4! Please check."; CHECK_EQ(weights->shape.size(), 4U) << "Weight's dimension of Conv2d_NHWC op is not 4! Please check."; std::vector output_shape; std::vector new_weights_shape; std::vector input_pad_shape; - if (output_shapes.size() == 3) { - // already computed by infer_shape - CHECK_EQ(output_shapes[0].size(), 4U) << "The size of output_shapes[0] of Conv2d op is not 4! Please check."; - CHECK_EQ(output_shapes[1].size(), 4U) << "The size of output_shapes[1] of Conv2d op is not 4! Please check."; - CHECK_EQ(output_shapes[2].size(), 4U) << "The size of output_shapes[2] of Conv2d op is not 4! 
Please check."; - output_shape = { - Expr(output_shapes[2][0]), Expr(output_shapes[2][1]), Expr(output_shapes[2][2]), Expr(output_shapes[2][3])}; - new_weights_shape = { - Expr(output_shapes[1][0]), Expr(output_shapes[1][1]), Expr(output_shapes[1][2]), Expr(output_shapes[1][3])}; - input_pad_shape = { - Expr(output_shapes[0][0]), Expr(output_shapes[0][1]), Expr(output_shapes[0][2]), Expr(output_shapes[0][3])}; - } else { - output_shape = { - input->shape[0], // B - Expr((input->shape[1] - ((weights->shape[2] - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1), // H - Expr((input->shape[2] - ((weights->shape[3] - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + 1), // W - weights->shape[0] // O - }; - new_weights_shape = {weights->shape[0], - weights->shape[1], - dilation_h * (weights->shape[2] - 1) + 1, - dilation_w * (weights->shape[3] - 1) + 1}; - input_pad_shape = {input->shape[0], input->shape[1] + 2 * pad_h, input->shape[2] + 2 * pad_w, input->shape[3]}; - } - auto input_pad = Compute( + + output_shape = { + input->shape[0], // B + Expr((input->shape[1] - ((weights->shape[2] - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1), // H + Expr((input->shape[2] - ((weights->shape[3] - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + 1), // W + weights->shape[0] // O + }; + new_weights_shape = {weights->shape[0], + weights->shape[1], + dilation_h * (weights->shape[2] - 1) + 1, + dilation_w * (weights->shape[3] - 1) + 1}; + input_pad_shape = {input->shape[0], input->shape[1] + 2 * pad_h, input->shape[2] + 2 * pad_w, input->shape[3]}; + auto input_pad = Compute( input_pad_shape, [=](Expr nn, Expr yy, Expr xx, Expr cc) { auto cond = @@ -200,7 +174,6 @@ std::vector Depthwise_Conv2d_NCHW(const Tensor &input, int pad_w, int stride_h, int stride_w, - const std::vector> &output_shapes, const std::string output_name) { CHECK_EQ(input->shape.size(), 4U) << "Input's dimension of Depthwise_Conv2d_NCHW is not 4! Please check.\n"; CHECK_EQ(weight->shape.size(), 4U) << "Weight's dimension of Depthwise_Conv2d_NCHW is not 4! Please check.\n"; @@ -208,20 +181,13 @@ std::vector Depthwise_Conv2d_NCHW(const Tensor &input, Expr in_w = input->shape[3]; Expr c_m = weight->shape[1]; // channel_multiplier std::vector output_shape; - if (output_shapes.size() == 2) { - // already computed by infer_shape - CHECK_EQ(output_shapes[1].size(), 4U) - << "The size of output_shapes[1] of Depthwise_Conv2d op is not 4! Please check."; - output_shape = { - Expr(output_shapes[1][0]), Expr(output_shapes[1][1]), Expr(output_shapes[1][2]), Expr(output_shapes[1][3])}; - } else { - output_shape = { - input->shape[0], // B - weight->shape[1] * input->shape[1], // O - (input->shape[2] - weight->shape[2] + 2 * pad_h) / stride_h + 1, // H - (input->shape[3] - weight->shape[3] + 2 * pad_w) / stride_w + 1 // W - }; - } + + output_shape = { + input->shape[0], // B + weight->shape[1] * input->shape[1], // O + (input->shape[2] - weight->shape[2] + 2 * pad_h) / stride_h + 1, // H + (input->shape[3] - weight->shape[3] + 2 * pad_w) / stride_w + 1 // W + }; auto input_pad = (pad_h == 0 && pad_w == 0) ? Identity(input) : Pad(input, {Expr(0), Expr(0), Expr(pad_h), Expr(pad_w)}); @@ -245,7 +211,6 @@ std::vector Depthwise_Conv2d_NHWC(const Tensor &input, int pad_w, int stride_h, int stride_w, - const std::vector> &output_shapes, const std::string output_name) { CHECK_EQ(input->shape.size(), 4U) << "Input's dimension of Depthwise_Conv2d_NCHW is not 4! 
Please check.\n"; CHECK_EQ(weight->shape.size(), 4U) << "Weight's dimension of Depthwise_Conv2d_NCHW is not 4! Please check.\n"; @@ -253,20 +218,13 @@ std::vector Depthwise_Conv2d_NHWC(const Tensor &input, Expr in_w = input->shape[2]; Expr c_m = weight->shape[1]; // channel_multiplier std::vector output_shape; - if (output_shapes.size() == 2) { - // already computed by infer_shape - CHECK_EQ(output_shapes[1].size(), 4U) - << "The size of output_shapes[1] of Depthwise_Conv2d op is not 4! Please check."; - output_shape = { - Expr(output_shapes[1][0]), Expr(output_shapes[1][1]), Expr(output_shapes[1][2]), Expr(output_shapes[1][3])}; - } else { - output_shape = { - input->shape[0], // B - (input->shape[1] - weight->shape[2] + 2 * pad_h) / stride_h + 1, // H - (input->shape[2] - weight->shape[3] + 2 * pad_w) / stride_w + 1, // W - weight->shape[1] * input->shape[3] // O - }; - } + + output_shape = { + input->shape[0], // B + (input->shape[1] - weight->shape[2] + 2 * pad_h) / stride_h + 1, // H + (input->shape[2] - weight->shape[3] + 2 * pad_w) / stride_w + 1, // W + weight->shape[1] * input->shape[3] // O + }; auto input_pad = (pad_h == 0 && pad_w == 0) ? Identity(input) : Pad(input, {Expr(0), Expr(pad_h), Expr(pad_w), Expr(0)}); @@ -541,7 +499,7 @@ std::vector PoolImpl(const Tensor &tensor, if (pool_type == "max") { Expr min_value = ir::min_value(tensor->type()); // Pad the input tensor with the pad_value of type's minimum value - temp = do_pad ? Pad(tensor, pad_before, pad_after, min_value, UniqName("pad_temp")) : Identity(tensor); + temp = do_pad ? Pad(tensor, pad_before, pad_after, min_value, UniqName("pad_temp")) : tensor; res = Compute( out_shape, [=](const std::vector &output) { @@ -559,7 +517,7 @@ std::vector PoolImpl(const Tensor &tensor, daxis); } else if (pool_type == "avg") { // Pad the input tensor with pad_value zero - temp = do_pad ? Pad(tensor, pad_before, pad_after, 0, UniqName("pad_temp")) : Identity(tensor); + temp = do_pad ? 
Pad(tensor, pad_before, pad_after, 0, UniqName("pad_temp")) : tensor; res = Compute( out_shape, [=](const std::vector &output) { @@ -599,7 +557,11 @@ std::vector PoolImpl(const Tensor &tensor, } else { LOG(ERROR) << "Unrecognized pool_type: " << pool_type; } - return {temp, res}; + if (do_pad) { + return {temp, res}; + } else { + return {res}; + } } std::vector Pool1d(const Tensor &tensor, diff --git a/cinn/hlir/pe/nn.h b/cinn/hlir/pe/nn.h index 4a03505ab50c9..ee8aaaf3657b9 100644 --- a/cinn/hlir/pe/nn.h +++ b/cinn/hlir/pe/nn.h @@ -97,7 +97,6 @@ std::vector Conv2d_NCHW(const ir::Tensor &input, int stride_w, int dilation_h, int dilation_w, - const std::vector> &output_shapes, const std::string &output_name = UniqName("T_Conv2d_NCHW_out")); /** @@ -124,7 +123,6 @@ std::vector Conv2d_NHWC(const ir::Tensor &input, int stride_w, int dilation_h, int dilation_w, - const std::vector> &output_shapes, const std::string &output_name = UniqName("T_Conv2d_NHWC_out")); /** @@ -147,7 +145,6 @@ std::vector Depthwise_Conv2d_NCHW(const ir::Tensor &input, int pad_w, int stride_h, int stride_w, - const std::vector> &output_shapes, const std::string output_name = UniqName("T_depthwise_conv2d_nchw")); /** @@ -170,7 +167,6 @@ std::vector Depthwise_Conv2d_NHWC(const ir::Tensor &input, int pad_w, int stride_h, int stride_w, - const std::vector> &output_shapes, const std::string output_name = UniqName("T_depthwise_conv2d_nhwc")); ir::Tensor BatchNorm_NCHW(const ir::Tensor &input, diff --git a/cinn/lang/lower_impl.cc b/cinn/lang/lower_impl.cc index fbdf51d7a1aad..2ea457259e4ce 100644 --- a/cinn/lang/lower_impl.cc +++ b/cinn/lang/lower_impl.cc @@ -400,6 +400,14 @@ ir::LoweredFunc LowerImpl::operator()() { if (arg->is_placeholder_node()) continue; if (arg->buffer.defined()) continue; if (arg->body().As() && arg->body().type().is_void()) continue; // extern call + if (tensor_map.find(arg->name) == tensor_map.end()) { + LOG(INFO) << "Didn't find arg tensor " << arg->name << "in tensor_map.\n" + << "The function is " << fn_name_ << "\nAnd all the arg tensors are:\n"; + for (auto& i : tensor_args_) { + LOG(INFO) << i->name; + } + LOG(FATAL) << "Fatal Error!"; + } Reference(&arg)->buffer = tensor_map.at(arg->name)->buffer; } } @@ -421,7 +429,9 @@ ir::LoweredFunc LowerImpl::operator()() { auto func = ir::_LoweredFunc_::Make(fn_name_, func_args, func_body, temp_buffers); // some necessary modification. + LOG(INFO) << "Before optim::ComputeInlineExpand(&func->body, stages_); in function " << fn_name_; optim::ComputeInlineExpand(&func->body, stages_); + LOG(INFO) << "After optim::ComputeInlineExpand(&func->body, stages_); in function " << fn_name_; Target target = cuda_axis_info_.valid() ? 
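Note: the guard added in LowerImpl::operator() exists because tensor_map.at on a missing argument tensor would otherwise abort with an opaque std::out_of_range; the new code first checks membership and logs every candidate tensor name before failing. A generic standalone illustration of that pattern (names illustrative, plain exception instead of glog):

#include <map>
#include <stdexcept>
#include <string>

// Illustrative only: fail with a descriptive message instead of letting
// std::map::at throw an unexplained std::out_of_range.
const std::string& LookupBuffer(const std::map<std::string, std::string>& tensor_map,
                                const std::string& arg_name) {
  auto it = tensor_map.find(arg_name);
  if (it == tensor_map.end()) {
    throw std::runtime_error("Didn't find arg tensor '" + arg_name + "' in tensor_map.");
  }
  return it->second;
}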
common::DefaultNVGPUTarget() : common::DefaultHostTarget(); auto res = optim::Optimize(func, target, FLAGS_cinn_runtime_display_debug_info); diff --git a/cinn/optim/compute_inline_expand.cc b/cinn/optim/compute_inline_expand.cc index f4c6e934f12a7..3c5c04b7eedd3 100644 --- a/cinn/optim/compute_inline_expand.cc +++ b/cinn/optim/compute_inline_expand.cc @@ -16,7 +16,11 @@ struct TensorInlineExpandMutator : public ir::IRMutator<> { TensorInlineExpandMutator(const std::string &tensor_name) : tensor_name(tensor_name) {} - void operator()(Expr *expr) { ir::IRMutator<>::Visit(expr, expr); } + void operator()(Expr *expr) { + LOG(INFO) << "void operator()(Expr *expr) Begin"; + ir::IRMutator<>::Visit(expr, expr); + LOG(INFO) << "void operator()(Expr *expr) End"; + } void Visit(const ir::Load *op, Expr *expr) override { auto *node = expr->As(); diff --git a/cinn/pybind/framework.cc b/cinn/pybind/framework.cc index b938c190ec967..20af6aa50d0d3 100644 --- a/cinn/pybind/framework.cc +++ b/cinn/pybind/framework.cc @@ -35,19 +35,24 @@ void BindFramework(pybind11::module *m) { auto impl = OpStrategy::SelectImpl(self[op_ptr](attrs, inputs, out_types, output_shapes, target)); std::vector temp_inputs; std::vector res; - for (auto tensor : inputs) { + for (auto &tensor : inputs) { res.push_back(tensor); temp_inputs.push_back(common::CINNValue(tensor)); } - auto stages = CreateStages(inputs); - temp_inputs.push_back(common::CINNValue(stages)); common::CINNValuePack C = impl->fcompute(common::CINNValuePack{temp_inputs}); - C = impl->fschedule(C); - for (int i = 0; i < C.get()->size() - 1; i++) { + poly::StageMap stages = C.back(); + // make sure all the tensors in the stages before schedule launch. + for (int i = 0; i < C->size() - 1; i++) { + ir::Expr temp = C[i]; + stages->InsertLazily(temp.as_tensor_ref()); + } + C = impl->fschedule(C); + for (int i = 0; i < C->size() - 1; i++) { ir::Expr temp = C[i]; res.push_back(temp.as_tensor_ref()); } - return res; + auto func = Lower(key, stages, res); + return func; }); py::class_(*m, "NodeAttr") diff --git a/python/tests/conv2d_utils.py b/python/tests/conv2d_utils.py index 02c9f880ae382..dfab9ba545663 100644 --- a/python/tests/conv2d_utils.py +++ b/python/tests/conv2d_utils.py @@ -72,26 +72,8 @@ def conv2d_native(inputs_data, input_shape, filter_size, attrs, is_depthwise): print("output's shape is:", output.shape) res_shape = output.shape[1:] - pad_shape = list(input_shape) - dilation_shape = list(filter_size_new) - assert len(padding) == 2 - assert len(pad_shape) == 4 - assert len(dilation_shape) == 4 - if data_format == "NCHW": - h_index = 2 - w_index = 3 - else: - h_index = 1 - w_index = 2 - - pad_shape[h_index] += 2 * padding[0] - pad_shape[w_index] += 2 * padding[1] - dilation_shape[2] = (filter_size_new[2] - 1) * dilation[0] + 1 - dilation_shape[3] = (filter_size_new[3] - 1) * dilation[1] + 1 - print("pad's shape is:", pad_shape) - print("dilation's shape is:", dilation_shape) if is_depthwise: - return output, [pad_shape, res_shape] + return output, [res_shape] else: - return output, [pad_shape, dilation_shape, res_shape] + return output, [res_shape] diff --git a/python/tests/pool_utils.py b/python/tests/pool_utils.py index 05c09c0c6dd9f..3a78195a999db 100644 --- a/python/tests/pool_utils.py +++ b/python/tests/pool_utils.py @@ -51,14 +51,8 @@ def pool2d(np_data, attrs, dtype="float32"): else: pt, pl, pb, pr = padding_size - out_shape0 = list(in_shape) - out_shape0[height_axis] = in_shape[height_axis] + pt + pb - out_shape0[width_axis] = in_shape[width_axis] + pl 
+ pr - out_shape = list(in_shape) if ceil_mode: - out_shape0[height_axis] += s_h - 1 - out_shape0[width_axis] += s_w - 1 out_shape[height_axis] = int( math.ceil(float(in_shape[height_axis] - k_h + pt + pb) / s_h) + 1) out_shape[width_axis] = int( @@ -144,7 +138,7 @@ def pool2d(np_data, attrs, dtype="float32"): raise ValueError("pool type {} is not supported".format(pool_type)) ret_np = np.maximum(ret_np, fill_value) - return ret_np, [out_shape0, out_shape] + return ret_np, [out_shape] def pool3d(np_data, attrs, dtype="float32"): @@ -196,16 +190,8 @@ def pool3d(np_data, attrs, dtype="float32"): else: pf, pt, pl, pk, pb, pr = padding_size - out_shape0 = list(in_shape) - out_shape0[depth_axis] = in_shape[depth_axis] + pf + pk - out_shape0[height_axis] = in_shape[height_axis] + pt + pb - out_shape0[width_axis] = in_shape[width_axis] + pl + pr - out_shape = list(in_shape) if ceil_mode: - out_shape0[depth_axis] += s_d - 1 - out_shape0[height_axis] += s_h - 1 - out_shape0[width_axis] += s_w - 1 out_shape[depth_axis] = int( math.ceil(float(in_shape[depth_axis] - k_d + pf + pk) / s_d) + 1) out_shape[height_axis] = int( @@ -302,7 +288,7 @@ def pool3d(np_data, attrs, dtype="float32"): raise ValueError("pool type {} is not supported".format(pool_type)) ret_np = np.maximum(ret_np, fill_value) - return ret_np, [out_shape0, out_shape] + return ret_np, [out_shape] def pool1d(np_data, attrs, dtype="float32"): @@ -350,12 +336,8 @@ def pool1d(np_data, attrs, dtype="float32"): else: pl, pr = padding_size - out_shape0 = list(in_shape) - out_shape0[width_axis] = in_shape[width_axis] + pl + pr - out_shape = list(in_shape) if ceil_mode: - out_shape0[width_axis] += s_w - 1 out_shape[width_axis] = int( math.ceil(float(in_shape[width_axis] - k_w + pl + pr) / s_w) + 1) else: @@ -422,4 +404,4 @@ def pool1d(np_data, attrs, dtype="float32"): raise ValueError("pool type {} is not supported".format(pool_type)) ret_np = np.maximum(ret_np, fill_value) - return ret_np, [out_shape0, out_shape] + return ret_np, [out_shape] diff --git a/python/tests/test_utils.py b/python/tests/test_utils.py index 8b088805b4651..45ddc2e86fca9 100644 --- a/python/tests/test_utils.py +++ b/python/tests/test_utils.py @@ -100,10 +100,8 @@ def to_test_op(self, input_shapes, output_shapes, op_name, attrs): def __codegen(self, op_name, inputs, output_shapes, attrs): types = [common.Float(32)] strategy_map = framework.Operator.get_op_attrs("CINNStrategy") - res = strategy_map.apply_strategy(op_name, attrs, inputs, types, - output_shapes, self.target) - stages = create_stages(res) - func = lang.lower(op_name, stages, res) + func = strategy_map.apply_strategy(op_name, attrs, inputs, types, + output_shapes, self.target) logging.warning('func:\n\n%s\n', func) builder = lang.Module.Builder(op_name, self.target) builder.add_function(func)
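Note: with the padded intermediate shape dropped, the Python references in pool_utils.py (and InferShapeForPool1d/2d/3d) keep only the pooled output extent. A standalone C++ sketch of the per-axis formula they use, ceil_mode included (helper name is illustrative):

#include <cmath>

// Illustrative only: output extent of pooling along one axis, matching the
// reference math in pool_utils.py (pt/pb are the two paddings on that axis,
// k the kernel size, s the stride).
int PooledExtent(int in, int k, int pt, int pb, int s, bool ceil_mode) {
  double span = static_cast<double>(in - k + pt + pb) / s;
  return static_cast<int>((ceil_mode ? std::ceil(span) : std::floor(span)) + 1);
}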