
Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cos_double_grad

rebase
YibinLiu666 committed Mar 13, 2024
2 parents adb7d51 + 53bfb21 commit c917b6f
Showing 1,034 changed files with 40,475 additions and 16,735 deletions.
2 changes: 1 addition & 1 deletion .flake8
@@ -22,7 +22,7 @@ ignore =
E741
per-file-ignores =
# These files need tabs for testing.
test/dygraph_to_static/test_legacy_error.py:E101
test/dygraph_to_static/test_error.py:E101

# Ignore compare with True in sot unittest
test/sot/test_dup_top.py:E712
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -36,7 +36,7 @@ repos:
# Exclude some unit test files that require tabs.
exclude: |
(?x)^(
test/dygraph_to_static/test_legacy_error.py
test/dygraph_to_static/test_error.py
)$
- repo: local
hooks:
10 changes: 8 additions & 2 deletions CMakeLists.txt
@@ -65,7 +65,8 @@ option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
option(WITH_SHARED_PHI "Compile PaddlePaddle with SHARED LIB of PHI" ON)
option(CINN_ONLY "Compile CINN only in Paddle" OFF)
option(CINN_WITH_CUDNN "Compile CINN with CUDNN support" ON)

option(WITH_PIP_CUDA_LIBRARIES
"Paddle uses the CUDA library provided by NVIDIA" OFF)
find_package(Git REQUIRED)

# config GIT_URL with github mirrors to speed up dependent repos clone
@@ -97,11 +98,16 @@ endif()

if(WITH_GPU AND NOT APPLE)
#(Note risemeup1): The cudart dynamic library libcudart.so is used by set CUDA_USE_STATIC_CUDA_RUNTIME and CMAKE_CUDA_FLAGS
if(LINUX)
if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL
"x86_64")
set(CUDA_USE_STATIC_CUDA_RUNTIME
OFF
CACHE BOOL "" FORCE)
set(CMAKE_CUDA_FLAGS "--cudart shared")
if(WITH_PIP_CUDA_LIBRARIES)
#(Note risemeup1): Flag 'WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA.
add_definitions(-DWITH_PIP_CUDA_LIBRARIES)
endif()
endif()
enable_language(CUDA)
message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: "
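The hunk above wires the new WITH_PIP_CUDA_LIBRARIES option into the build: on Linux x86_64 GPU builds it switches to the shared CUDA runtime and passes a -DWITH_PIP_CUDA_LIBRARIES compile definition that dynamic_loader.cc checks when searching for the CUDA .so files installed through NVIDIA's pip packages. A minimal sketch of the same pattern in a stand-alone project (the flag wiring and configure command below are illustrative, not taken from this commit):

# Sketch only: expose an option and forward it to C++ as a preprocessor
# definition, mirroring the shape of the change above.
option(WITH_PIP_CUDA_LIBRARIES
       "Use the CUDA libraries provided by NVIDIA's pip packages" OFF)

if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
  if(WITH_PIP_CUDA_LIBRARIES)
    # The C++ loader can then branch on #ifdef WITH_PIP_CUDA_LIBRARIES and
    # look under the Python site-packages nvidia/*/lib directories first.
    add_definitions(-DWITH_PIP_CUDA_LIBRARIES)
  endif()
endif()

# Illustrative configure command:
#   cmake .. -DWITH_GPU=ON -DWITH_PIP_CUDA_LIBRARIES=ON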
4 changes: 2 additions & 2 deletions cmake/export_paddle_header.cmake
@@ -27,7 +27,7 @@ function(header_path_compat TARGET_PATH)
"${HEADER_CONTENT}")
string(REPLACE "paddle/fluid/pir/drr/include/" "paddle/pir/drr/"
HEADER_CONTENT "${HEADER_CONTENT}")
string(REPLACE "paddle/fluid/pir/transforms/" "paddle/pir/transforms/"
string(REPLACE "paddle/fluid/pir/utils/" "paddle/pir/utils/"
HEADER_CONTENT "${HEADER_CONTENT}")
file(WRITE ${header} "${HEADER_CONTENT}")
message(STATUS "header path compat processing complete: ${header}")
@@ -65,7 +65,7 @@ header_path_compat(
header_path_compat(
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/pattern_rewrite)
header_path_compat(
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/transforms)
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/utils)

# NOTE(liuyuanle): In inference lib, no need include paddle/utils/pybind.h, so we delete this.
file(READ ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/extension.h
20 changes: 6 additions & 14 deletions cmake/external/eigen.cmake
@@ -39,27 +39,19 @@ elseif(LINUX)
endif()
endif()

if(CMAKE_COMPILER_IS_GNUCC)
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorRandom.h.patch
tensor_random_header)
# See: [Why calling some `git` commands before `patch`?]
set(EIGEN_PATCH_COMMAND
git checkout -- . && git checkout ${EIGEN_TAG} && patch -Nd
${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor <
${tensor_random_header})
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpfullversion -dumpversion
OUTPUT_VARIABLE GCC_VERSION)
string(REGEX MATCHALL "[0-9]+" GCC_VERSION_COMPONENTS ${GCC_VERSION})
list(GET GCC_VERSION_COMPONENTS 0 GCC_MAJOR)
list(GET GCC_VERSION_COMPONENTS 1 GCC_MINOR)
set(GCC_VERSION "${GCC_MAJOR}.${GCC_MINOR}")
if(GCC_VERSION GREATER_EQUAL 12.0)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Complex.h.patch
complex_header)
set(EIGEN_PATCH_COMMAND
${EIGEN_PATCH_COMMAND} && patch -Nd
${SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < ${complex_header})
endif()
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Complex.h.patch
complex_header)
set(EIGEN_PATCH_COMMAND
${EIGEN_PATCH_COMMAND} && patch -Nd
${SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < ${complex_header})
endif()

set(EIGEN_INCLUDE_DIR ${SOURCE_DIR})
28 changes: 10 additions & 18 deletions cmake/external/gloo.cmake
@@ -42,24 +42,16 @@ if(WITH_GPU)
endif()
endif()

if(CMAKE_COMPILER_IS_GNUCC)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpfullversion -dumpversion
OUTPUT_VARIABLE GCC_VERSION)
string(REGEX MATCHALL "[0-9]+" GCC_VERSION_COMPONENTS ${GCC_VERSION})
list(GET GCC_VERSION_COMPONENTS 0 GCC_MAJOR)
list(GET GCC_VERSION_COMPONENTS 1 GCC_MINOR)
set(GCC_VERSION "${GCC_MAJOR}.${GCC_MINOR}")
if(GCC_VERSION GREATER_EQUAL "12.0")
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/device.cc.patch
native_dst)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/types.h.patch
types_header)
# See: [Why calling some `git` commands before `patch`?]
set(GLOO_PATCH_COMMAND
git checkout -- . && git checkout ${GLOO_TAG} && patch -Nd
${SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && patch -Nd
${SOURCE_DIR}/gloo/ < ${types_header})
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/device.cc.patch
native_dst)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/types.h.patch
types_header)
# See: [Why calling some `git` commands before `patch`?]
set(GLOO_PATCH_COMMAND
git checkout -- . && git checkout ${GLOO_TAG} && patch -Nd
${SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && patch -Nd
${SOURCE_DIR}/gloo/ < ${types_header})
endif()

file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/linux.cc.patch
2 changes: 1 addition & 1 deletion cmake/external/pslib.cmake
@@ -69,7 +69,7 @@ ExternalProject_Add(
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT}
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${PSLIB_LIB})
BUILD_BYPRODUCTS ${PSLIB_LIB} ${JVM_LIB})

add_library(pslib SHARED IMPORTED GLOBAL)
set_property(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
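The pslib.cmake change only extends BUILD_BYPRODUCTS. Declaring ${JVM_LIB} there tells CMake, and in particular the Ninja generator, that the external project also produces that library, so later targets that import it do not trip over an unknown output. A minimal sketch under assumed paths (the URL and file names are placeholders, not Paddle's real ones):

include(ExternalProject)

set(PSLIB_INSTALL_ROOT ${CMAKE_BINARY_DIR}/third_party/pslib)
set(PSLIB_LIB ${PSLIB_INSTALL_ROOT}/lib/libps.so)   # placeholder path
set(JVM_LIB ${PSLIB_INSTALL_ROOT}/lib/libjvm.so)    # placeholder path

ExternalProject_Add(
  extern_pslib
  URL https://example.com/pslib.tar.gz              # placeholder URL
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT}
  # Listing every produced library keeps Ninja's dependency graph complete.
  BUILD_BYPRODUCTS ${PSLIB_LIB} ${JVM_LIB})

add_library(pslib SHARED IMPORTED GLOBAL)
set_property(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})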
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -29,7 +29,7 @@ if(NOT DEFINED XPU_BASE_DATE)
set(XPU_BASE_DATE "20240104")
endif()
if(NOT DEFINED XPU_XHPC_BASE_DATE)
set(XPU_XHPC_BASE_DATE "20240226")
set(XPU_XHPC_BASE_DATE "20240312")
endif()
set(XPU_XCCL_BASE_VERSION "1.1.8.1")
if(NOT DEFINED XPU_XFT_BASE_VERSION)
4 changes: 2 additions & 2 deletions cmake/inference_lib.cmake
@@ -392,8 +392,8 @@ copy(
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/drr/)
copy(
inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/transforms/transform_general_functions.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/transforms/)
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/utils/general_functions.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/utils/)

# the include path of paddle needs to be changed to adapt to inference api path
add_custom_command(
4 changes: 1 addition & 3 deletions cmake/simd.cmake
@@ -4,9 +4,7 @@
include(CheckCXXSourceRuns)
include(CheckCXXSourceCompiles)

if(CMAKE_COMPILER_IS_GNUCC
OR CMAKE_COMPILER_IS_GNUCXX
OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
set(MMX_FLAG "-mmmx")
set(SSE2_FLAG "-msse2")
set(SSE3_FLAG "-msse3")
5 changes: 5 additions & 0 deletions cmake/third_party.cmake
@@ -15,6 +15,11 @@
include(ExternalProject)
# Create a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac)

# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif()

set(THIRD_PARTY_PATH
"${CMAKE_BINARY_DIR}/third_party"
CACHE STRING
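The third_party.cmake addition silences a warning introduced in CMake 3.24: policy CMP0135 governs whether files extracted from a downloaded archive keep the archive's timestamps (OLD) or get the extraction time (NEW), and URL-based downloads in ExternalProject/FetchContent warn until the project picks one. A small self-contained illustration, assuming a throwaway project (the archive URL is a placeholder):

cmake_minimum_required(VERSION 3.18)
project(third_party_policy_demo)

include(ExternalProject)

# Without this (or an explicit DOWNLOAD_EXTRACT_TIMESTAMP argument per
# download), CMake >= 3.24 warns about CMP0135 for URL-based downloads.
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
  cmake_policy(SET CMP0135 NEW)
endif()

ExternalProject_Add(
  extern_demo
  URL https://example.com/demo-1.0.tar.gz   # placeholder archive
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ""
  INSTALL_COMMAND "")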
2 changes: 1 addition & 1 deletion paddle/cinn/adt/simplify_value.cc
@@ -21,7 +21,7 @@
#include "paddle/cinn/adt/index_expr_infer_context.h"
#include "paddle/cinn/adt/match.h"
#include "paddle/cinn/adt/simplify_value.h"
#include "paddle/pir/include/dialect/shape/utils/dim_expr_simplify.h"
#include "paddle/pir/include/dialect/shape/utils/dim_expr_util.h"

namespace cinn::adt {

86 changes: 73 additions & 13 deletions paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -22,6 +22,7 @@
#include "paddle/cinn/optim/replace_var_with_expr.h"

PD_DECLARE_bool(cinn_new_group_scheduler);
PD_DECLARE_bool(group_schedule_tiling_first);
PD_DECLARE_bool(cinn_bucket_compile);

namespace cinn {
@@ -93,9 +94,21 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
std::vector<ir::Expr> iter_values;
// reduce body and reduce init schedule block should have different objects
// for same axis so we re-create objects
VLOG(4) << "FLAGS_group_schedule_tiling_first = "
<< FLAGS_group_schedule_tiling_first;
std::vector<Var> axis_vars = cinn::common::GenDefaultAxis(axis_len);
const std::vector<ir::Var>& reduce_axis = tensor->reduce_axis;
VLOG(4) << "ast gen: tensor init_body is " << init_body;
for (int i = 0; i < shape.size(); ++i) {
if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
bool is_keep_dim = axis[i]->is_keepdim;
if (FLAGS_group_schedule_tiling_first && is_keep_dim) {
// if tiling first, we need to replace the reduce axis with 0, but don't
// deal with the non-reduce axis
optim::ReplaceVarWithExpr(&init_body, axis[i], Expr(0));
continue;
}
if (!FLAGS_group_schedule_tiling_first &&
FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
optim::ReplaceVarWithExpr(&init_body, axis[i], Expr(0));
continue;
}
@@ -105,29 +118,41 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
/*is_reduce = */ false));
optim::ReplaceVarWithExpr(&init_body, axis[i], block_vars.back());
axis_vars[i]->is_reduce_axis = false;
if (shape[i] == Expr(1)) {
if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) {
iter_values.push_back(Expr(0));
} else {
iter_values.push_back(axis_vars[i]);
}
}
VLOG(4) << "iter_value.size() and block_vars.size() is "
<< iter_values.size() << " " << block_vars.size();
init_body = ir::ScheduleBlockRealize::Make(
iter_values,
ir::ScheduleBlock::Make(
block_vars, {}, {}, reduce_init_name, init_body));

// For the remaining reduce axis, make reduce body
const std::vector<ir::Var>& reduce_axis = tensor->reduce_axis;
ir::Expr reduce_body =
ConvertReduceBody(tensor->body(), tensor, axis_exprs);

VLOG(4) << "ast gen: reduce body is " << reduce_body;

// create schedule block itervars, i0,i1...
std::vector<ir::Var> reduce_block_vars;
std::vector<ir::Expr> reduce_iter_values;
// reduce body and reduce init schedule block should have different objects
// for same axis so we re-create objects
std::vector<Var> reduce_axis_vars = cinn::common::GenDefaultAxis(axis_len);
for (int i = 0; i < shape.size(); ++i) {
if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
bool is_keep_dim = axis[i]->is_keepdim;
if (FLAGS_group_schedule_tiling_first && is_keep_dim) {
// if tiling first, we need to replace the reduce axis with 0, but don't
// deal with the non-reduce axis
optim::ReplaceVarWithExpr(&reduce_body, axis[i], Expr(0));
continue;
}
if (!FLAGS_group_schedule_tiling_first &&
FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
optim::ReplaceVarWithExpr(&reduce_body, axis[i], Expr(0));
continue;
}
@@ -136,12 +161,13 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
cinn::UniqName("i" + std::to_string(i)),
/*is_reduce = */ false));
reduce_axis_vars[i]->is_reduce_axis = false;
if (shape[i] == Expr(1)) {
if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) {
reduce_iter_values.push_back(Expr(0));
} else {
reduce_iter_values.push_back(axis_vars[i]);
}
}
VLOG(4) << "ast gen: reduce body is after replace 0" << reduce_body;
for (int i = 0; i < reduce_axis.size(); ++i) {
int count = shape.size() + i;
reduce_block_vars.push_back(
@@ -155,14 +181,43 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
}

int non_zero_axis_size = 0;
for (int i = 0; i < axis.size(); ++i) {
if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
continue;
if (FLAGS_group_schedule_tiling_first) {
std::vector<ir::Var> non_reduce_axis_vars = [&]() {
std::vector<ir::Var> res;
for (int i = 0; i < shape.size(); ++i) {
bool is_keep_dim = axis[i]->is_keepdim;
if (!is_keep_dim) {
res.push_back(axis[i]);
}
}
return res;
}();
for (int i = 0; i < non_reduce_axis_vars.size(); ++i) {
optim::ReplaceVarWithExpr(
&reduce_body, non_reduce_axis_vars[i], reduce_block_vars[i]);
++non_zero_axis_size;
}
optim::ReplaceVarWithExpr(
&reduce_body, axis[i], reduce_block_vars[non_zero_axis_size]);
++non_zero_axis_size;
} else {
for (int i = 0; i < axis.size(); ++i) {
if (!FLAGS_group_schedule_tiling_first &&
FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
continue;
}
optim::ReplaceVarWithExpr(
&reduce_body, axis[i], reduce_block_vars[non_zero_axis_size]);
++non_zero_axis_size;
}
}

VLOG(4) << "to replace : " << non_zero_axis_size << " "
<< reduce_block_vars.size();
for (auto i = 0; i < reduce_block_vars.size(); i++) {
VLOG(4) << "reduce_block_vars[" << i << "] = " << reduce_block_vars[i];
}
for (auto i = 0; i < reduce_axis.size(); i++) {
VLOG(4) << "reduce_axis[" << i << "] = " << reduce_axis[i];
}
VLOG(4) << "before replace body: " << reduce_body;
for (int i = non_zero_axis_size; i < reduce_block_vars.size(); ++i) {
optim::ReplaceVarWithExpr(&reduce_body,
reduce_axis[i - non_zero_axis_size],
@@ -185,7 +240,12 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
// Put the two parts together
ir::Expr body = ir::Block::Make({init_body, reduce_body});
for (int i = static_cast<int>(axis_len) - 1; i >= 0; --i) {
if (!FLAGS_cinn_bucket_compile && shape[i] == Expr(1)) {
bool is_keep_dim = axis[i]->is_keepdim;
if (FLAGS_group_schedule_tiling_first && is_keep_dim) {
continue;
}
if ((!FLAGS_group_schedule_tiling_first || !FLAGS_cinn_bucket_compile) &&
shape[i] == Expr(1)) {
continue;
}
ir::Var loop_var = axis[i];
@@ -210,7 +270,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
Expr(0), shape[i], cinn::UniqName("i" + std::to_string(i)), false));
optim::ReplaceVarWithExpr(&body, axis[i], block_vars[i]);
axis_vars[i]->is_reduce_axis = false;
if (shape[i] == Expr(1)) {
if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) {
iter_values.push_back(Expr(0));
} else {
iter_values.push_back(axis_vars[i]);
2 changes: 2 additions & 0 deletions paddle/cinn/backends/codegen_cuda_dev.cc
@@ -21,6 +21,7 @@
#include <set>
#include <unordered_set>

#include "paddle/cinn/common/cas.h"
#include "paddle/cinn/common/ir_util.h"
#include "paddle/cinn/ir/op/ir_operators.h"
#include "paddle/cinn/ir/utils/ir_verify.h"
@@ -124,6 +125,7 @@ std::vector<Expr> FilterDeallocTempBuffers(const std::vector<Expr> &frees) {
bool has_symbolic_constant = false;
const ir::_Buffer_ *buffer = op->destination.As<ir::_Buffer_>();
for (Expr shape : buffer->shape) {
shape = common::AutoSimplify(shape);
ir::ir_utils::CollectIRNodes(shape, [&](const Expr *x) {
if (x->as_var()) {
CHECK(x->as_var()->is_symbolic_constant)
4 changes: 1 addition & 3 deletions paddle/cinn/common/CMakeLists.txt
@@ -23,8 +23,7 @@ gather_srcs(
nvgpu_dev_info.cc
integer_set.cc
dim_expr_converter.cc
broadcast_tree.cc
dim_expr_util.cc)
broadcast_tree.cc)

cinn_cc_test(test_equation_graph_topo_walker SRCS
equation_graph_topo_walker_test.cc DEPS gtest glog)
@@ -49,7 +48,6 @@ if(WITH_CUDA)
gtest glog)
endif()
if(NOT CINN_ONLY)
cinn_cc_test(dim_expr_util_test SRCS dim_expr_util_test.cc DEPS cinncore)
cinn_cc_test(dim_expr_converter_test SRCS dim_expr_converter_test.cc DEPS
cinncore)
cinn_cc_test(broadcast_tree_test SRCS broadcast_tree_test.cc DEPS cinncore)
