
Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cos_double_grad

rebase
YibinLiu666 committed Mar 13, 2024
2 parents adb7d51 + 53bfb21 commit c917b6f
Showing 1,034 changed files with 40,475 additions and 16,735 deletions.
2 changes: 1 addition & 1 deletion .flake8
@@ -22,7 +22,7 @@ ignore =
E741
per-file-ignores =
# These files need tabs for testing.
test/dygraph_to_static/test_legacy_error.py:E101
test/dygraph_to_static/test_error.py:E101

# Ignore compare with True in sot unittest
test/sot/test_dup_top.py:E712
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -36,7 +36,7 @@ repos:
# Exclude some unit test files that require tabs.
exclude: |
(?x)^(
test/dygraph_to_static/test_legacy_error.py
test/dygraph_to_static/test_error.py
)$
- repo: local
hooks:
10 changes: 8 additions & 2 deletions CMakeLists.txt
@@ -65,7 +65,8 @@ option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
option(WITH_SHARED_PHI "Compile PaddlePaddle with SHARED LIB of PHI" ON)
option(CINN_ONLY "Compile CINN only in Paddle" OFF)
option(CINN_WITH_CUDNN "Compile CINN with CUDNN support" ON)

option(WITH_PIP_CUDA_LIBRARIES
"Paddle uses the CUDA library provided by NVIDIA" OFF)
find_package(Git REQUIRED)

# config GIT_URL with github mirrors to speed up dependent repos clone
@@ -97,11 +98,16 @@ endif()

if(WITH_GPU AND NOT APPLE)
#(Note risemeup1): The cudart dynamic library libcudart.so is used by set CUDA_USE_STATIC_CUDA_RUNTIME and CMAKE_CUDA_FLAGS
if(LINUX)
if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL
"x86_64")
set(CUDA_USE_STATIC_CUDA_RUNTIME
OFF
CACHE BOOL "" FORCE)
set(CMAKE_CUDA_FLAGS "--cudart shared")
if(WITH_PIP_CUDA_LIBRARIES)
#(Note risemeup1): Flag 'WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA.
add_definitions(-DWITH_PIP_CUDA_LIBRARIES)
endif()
endif()
enable_language(CUDA)
message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: "
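The hunk above wires the new WITH_PIP_CUDA_LIBRARIES option into the build: on Linux x86_64 GPU builds it switches to the shared CUDA runtime and passes a -DWITH_PIP_CUDA_LIBRARIES compile definition that dynamic_loader.cc checks when searching for the CUDA .so files installed through NVIDIA's pip packages. A minimal sketch of the same pattern in a stand-alone project (the flag wiring and configure command below are illustrative, not taken from this commit):

# Sketch only: expose an option and forward it to C++ as a preprocessor
# definition, mirroring the shape of the change above.
option(WITH_PIP_CUDA_LIBRARIES
       "Use the CUDA libraries provided by NVIDIA's pip packages" OFF)

if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
  if(WITH_PIP_CUDA_LIBRARIES)
    # The C++ loader can then branch on #ifdef WITH_PIP_CUDA_LIBRARIES and
    # look under the Python site-packages nvidia/*/lib directories first.
    add_definitions(-DWITH_PIP_CUDA_LIBRARIES)
  endif()
endif()

# Illustrative configure command:
#   cmake .. -DWITH_GPU=ON -DWITH_PIP_CUDA_LIBRARIES=ON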
4 changes: 2 additions & 2 deletions cmake/export_paddle_header.cmake
@@ -27,7 +27,7 @@ function(header_path_compat TARGET_PATH)
"${HEADER_CONTENT}")
string(REPLACE "paddle/fluid/pir/drr/include/" "paddle/pir/drr/"
HEADER_CONTENT "${HEADER_CONTENT}")
string(REPLACE "paddle/fluid/pir/transforms/" "paddle/pir/transforms/"
string(REPLACE "paddle/fluid/pir/utils/" "paddle/pir/utils/"
HEADER_CONTENT "${HEADER_CONTENT}")
file(WRITE ${header} "${HEADER_CONTENT}")
message(STATUS "header path compat processing complete: ${header}")
@@ -65,7 +65,7 @@ header_path_compat(
header_path_compat(
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/pattern_rewrite)
header_path_compat(
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/transforms)
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/utils)

# NOTE(liuyuanle): In inference lib, no need include paddle/utils/pybind.h, so we delete this.
file(READ ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/extension.h
20 changes: 6 additions & 14 deletions cmake/external/eigen.cmake
@@ -39,27 +39,19 @@ elseif(LINUX)
endif()
endif()

if(CMAKE_COMPILER_IS_GNUCC)
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorRandom.h.patch
tensor_random_header)
# See: [Why calling some `git` commands before `patch`?]
set(EIGEN_PATCH_COMMAND
git checkout -- . && git checkout ${EIGEN_TAG} && patch -Nd
${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor <
${tensor_random_header})
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpfullversion -dumpversion
OUTPUT_VARIABLE GCC_VERSION)
string(REGEX MATCHALL "[0-9]+" GCC_VERSION_COMPONENTS ${GCC_VERSION})
list(GET GCC_VERSION_COMPONENTS 0 GCC_MAJOR)
list(GET GCC_VERSION_COMPONENTS 1 GCC_MINOR)
set(GCC_VERSION "${GCC_MAJOR}.${GCC_MINOR}")
if(GCC_VERSION GREATER_EQUAL 12.0)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Complex.h.patch
complex_header)
set(EIGEN_PATCH_COMMAND
${EIGEN_PATCH_COMMAND} && patch -Nd
${SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < ${complex_header})
endif()
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Complex.h.patch
complex_header)
set(EIGEN_PATCH_COMMAND
${EIGEN_PATCH_COMMAND} && patch -Nd
${SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < ${complex_header})
endif()

set(EIGEN_INCLUDE_DIR ${SOURCE_DIR})
28 changes: 10 additions & 18 deletions cmake/external/gloo.cmake
@@ -42,24 +42,16 @@ if(WITH_GPU)
endif()
endif()

if(CMAKE_COMPILER_IS_GNUCC)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpfullversion -dumpversion
OUTPUT_VARIABLE GCC_VERSION)
string(REGEX MATCHALL "[0-9]+" GCC_VERSION_COMPONENTS ${GCC_VERSION})
list(GET GCC_VERSION_COMPONENTS 0 GCC_MAJOR)
list(GET GCC_VERSION_COMPONENTS 1 GCC_MINOR)
set(GCC_VERSION "${GCC_MAJOR}.${GCC_MINOR}")
if(GCC_VERSION GREATER_EQUAL "12.0")
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/device.cc.patch
native_dst)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/types.h.patch
types_header)
# See: [Why calling some `git` commands before `patch`?]
set(GLOO_PATCH_COMMAND
git checkout -- . && git checkout ${GLOO_TAG} && patch -Nd
${SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && patch -Nd
${SOURCE_DIR}/gloo/ < ${types_header})
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/device.cc.patch
native_dst)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/types.h.patch
types_header)
# See: [Why calling some `git` commands before `patch`?]
set(GLOO_PATCH_COMMAND
git checkout -- . && git checkout ${GLOO_TAG} && patch -Nd
${SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && patch -Nd
${SOURCE_DIR}/gloo/ < ${types_header})
endif()

file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/linux.cc.patch
2 changes: 1 addition & 1 deletion cmake/external/pslib.cmake
@@ -69,7 +69,7 @@ ExternalProject_Add(
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT}
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${PSLIB_LIB})
BUILD_BYPRODUCTS ${PSLIB_LIB} ${JVM_LIB})

add_library(pslib SHARED IMPORTED GLOBAL)
set_property(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
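The pslib.cmake change only extends BUILD_BYPRODUCTS. Declaring ${JVM_LIB} there tells CMake, and in particular the Ninja generator, that the external project also produces that library, so later targets that import it do not trip over an unknown output. A minimal sketch under assumed paths (the URL and file names are placeholders, not Paddle's real ones):

include(ExternalProject)

set(PSLIB_INSTALL_ROOT ${CMAKE_BINARY_DIR}/third_party/pslib)
set(PSLIB_LIB ${PSLIB_INSTALL_ROOT}/lib/libps.so)   # placeholder path
set(JVM_LIB ${PSLIB_INSTALL_ROOT}/lib/libjvm.so)    # placeholder path

ExternalProject_Add(
  extern_pslib
  URL https://example.com/pslib.tar.gz              # placeholder URL
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT}
  # Listing every produced library keeps Ninja's dependency graph complete.
  BUILD_BYPRODUCTS ${PSLIB_LIB} ${JVM_LIB})

add_library(pslib SHARED IMPORTED GLOBAL)
set_property(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})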
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -29,7 +29,7 @@ if(NOT DEFINED XPU_BASE_DATE)
set(XPU_BASE_DATE "20240104")
endif()
if(NOT DEFINED XPU_XHPC_BASE_DATE)
set(XPU_XHPC_BASE_DATE "20240226")
set(XPU_XHPC_BASE_DATE "20240312")
endif()
set(XPU_XCCL_BASE_VERSION "1.1.8.1")
if(NOT DEFINED XPU_XFT_BASE_VERSION)
4 changes: 2 additions & 2 deletions cmake/inference_lib.cmake
@@ -392,8 +392,8 @@ copy(
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/drr/)
copy(
inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/transforms/transform_general_functions.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/transforms/)
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/utils/general_functions.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/pir/utils/)

# the include path of paddle needs to be changed to adapt to inference api path
add_custom_command(
4 changes: 1 addition & 3 deletions cmake/simd.cmake
@@ -4,9 +4,7 @@
include(CheckCXXSourceRuns)
include(CheckCXXSourceCompiles)

if(CMAKE_COMPILER_IS_GNUCC
OR CMAKE_COMPILER_IS_GNUCXX
OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
set(MMX_FLAG "-mmmx")
set(SSE2_FLAG "-msse2")
set(SSE3_FLAG "-msse3")
5 changes: 5 additions & 0 deletions cmake/third_party.cmake
@@ -15,6 +15,11 @@
include(ExternalProject)
# Create a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac)

# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif()

set(THIRD_PARTY_PATH
"${CMAKE_BINARY_DIR}/third_party"
CACHE STRING
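The third_party.cmake addition silences a warning introduced in CMake 3.24: policy CMP0135 governs whether files extracted from a downloaded archive keep the archive's timestamps (OLD) or get the extraction time (NEW), and URL-based downloads in ExternalProject/FetchContent warn until the project picks one. A small self-contained illustration, assuming a throwaway project (the archive URL is a placeholder):

cmake_minimum_required(VERSION 3.18)
project(third_party_policy_demo)

include(ExternalProject)

# Without this (or an explicit DOWNLOAD_EXTRACT_TIMESTAMP argument per
# download), CMake >= 3.24 warns about CMP0135 for URL-based downloads.
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
  cmake_policy(SET CMP0135 NEW)
endif()

ExternalProject_Add(
  extern_demo
  URL https://example.com/demo-1.0.tar.gz   # placeholder archive
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ""
  INSTALL_COMMAND "")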
2 changes: 1 addition & 1 deletion paddle/cinn/adt/simplify_value.cc
@@ -21,7 +21,7 @@
#include "paddle/cinn/adt/index_expr_infer_context.h"
#include "paddle/cinn/adt/match.h"
#include "paddle/cinn/adt/simplify_value.h"
#include "paddle/pir/include/dialect/shape/utils/dim_expr_simplify.h"
#include "paddle/pir/include/dialect/shape/utils/dim_expr_util.h"

namespace cinn::adt {

86 changes: 73 additions & 13 deletions paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -22,6 +22,7 @@
#include "paddle/cinn/optim/replace_var_with_expr.h"

PD_DECLARE_bool(cinn_new_group_scheduler);
PD_DECLARE_bool(group_schedule_tiling_first);
PD_DECLARE_bool(cinn_bucket_compile);

namespace cinn {
@@ -93,9 +94,21 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
std::vector<ir::Expr> iter_values;
// reduce body and reduce init schedule block should have different objects
// for same axis so we re-create objects
VLOG(4) << "FLAGS_group_schedule_tiling_first = "
<< FLAGS_group_schedule_tiling_first;
std::vector<Var> axis_vars = cinn::common::GenDefaultAxis(axis_len);
const std::vector<ir::Var>& reduce_axis = tensor->reduce_axis;
VLOG(4) << "ast gen: tensor init_body is " << init_body;
for (int i = 0; i < shape.size(); ++i) {
if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
bool is_keep_dim = axis[i]->is_keepdim;
if (FLAGS_group_schedule_tiling_first && is_keep_dim) {
// if tiling first, we need to replace the reduce axis with 0, but don't
// deal with the non-reduce axis
optim::ReplaceVarWithExpr(&init_body, axis[i], Expr(0));
continue;
}
if (!FLAGS_group_schedule_tiling_first &&
FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
optim::ReplaceVarWithExpr(&init_body, axis[i], Expr(0));
continue;
}
@@ -105,29 +118,41 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
/*is_reduce = */ false));
optim::ReplaceVarWithExpr(&init_body, axis[i], block_vars.back());
axis_vars[i]->is_reduce_axis = false;
if (shape[i] == Expr(1)) {
if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) {
iter_values.push_back(Expr(0));
} else {
iter_values.push_back(axis_vars[i]);
}
}
VLOG(4) << "iter_value.size() and block_vars.size() is "
<< iter_values.size() << " " << block_vars.size();
init_body = ir::ScheduleBlockRealize::Make(
iter_values,
ir::ScheduleBlock::Make(
block_vars, {}, {}, reduce_init_name, init_body));

// For the remaining reduce axis, make reduce body
const std::vector<ir::Var>& reduce_axis = tensor->reduce_axis;
ir::Expr reduce_body =
ConvertReduceBody(tensor->body(), tensor, axis_exprs);

VLOG(4) << "ast gen: reduce body is " << reduce_body;

// create schedule block itervars, i0,i1...
std::vector<ir::Var> reduce_block_vars;
std::vector<ir::Expr> reduce_iter_values;
// reduce body and reduce init schedule block should have different objects
// for same axis so we re-create objects
std::vector<Var> reduce_axis_vars = cinn::common::GenDefaultAxis(axis_len);
for (int i = 0; i < shape.size(); ++i) {
if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
bool is_keep_dim = axis[i]->is_keepdim;
if (FLAGS_group_schedule_tiling_first && is_keep_dim) {
// if tiling first, we need to replace the reduce axis with 0, but don't
// deal with the non-reduce axis
optim::ReplaceVarWithExpr(&reduce_body, axis[i], Expr(0));
continue;
}
if (!FLAGS_group_schedule_tiling_first &&
FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
optim::ReplaceVarWithExpr(&reduce_body, axis[i], Expr(0));
continue;
}
@@ -136,12 +161,13 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
cinn::UniqName("i" + std::to_string(i)),
/*is_reduce = */ false));
reduce_axis_vars[i]->is_reduce_axis = false;
if (shape[i] == Expr(1)) {
if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) {
reduce_iter_values.push_back(Expr(0));
} else {
reduce_iter_values.push_back(axis_vars[i]);
}
}
VLOG(4) << "ast gen: reduce body is after replace 0" << reduce_body;
for (int i = 0; i < reduce_axis.size(); ++i) {
int count = shape.size() + i;
reduce_block_vars.push_back(
@@ -155,14 +181,43 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
}

int non_zero_axis_size = 0;
for (int i = 0; i < axis.size(); ++i) {
if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
continue;
if (FLAGS_group_schedule_tiling_first) {
std::vector<ir::Var> non_reduce_axis_vars = [&]() {
std::vector<ir::Var> res;
for (int i = 0; i < shape.size(); ++i) {
bool is_keep_dim = axis[i]->is_keepdim;
if (!is_keep_dim) {
res.push_back(axis[i]);
}
}
return res;
}();
for (int i = 0; i < non_reduce_axis_vars.size(); ++i) {
optim::ReplaceVarWithExpr(
&reduce_body, non_reduce_axis_vars[i], reduce_block_vars[i]);
++non_zero_axis_size;
}
optim::ReplaceVarWithExpr(
&reduce_body, axis[i], reduce_block_vars[non_zero_axis_size]);
++non_zero_axis_size;
} else {
for (int i = 0; i < axis.size(); ++i) {
if (!FLAGS_group_schedule_tiling_first &&
FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) {
continue;
}
optim::ReplaceVarWithExpr(
&reduce_body, axis[i], reduce_block_vars[non_zero_axis_size]);
++non_zero_axis_size;
}
}

VLOG(4) << "to replace : " << non_zero_axis_size << " "
<< reduce_block_vars.size();
for (auto i = 0; i < reduce_block_vars.size(); i++) {
VLOG(4) << "reduce_block_vars[" << i << "] = " << reduce_block_vars[i];
}
for (auto i = 0; i < reduce_axis.size(); i++) {
VLOG(4) << "reduce_axis[" << i << "] = " << reduce_axis[i];
}
VLOG(4) << "before replace body: " << reduce_body;
for (int i = non_zero_axis_size; i < reduce_block_vars.size(); ++i) {
optim::ReplaceVarWithExpr(&reduce_body,
reduce_axis[i - non_zero_axis_size],
@@ -185,7 +240,12 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
// Put the two parts together
ir::Expr body = ir::Block::Make({init_body, reduce_body});
for (int i = static_cast<int>(axis_len) - 1; i >= 0; --i) {
if (!FLAGS_cinn_bucket_compile && shape[i] == Expr(1)) {
bool is_keep_dim = axis[i]->is_keepdim;
if (FLAGS_group_schedule_tiling_first && is_keep_dim) {
continue;
}
if ((!FLAGS_group_schedule_tiling_first || !FLAGS_cinn_bucket_compile) &&
shape[i] == Expr(1)) {
continue;
}
ir::Var loop_var = axis[i];
@@ -210,7 +270,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
Expr(0), shape[i], cinn::UniqName("i" + std::to_string(i)), false));
optim::ReplaceVarWithExpr(&body, axis[i], block_vars[i]);
axis_vars[i]->is_reduce_axis = false;
if (shape[i] == Expr(1)) {
if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) {
iter_values.push_back(Expr(0));
} else {
iter_values.push_back(axis_vars[i]);
2 changes: 2 additions & 0 deletions paddle/cinn/backends/codegen_cuda_dev.cc
@@ -21,6 +21,7 @@
#include <set>
#include <unordered_set>

#include "paddle/cinn/common/cas.h"
#include "paddle/cinn/common/ir_util.h"
#include "paddle/cinn/ir/op/ir_operators.h"
#include "paddle/cinn/ir/utils/ir_verify.h"
@@ -124,6 +125,7 @@ std::vector<Expr> FilterDeallocTempBuffers(const std::vector<Expr> &frees) {
bool has_symbolic_constant = false;
const ir::_Buffer_ *buffer = op->destination.As<ir::_Buffer_>();
for (Expr shape : buffer->shape) {
shape = common::AutoSimplify(shape);
ir::ir_utils::CollectIRNodes(shape, [&](const Expr *x) {
if (x->as_var()) {
CHECK(x->as_var()->is_symbolic_constant)
4 changes: 1 addition & 3 deletions paddle/cinn/common/CMakeLists.txt
@@ -23,8 +23,7 @@ gather_srcs(
nvgpu_dev_info.cc
integer_set.cc
dim_expr_converter.cc
broadcast_tree.cc
dim_expr_util.cc)
broadcast_tree.cc)

cinn_cc_test(test_equation_graph_topo_walker SRCS
equation_graph_topo_walker_test.cc DEPS gtest glog)
@@ -49,7 +48,6 @@ if(WITH_CUDA)
gtest glog)
endif()
if(NOT CINN_ONLY)
cinn_cc_test(dim_expr_util_test SRCS dim_expr_util_test.cc DEPS cinncore)
cinn_cc_test(dim_expr_converter_test SRCS dim_expr_converter_test.cc DEPS
cinncore)
cinn_cc_test(broadcast_tree_test SRCS broadcast_tree_test.cc DEPS cinncore)
