Merge branch 'develop' into rm_pybind_parallel_executor
ccsuzzh authored Apr 3, 2024
2 parents bd56f62 + d3f3c22 commit a95a68e
Showing 587 changed files with 6,353 additions and 5,832 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -57,7 +57,7 @@ repos:
- id: black
files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
- repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.0
+    rev: v0.3.5
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix, --no-cache]
1 change: 1 addition & 0 deletions cmake/flags.cmake
@@ -167,6 +167,7 @@ if(NOT WIN32)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(COMMON_FLAGS
${COMMON_FLAGS}
+      -Wno-error=unknown-warning-option # For some unknown warning options in lower version clang
-Wno-error=unused-private-field
-Wno-error=unused-const-variable
-Wno-error=deprecated-copy-with-user-provided-copy # For three/five/zeros rule, clang
86 changes: 43 additions & 43 deletions cmake/operators.cmake
@@ -102,42 +102,42 @@ function(register_cu_kernel TARGET)
endforeach()
endfunction()

-# Just for those mkldnn kernels locating at "fluid/operators/mkldnn/", such as 'layer_norm_mkldnn_op.cc'.
+# Just for those onednn kernels locating at "fluid/operators/onednn/", such as 'layer_norm_onednn_op.cc'.
# Add other file modes if need in the future.
-function(register_mkldnn_kernel TARGET)
+function(register_onednn_kernel TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(register_mkldnn_kernel "${options}" "${oneValueArgs}"
+  cmake_parse_arguments(register_onednn_kernel "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})

-  set(mkldnn_cc_srcs)
+  set(onednn_cc_srcs)
set(op_common_deps operator op_registry phi layer
common_infer_shape_functions)
-  foreach(mkldnn_src ${register_mkldnn_kernel_SRCS})
-    if(${mkldnn_src} MATCHES ".*_mkldnn_op.cc$")
-      list(APPEND mkldnn_cc_srcs mkldnn/${mkldnn_src})
+  foreach(onednn_src ${register_onednn_kernel_SRCS})
+    if(${onednn_src} MATCHES ".*_onednn_op.cc$")
+      list(APPEND onednn_cc_srcs onednn/${onednn_src})
endif()
endforeach()
-  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-  if(${mkldnn_cc_srcs_len} EQUAL 0)
+  list(LENGTH onednn_cc_srcs onednn_cc_srcs_len)
+  if(${onednn_cc_srcs_len} EQUAL 0)
message(
FATAL_ERROR
- "The MKLDNN kernel file of ${TARGET} should contains at least one *.*_mkldnn_op.cc file"
+ "The MKLDNN kernel file of ${TARGET} should contains at least one *.*_onednn_op.cc file"
)
endif()
if(WITH_MKLDNN)
cc_library(
${TARGET}
-    SRCS ${mkldnn_cc_srcs}
+    SRCS ${onednn_cc_srcs}
DEPS ${op_library_DEPS} ${op_common_deps})
endif()
set(OP_LIBRARY
${TARGET} ${OP_LIBRARY}
CACHE INTERNAL "op libs")
-  foreach(mkldnn_src ${mkldnn_cc_srcs})
+  foreach(onednn_src ${onednn_cc_srcs})
set(op_name "")
-    find_register(${mkldnn_src} "REGISTER_OP_KERNEL" op_name)
+    find_register(${onednn_src} "REGISTER_OP_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n")
endif()
@@ -161,7 +161,7 @@ function(op_library TARGET)
set(miopen_cu_srcs)
set(CUDNN_FILE)
set(MIOPEN_FILE)
-  set(mkldnn_cc_srcs)
+  set(onednn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry phi layer
common_infer_shape_functions)
@@ -238,9 +238,9 @@ function(op_library TARGET)
endif()
endif()
if(WITH_MKLDNN)
-    string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
-    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
-      list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
+    string(REPLACE "_op" "_onednn_op" MKLDNN_FILE "${TARGET}")
+    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/onednn/${MKLDNN_FILE}.cc)
+      list(APPEND onednn_cc_srcs onednn/${MKLDNN_FILE}.cc)
endif()
endif()
if(WITH_XPU)
@@ -275,8 +275,8 @@ function(op_library TARGET)
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
-    elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
-      list(APPEND mkldnn_cc_srcs ${src})
+    elseif(WITH_MKLDNN AND ${src} MATCHES ".*_onednn_op.cc$")
+      list(APPEND onednn_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$")
@@ -349,7 +349,7 @@ function(op_library TARGET)
if(WITH_UNITY_BUILD AND op_library_UNITY)
# Combine the cc and cu source files.
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs}
-                                 ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs})
+                                 ${cudnn_cu_cc_srcs} ${onednn_cc_srcs})
compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs}
${cu_srcs})
if(TARGET ${UNITY_TARGET})
@@ -369,7 +369,7 @@ function(op_library TARGET)
nv_library(
${TARGET}
SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs}
-             ${mkldnn_cc_srcs} ${cu_srcs}
+             ${onednn_cc_srcs} ${cu_srcs}
DEPS ${op_library_DEPS} ${op_common_deps})
endif()
elseif(WITH_ROCM)
@@ -389,19 +389,19 @@ function(op_library TARGET)
hip_library(
${TARGET}
SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs}
-             ${mkldnn_cc_srcs} ${hip_srcs}
+             ${onednn_cc_srcs} ${hip_srcs}
DEPS ${op_library_DEPS} ${op_common_deps})
elseif(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0)
xpu_library(
${TARGET}
-      SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
+      SRCS ${cc_srcs} ${onednn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
DEPS ${op_library_DEPS} ${op_common_deps})
else()
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
if(WITH_UNITY_BUILD AND op_library_UNITY)
# Combine the cc source files.
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs}
-                                   ${mkldnn_cc_srcs} ${xpu_cc_srcs})
+                                   ${onednn_cc_srcs} ${xpu_cc_srcs})
if(TARGET ${UNITY_TARGET})
# If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@@ -417,7 +417,7 @@ function(op_library TARGET)
else()
cc_library(
${TARGET}
-        SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}
+        SRCS ${cc_srcs} ${onednn_cc_srcs} ${xpu_cc_srcs}
DEPS ${op_library_DEPS} ${op_common_deps})
endif()
endif()
@@ -426,7 +426,7 @@ function(op_library TARGET)
list(LENGTH hip_srcs hip_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH hip_cc_srcs hip_cc_srcs_len)
-  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+  list(LENGTH onednn_cc_srcs onednn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)

@@ -463,7 +463,7 @@ function(op_library TARGET)
find_register(${cc_src} "REGISTER_OPERATOR" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
-      # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
+      # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in onednn
set(TARGET ${op_name})
set(pybind_flag 1)
endif()
@@ -474,7 +474,7 @@ function(op_library TARGET)
find_register(${cc_src} "REGISTER_ACTIVATION_OP" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
-      # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
+      # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in onednn
set(TARGET ${op_name})
set(pybind_flag 1)
endif()
@@ -483,7 +483,7 @@ function(op_library TARGET)
find_register(${cc_src} "REGISTER_OP_WITHOUT_GRADIENT" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
-      # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
+      # hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in onednn
set(TARGET ${op_name})
set(pybind_flag 1)
endif()
@@ -496,8 +496,8 @@ function(op_library TARGET)
# why change TARGET here?
# when building paddle with on_infer, the REGISTER_OPERATOR(*_grad) will be removed before compiling (see details in remove_grad_op_and_kernel.py)
# in elementwise_op.cc, it will find REGISTER_OPERATOR(grad_add) and set TARGET to grad_add
-      # and, in the following "mkldnn" part, it will add USE_OP_DEVICE_KERNEL(grad_add, MKLDNN) to pybind.h
-      # however, grad_add has no mkldnn kernel.
+      # and, in the following "onednn" part, it will add USE_OP_DEVICE_KERNEL(grad_add, MKLDNN) to pybind.h
+      # however, grad_add has no onednn kernel.
set(TARGET ${op_name})
set(pybind_flag 1)
endif()
@@ -520,16 +520,16 @@ function(op_library TARGET)
endif()
endforeach()

-  # pybind USE_OP_DEVICE_KERNEL for operators/mkldnn/*
-  list(APPEND mkldnn_srcs ${mkldnn_cc_srcs})
-  foreach(mkldnn_src ${mkldnn_srcs})
+  # pybind USE_OP_DEVICE_KERNEL for operators/onednn/*
+  list(APPEND onednn_srcs ${onednn_cc_srcs})
+  foreach(onednn_src ${onednn_srcs})
set(op_name "")
# Add PHI Kernel Registry Message
-    find_phi_register(${mkldnn_src} ${pybind_file} "PD_REGISTER_KERNEL")
-    find_phi_register(${mkldnn_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL")
-    find_phi_register(${mkldnn_src} ${pybind_file}
+    find_phi_register(${onednn_src} ${pybind_file} "PD_REGISTER_KERNEL")
+    find_phi_register(${onednn_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL")
+    find_phi_register(${onednn_src} ${pybind_file}
"PD_REGISTER_KERNEL_FOR_ALL_DTYPE")
-    find_register(${mkldnn_src} "REGISTER_OP_CUDA_KERNEL" op_name)
+    find_register(${onednn_src} "REGISTER_OP_CUDA_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
set(pybind_flag 1)
@@ -610,14 +610,14 @@ function(op_library TARGET)
endif()

# pybind USE_OP_DEVICE_KERNEL for MKLDNN
-  if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
+  if(WITH_MKLDNN AND ${onednn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
-    if(${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
+    if(${MKLDNN_FILE} STREQUAL "activation_onednn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(softplus, MKLDNN);\n")
else()
-      foreach(mkldnn_src ${mkldnn_cc_srcs})
+      foreach(onednn_src ${onednn_cc_srcs})
set(op_name "")
-        find_register(${mkldnn_src} "REGISTER_OP_KERNEL" op_name)
+        find_register(${onednn_src} "REGISTER_OP_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file}
"USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n")
@@ -666,7 +666,7 @@ function(register_operators)
GLOB OPS
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"*_op.cc")
-  string(REPLACE "_mkldnn" "" OPS "${OPS}")
+  string(REPLACE "_onednn" "" OPS "${OPS}")
string(REPLACE "_xpu" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
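For context, a call site for the renamed helper might look like the sketch below. The operator name and dependency list are illustrative, not taken from this commit; what the function requires is that the kernel source live in the operator's onednn/ subdirectory and match the *_onednn_op.cc pattern, since it prefixes each source with onednn/ and fails with FATAL_ERROR when no source matches.

    # Hypothetical CMakeLists.txt fragment registering a oneDNN kernel.
    # register_onednn_kernel() looks for onednn/layer_norm_onednn_op.cc,
    # builds it into a cc_library when WITH_MKLDNN is ON, and appends
    # USE_OP_DEVICE_KERNEL(<op>, MKLDNN); to the generated pybind file.
    register_onednn_kernel(
      layer_norm_onednn_op
      SRCS layer_norm_onednn_op.cc
      DEPS phi)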
28 changes: 20 additions & 8 deletions paddle/cinn/backends/codegen_c.cc
@@ -434,30 +434,37 @@ void CodeGenC::Visit(const ir::_Module_ *op) { CINN_NOT_IMPLEMENTED }
void CodeGenC::Visit(const ir::_Var_ *op) { str_ += op->name; }

void CodeGenC::Visit(const ir::Load *op) {
-  Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1);
+  ir::Expr offset = [&] {
+    if (load_to_offset_.count(op) == 0) {
+      load_to_offset_[op] = op->index();
+    }
+    return load_to_offset_.at(op);
+  }();
+
+  Expr dense_strided_ramp = detail::StridedRampBase(offset, 1);
if (dense_strided_ramp.defined()) { // Loading a continuous Ramp address.
CHECK(op->type().is_vector());
-    PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
+    PrintStackVecType(op->type().ElementOf(), offset.type().lanes());
str_ += "::";
str_ += "Load(";
str_ += op->tensor.As<ir::_Tensor_>()->name;
str_ += ",";
IrPrinter::Visit(dense_strided_ramp);
str_ += ")";
-  } else if (op->index().type().is_vector()) {
+  } else if (offset.type().is_vector()) {
// gather
CHECK(op->type().is_vector());
-    PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
+    PrintStackVecType(op->type().ElementOf(), offset.type().lanes());
str_ += "::Load(";
str_ += op->tensor.As<ir::_Tensor_>()->name;
str_ += ",";
-    IrPrinter::Visit(op->index());
+    IrPrinter::Visit(offset);
str_ += ")";
} else if (op->is_addr_tensor()) {
auto *tensor = op->tensor.As<ir::_Tensor_>();
str_ += tensor->name;
str_ += "[";
-    IrPrinter::Visit(op->index());
+    IrPrinter::Visit(offset);
str_ += "]";
} else {
IrPrinter::Visit(op);
@@ -466,12 +473,17 @@ void CodeGenC::Visit(const ir::Load *op) {

void CodeGenC::Visit(const ir::Store *op) {
CHECK(op->is_addr_tensor());

+  ir::Expr offset = [&] {
+    if (store_to_offset_.count(op) == 0) {
+      store_to_offset_[op] = op->index();
+    }
+    return store_to_offset_.at(op);
+  }();
auto *tensor = op->tensor.As<ir::_Tensor_>();
CHECK(tensor);
str_ += tensor->name;
str_ += "[";
-  IrPrinter::Visit(op->index());
+  IrPrinter::Visit(offset);
str_ += "]";
str_ += " = ";
IrPrinter::Visit(op->value);
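The new load_to_offset_ and store_to_offset_ members cache each node's index expression the first time it is visited, presumably so that a later pass can rewrite the offset for a given Load or Store node without mutating the IR node itself. A standalone sketch of that cache-on-first-use idiom, with the IR types replaced by plain strings (simplified stand-ins, not the Paddle API):

    #include <iostream>
    #include <string>
    #include <unordered_map>

    // Simplified stand-in for an IR load node.
    struct Load {
      std::string index;
    };

    int main() {
      // Side table mapping each node to its (possibly rewritten) offset.
      std::unordered_map<const Load*, std::string> load_to_offset;
      Load load{"i * 4 + j"};

      // Cache-on-first-use via an immediately invoked lambda, mirroring
      // the shape of the new code in CodeGenC::Visit.
      std::string offset = [&] {
        if (load_to_offset.count(&load) == 0) {
          load_to_offset[&load] = load.index;  // first visit: record the offset
        }
        return load_to_offset.at(&load);  // later visits: reuse the cached value
      }();

      std::cout << offset << "\n";  // prints: i * 4 + j
      return 0;
    }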
2 changes: 2 additions & 0 deletions paddle/cinn/backends/codegen_c.h
@@ -118,6 +118,8 @@ class CodeGenC : public ir::IrPrinter {
Target target_;
std::stringstream ss_;
bool inline_builtin_codes_{true};
+  std::unordered_map<const ir::Store*, ir::Expr> store_to_offset_;
+  std::unordered_map<const ir::Load*, ir::Expr> load_to_offset_;
};

namespace detail {
32 changes: 32 additions & 0 deletions paddle/cinn/backends/codegen_cuda_dev.cc
@@ -26,6 +26,7 @@
#include "paddle/cinn/ir/op/ir_operators.h"
#include "paddle/cinn/ir/utils/ir_verify.h"
#include "paddle/cinn/optim/ir_simplify.h"
+#include "paddle/common/errors.h"

namespace cinn {
namespace backends {
@@ -509,5 +510,36 @@ void CodeGenCUDA_Dev::Visit(const ir::Store *op) {
}
}

+ir::Expr CalculateSharedMemory(const ir::Buffer &buffer) {
+  Expr buffer_size(1);
+  for (int i = 0; i < buffer->shape.size(); i++) {
+    buffer_size = buffer_size * buffer->shape[i];
+  }
+  int type_bytes = buffer->dtype.bytes();
+  return buffer_size * Expr(type_bytes);
+}
+
+ir::Expr CalculateSharedMemory(const ir::Expr &func_expr) {
+  auto func = func_expr.as_lowered_func();
+  PADDLE_ENFORCE_NOT_NULL(
+      func, ::common::errors::InvalidType("expr is not a lowered_func"));
+  auto alloc_temp_buffers = func->PrepareAllocTempBufferExprs();
+  ir::Expr shm_size{0};
+  for (const auto &alloc : alloc_temp_buffers) {
+    PADDLE_ENFORCE_NOT_NULL(
+        alloc.As<ir::Alloc>(),
+        ::common::errors::InvalidType("expr is not a Alloc node"));
+    PADDLE_ENFORCE_NOT_NULL(
+        alloc.As<ir::Alloc>()->destination.as_buffer(),
+        ::common::errors::InvalidType("expr is not a Buffer node"));
+
+    auto buffer = alloc.As<ir::Alloc>()->destination.as_buffer_ref();
+    if (buffer->memory_type == ir::MemoryType::GPUShared) {
+      shm_size = shm_size + CalculateSharedMemory(buffer);
+    }
+  }
+  return common::AutoSimplify(shm_size);
+}

} // namespace backends
} // namespace cinn
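CalculateSharedMemory sums, over every temporary buffer allocated in GPU shared memory, the product of the buffer's shape extents times the element width in bytes. For static shapes the arithmetic reduces to the sketch below; this is a hypothetical plain-integer analogue for illustration, not the symbolic ir::Expr version above.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Hypothetical static-shape analogue of CalculateSharedMemory:
    // bytes = (product of shape extents) * bytes-per-element.
    int64_t SharedMemoryBytes(const std::vector<int64_t>& shape, int type_bytes) {
      int64_t buffer_size = 1;
      for (int64_t extent : shape) {
        buffer_size *= extent;
      }
      return buffer_size * type_bytes;
    }

    int main() {
      // A float32 buffer of shape [128, 32] in shared memory:
      // 128 * 32 * 4 = 16384 bytes.
      std::cout << SharedMemoryBytes({128, 32}, 4) << "\n";  // prints: 16384
      return 0;
    }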
2 changes: 2 additions & 0 deletions paddle/cinn/backends/codegen_cuda_dev.h
@@ -127,5 +127,7 @@ class CodeGenCUDA_Dev : public CodeGenC {
std::vector<ir::Buffer> dynamic_alloc_buffers_;
};

+ir::Expr CalculateSharedMemory(const ir::Expr& func_expr);

} // namespace backends
} // namespace cinn
7 changes: 1 addition & 6 deletions paddle/cinn/backends/codegen_cuda_util.cc
@@ -91,12 +91,7 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
ir::Var kernel_ptr(GenDeviceKernelName(func_node->name, predicate),
type_of<std::string>());

-  // shared_mem_bytes Can be calculated after codegen_cuda_dev buffer creation
-  // however, this make CodeGenCUDA_Dev before spliting the host and device
-  // module Maybe we could reorder the process.
-  CodeGenCUDA_Dev codegen_dev(cinn::common::DefaultNVGPUTarget());
-  codegen_dev.Compile(ir::LoweredFunc(func.as_lowered_func_ref()));
-  Expr shared_mem_bytes = codegen_dev.GetDynSharedMemOffset();
+  Expr shared_mem_bytes = CalculateSharedMemory(func);

VLOG(6) << "Add a call node for func_node->name " << func_node->name << "\n"
<< "grid_dim: (" << func_node->cuda_axis_info.grid_dim(0) << ", "
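With CalculateSharedMemory available, the host-side visitor no longer needs to instantiate CodeGenCUDA_Dev and compile the device function just to learn the dynamic shared-memory size; the size is now derived directly from the lowered function's temporary-buffer allocations, which resolves the ordering concern described in the removed comment.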