【Hackathon 6th Fundable Projects 3 No.97】fluid operator dgc #64285

Closed

wants to merge 35 commits into develop from fix_dgc

Changes from all commits (35 commits)
9e21acf
Fix
co63oc May 14, 2024
15b4a40
Fix
co63oc May 14, 2024
f45289c
Fix
co63oc May 14, 2024
7243625
Fix
co63oc May 14, 2024
1abb47a
Fix
co63oc May 14, 2024
4eabdb3
Fix
co63oc May 15, 2024
1ef4d71
Merge branch 'develop' into fix_dgc
co63oc May 15, 2024
f9e069a
ci
co63oc May 15, 2024
ab5fed6
Fix
co63oc May 16, 2024
e4f61b1
Fix
co63oc May 16, 2024
f3bd8c1
Merge branch 'develop' into fix_dgc
co63oc May 16, 2024
4508abf
ci
co63oc May 16, 2024
c197f8f
add index sample infer symbol (#64332)
phlrain May 17, 2024
b70e514
【Hackathon 6th Fundable Projects 3 No.109】Remove fluid operator dropo…
co63oc May 17, 2024
9fc4c56
Dump original pir code to original_programs.py (#64373)
tc20042008 May 17, 2024
f08017e
[Prim]Support mean_grad decompose in vjp (#64346)
Aurelius84 May 17, 2024
270b0a2
【Fix PIR Unittest No.66】refine pir unique_name and set_parameter (#64…
wanghuancoder May 17, 2024
3cf1d63
【Fix PIR Unittest No.74】Fix some test cast in PIR mode (#64350)
wanghuancoder May 17, 2024
4916033
[CINN] Add CreateConvertMEA2FA Pass (#64288)
jiahy0825 May 17, 2024
b4090d9
add dist branch for add_n op. (#64361)
winter-wang May 17, 2024
49332bb
【Fix PIR JIT SaveLoad Unittest No.12,22-23】modify test_se_resnet.py ,…
xiaoguoguo626807 May 17, 2024
bcf0625
【CINN】Modify gpu resource allocation config (#63138)
Vvsmile May 17, 2024
8321099
[CINN] Add more rules to cinn_to_pd_op pass (#64354)
chen2016013 May 17, 2024
806936f
add CUDA_BFLOAT16_AVALIABLE macro definition (#64372)
yuanlehome May 17, 2024
d3b786b
[CINN/Fusion] horizontal support dynamic shape and enhance fusion abi…
2742195759 May 17, 2024
39d680c
[CINN][New Hardware Update] extend SplitCudaAndHostModule (#64345)
DongBaiYue May 17, 2024
aa4847b
【Hackathon 6th Fundable Projects 3 No.102】fluid operator distributed_…
co63oc May 17, 2024
cf2be10
fix pir api 3 (#64319)
wanghuancoder May 17, 2024
70535a9
[XPU] update XHPC to 20240515 (#64326)
houj04 May 17, 2024
1a13d37
[CINN] Add constraint for while op (#64385)
zyfncg May 17, 2024
a35f2dd
Clean paddle/fluid/operators/elementwise/elementwise_xpu.h (#64394)
co63oc May 17, 2024
59aaeb8
【Hackathon 6th Fundable Projects 3 No.2】fluid operator adadelta (#64…
co63oc May 17, 2024
3fff1ec
【Hackathon 6th No.26】Enhance paddle.view functionality -part (#64205)
yinfan98 May 17, 2024
3d9f3f2
[AutoParallel] 3D parallel on MLP with PIR (#64369)
pkuzyc May 17, 2024
b65e881
Fix
co63oc May 18, 2024
13 changes: 13 additions & 0 deletions cmake/cuda.cmake
@@ -173,6 +173,8 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
message(STATUS "Add Define CUDA_BFLOAT16_AVALIABLE")
add_definitions("-DCUDA_BFLOAT16_AVALIABLE")
if(WITH_NV_JETSON)
set(cuda_arch_bin "87")
else()
@@ -183,6 +185,8 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
endif()
endif()
elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
message(STATUS "Add Define CUDA_BFLOAT16_AVALIABLE")
add_definitions("-DCUDA_BFLOAT16_AVALIABLE")
set(cuda_arch_bin "90")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
@@ -196,8 +200,17 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
to get a full wheel package to resolve this warning.
While, this version will still work on local GPU architecture.")
detect_installed_gpus(cuda_arch_bin)
if(${cuda_arch_bin} MATCHES "[ ]*(8\.0|8\.6|8\.9|9\.0)[ ]*")
message(STATUS "Add Define CUDA_BFLOAT16_AVALIABLE")
add_definitions("-DCUDA_BFLOAT16_AVALIABLE")
endif()
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(cuda_arch_bin ${CUDA_ARCH_BIN})

if(${CUDA_ARCH_BIN} MATCHES "[ ]*(80|86|89|90)[ ]*")
message(STATUS "Add Define CUDA_BFLOAT16_AVALIABLE")
add_definitions("-DCUDA_BFLOAT16_AVALIABLE")
endif()
endif()

if(NEW_RELEASE_JIT)
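
The cuda.cmake hunks above only toggle a preprocessor definition: builds targeting Ampere (SM 8.x) or Hopper (SM 9.0), whether selected by name, auto-detected, or listed manually, get -DCUDA_BFLOAT16_AVALIABLE. Below is a minimal, hedged C++ sketch (not part of this PR; the function name and messages are illustrative) of how downstream source code might key off that define.

    // Hypothetical consumer of the CUDA_BFLOAT16_AVALIABLE define added above.
    #include <cstdio>

    bool CudaBfloat16Available() {
    #ifdef CUDA_BFLOAT16_AVALIABLE
      return true;   // set by cmake for SM 8.0/8.6/8.9/9.0 builds
    #else
      return false;  // older architectures: fall back to fp16/fp32 kernels
    #endif
    }

    int main() {
      std::printf("CUDA bfloat16 path: %s\n",
                  CudaBfloat16Available() ? "enabled" : "disabled");
      return 0;
    }
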
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -32,7 +32,7 @@ if(NOT DEFINED XPU_XDNN_BASE_DATE)
set(XPU_XDNN_BASE_DATE "20240327")
endif()
if(NOT DEFINED XPU_XHPC_BASE_DATE)
set(XPU_XHPC_BASE_DATE "20240514")
set(XPU_XHPC_BASE_DATE "20240515")
endif()
set(XPU_XCCL_BASE_VERSION "1.2.0.5")
if(NOT DEFINED XPU_XFT_BASE_VERSION)
5 changes: 3 additions & 2 deletions paddle/cinn/backends/CMakeLists.txt
@@ -13,11 +13,12 @@ gather_srcs(
extern_func_protos.cc
extern_func_jit_register.cc
modular.cc
compiler.cc)
compiler.cc
codegen_device_util.cc)

if(WITH_CUDA)
add_subdirectory(nvrtc)
list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc codegen_cuda_util.cc)
list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc)
endif()

if(WITH_OPENMP)
2 changes: 1 addition & 1 deletion paddle/cinn/backends/codegen_cuda_generate_test.cc
@@ -21,7 +21,7 @@

#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/backends/llvm/execution_engine.h"
#include "paddle/cinn/backends/llvm/simple_jit.h"
2 changes: 1 addition & 1 deletion paddle/cinn/backends/codegen_cuda_host.cc
@@ -18,7 +18,7 @@
#include <string>
#include <unordered_map>

#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/extern_func_emitter_builtin.h"
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/backends/llvm/llvm_util.h"
paddle/cinn/backends/codegen_device_util.cc (renamed from paddle/cinn/backends/codegen_cuda_util.cc)
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"

#include "paddle/cinn/backends/cuda_util.h"
#include "paddle/cinn/common/cas.h"
@@ -22,7 +22,7 @@ PD_DECLARE_bool(cinn_bucket_compile);
namespace cinn {
namespace backends {

std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module) {
std::tuple<ir::Module, ir::Module> SplitDeviceAndHostModule(ir::Module module) {
if (FLAGS_cinn_bucket_compile) {
detail::CollectBucketStrategyHostFunctionVisitor visitor(module->name);
Expr expr(module);
@@ -91,7 +91,16 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
ir::Var kernel_ptr(GenDeviceKernelName(func_node->name, predicate),
type_of<std::string>());

Expr shared_mem_bytes = CalculateSharedMemory(func);
std::optional<Expr> shared_mem_bytes;
cinn::common::DefaultDeviceTarget().arch.Match(
[&](std::variant<common::UnknownArch, common::X86Arch, common::ARMArch>) {
CINN_NOT_IMPLEMENTED;
},
[&](common::NVGPUArch) {
#ifdef CINN_WITH_CUDA
shared_mem_bytes = CalculateSharedMemory(func);
#endif
});

VLOG(6) << "Add a call node for func_node->name " << func_node->name << "\n"
<< "grid_dim: (" << func_node->cuda_axis_info.grid_dim(0) << ", "
@@ -100,10 +109,18 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
<< "block_dim: (" << func_node->cuda_axis_info.block_dim(0) << ", "
<< func_node->cuda_axis_info.block_dim(1) << ", "
<< func_node->cuda_axis_info.block_dim(2) << "), "
<< "shared_mem: " << shared_mem_bytes;
<< "shared_mem: " << shared_mem_bytes.value();
std::optional<const char *> call_kernel;
cinn::common::DefaultDeviceTarget().arch.Match(
[&](std::variant<common::UnknownArch, common::X86Arch, common::ARMArch>) {
CINN_NOT_IMPLEMENTED;
},
[&](common::NVGPUArch) {
call_kernel = runtime::intrinsic::call_cuda_kernel;
});
ir::Expr call_extern_api =
ir::Call::Make(Void(),
runtime::intrinsic::call_cuda_kernel,
call_kernel.value(),
{kernel_ptr,
kernel_args_,
kernel_args_num_,
@@ -113,7 +130,7 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
func_node->cuda_axis_info.block_dim(0), // block_x
func_node->cuda_axis_info.block_dim(1), // block_y
func_node->cuda_axis_info.block_dim(2), // block_z
shared_mem_bytes, // shared_mem
shared_mem_bytes.value(), // shared_mem
kernel_stream_},
{},
ir::CallType::Extern,
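
For readers unfamiliar with the Arch::Match dispatch used in the hunks above, here is a self-contained sketch of the pattern: a std::variant-backed architecture tag is visited with one lambda per architecture, and the result lands in a std::optional that stays empty on unsupported paths. The types and the Match helper below are simplified stand-ins, not the actual CINN definitions (DEFINE_MATCH_METHOD() presumably generates something along these lines).

    #include <iostream>
    #include <optional>
    #include <variant>

    // Simplified stand-ins for the cinn::common arch tags.
    struct X86Arch {};
    struct NVGPUArch {};
    using Arch = std::variant<X86Arch, NVGPUArch>;

    // Overload-set helper for std::visit (C++17).
    template <class... Fs>
    struct Overloaded : Fs... {
      using Fs::operator()...;
    };
    template <class... Fs>
    Overloaded(Fs...) -> Overloaded<Fs...>;

    // Match(arch, lambdas...) mirrors the arch.Match(...) calls above.
    template <class... Fs>
    void Match(const Arch& arch, Fs&&... fs) {
      std::visit(Overloaded{std::forward<Fs>(fs)...}, arch);
    }

    int main() {
      Arch arch = NVGPUArch{};
      std::optional<int> shared_mem_bytes;  // stays empty for unsupported arches
      Match(
          arch,
          [&](X86Arch) { /* host-only target: nothing to compute */ },
          [&](NVGPUArch) { shared_mem_bytes = 4096; });
      if (shared_mem_bytes.has_value()) {
        std::cout << "shared_mem: " << shared_mem_bytes.value() << "\n";
      }
      return 0;
    }
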
paddle/cinn/backends/codegen_device_util.h (renamed from paddle/cinn/backends/codegen_cuda_util.h)
@@ -19,12 +19,14 @@
#include <string>
#include <tuple>
#include <vector>

#ifdef CINN_WITH_CUDA
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#endif
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_mutator.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/runtime/flags.h"

namespace cinn {
namespace backends {
@@ -43,7 +45,7 @@ namespace backends {
* - replace the original kernel function with a Call node and add it to the
* first module, add a device kernel function to the second module.
*/
std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module);
std::tuple<ir::Module, ir::Module> SplitDeviceAndHostModule(ir::Module module);

namespace detail {

Expand All @@ -52,7 +54,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
: host_module_builder(module_name + "_host",
cinn::common::DefaultHostTarget()),
device_module_builder(module_name + "_gpu_device",
cinn::common::DefaultNVGPUTarget()) {}
cinn::common::DefaultDeviceTarget()) {}

std::tuple<ir::Module, ir::Module> operator()(Expr* expr) {
ir::IRMutator<>::Visit(expr, expr);
@@ -109,9 +111,18 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
// shared_mem_bytes Can be calculated after codegen_cuda_dev buffer creation
// however, this make CodeGenCUDA_Dev before spliting the host and device
// module Maybe we could reorder the process.
CodeGenCUDA_Dev codegen_dev(cinn::common::DefaultNVGPUTarget());
codegen_dev.Compile(ir::LoweredFunc(func));
Expr shared_mem_bytes = codegen_dev.GetDynSharedMemOffset();
std::optional<Expr> shared_mem_bytes;
cinn::common::DefaultDeviceTarget().arch.Match(
[&](std::variant<common::UnknownArch,
common::X86Arch,
common::ARMArch>) { CINN_NOT_IMPLEMENTED; },
[&](common::NVGPUArch) {
#ifdef CINN_WITH_CUDA
CodeGenCUDA_Dev codegen_dev(cinn::common::DefaultNVGPUTarget());
codegen_dev.Compile(ir::LoweredFunc(func));
shared_mem_bytes = codegen_dev.GetDynSharedMemOffset();
#endif
});

VLOG(6) << "Add a call node for func->name " << func->name << "\n"
<< "grid_dim: (" << func->cuda_axis_info.grid_dim(0) << ", "
@@ -120,10 +131,20 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
<< "block_dim: (" << func->cuda_axis_info.block_dim(0) << ", "
<< func->cuda_axis_info.block_dim(1) << ", "
<< func->cuda_axis_info.block_dim(2) << "), "
<< "shared_mem: " << shared_mem_bytes;
<< "shared_mem: " << shared_mem_bytes.value();

std::optional<const char*> call_kernel;
cinn::common::DefaultDeviceTarget().arch.Match(
[&](std::variant<common::UnknownArch,
common::X86Arch,
common::ARMArch>) { CINN_NOT_IMPLEMENTED; },
[&](common::NVGPUArch) {
call_kernel = runtime::intrinsic::call_cuda_kernel;
});

auto call_extern_api =
ir::Call::Make(Void(),
runtime::intrinsic::call_cuda_kernel,
call_kernel.value(),
{kernel_ptr,
kernel_args,
kernel_args_num,
@@ -133,7 +154,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
func->cuda_axis_info.block_dim(0), // block_x
func->cuda_axis_info.block_dim(1), // block_y
func->cuda_axis_info.block_dim(2), // block_z
shared_mem_bytes,
shared_mem_bytes.value(),
kernel_stream},
{},
ir::CallType::Extern,
7 changes: 4 additions & 3 deletions paddle/cinn/backends/compiler.cc
@@ -24,7 +24,7 @@
#ifdef CINN_WITH_CUDA
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
#include "paddle/cinn/runtime/cuda/cuda_module.h"
#include "paddle/cinn/runtime/cuda/cuda_util.h"
@@ -246,7 +246,7 @@ std::string Compiler::GetSourceCode(const ir::Module& module) {
[&](common::NVGPUArch) -> std::string {
#ifdef CINN_WITH_CUDA
auto _host_module_device_module_ =
SplitCudaAndHostModule(module); // NOLINT
SplitDeviceAndHostModule(module); // NOLINT
auto& host_module = std::get<0>(_host_module_device_module_);
auto& device_module = std::get<1>(_host_module_device_module_);
CodeGenCUDA_Dev codegen(target_);
@@ -270,7 +270,8 @@ void Compiler::BuildDefault(const Module& module) {
void Compiler::CompileCudaModule(const Module& module,
const std::string& code) {
#ifdef CINN_WITH_CUDA
auto _host_module_device_module_ = SplitCudaAndHostModule(module); // NOLINT
auto _host_module_device_module_ =
SplitDeviceAndHostModule(module); // NOLINT
auto& host_module = std::get<0>(_host_module_device_module_);
auto& device_module = std::get<1>(_host_module_device_module_);
VLOG(3) << "[CUDA] host module:\n" << host_module;
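
The compiler.cc call sites above unpack the (host module, device module) pair returned by the renamed SplitDeviceAndHostModule with std::get<0>/std::get<1>. A tiny stand-alone sketch of that tuple-splitting shape, using stand-in string "modules" instead of ir::Module (names below are illustrative, not CINN's):

    #include <iostream>
    #include <string>
    #include <tuple>

    // Stand-in for SplitDeviceAndHostModule: returns (host module, device module).
    std::tuple<std::string, std::string> SplitDeviceAndHostModuleDemo(
        const std::string& module) {
      return {module + "_host", module + "_gpu_device"};
    }

    int main() {
      // Same unpacking as the std::get<0>/std::get<1> pattern in compiler.cc,
      // written with C++17 structured bindings.
      auto [host_module, device_module] = SplitDeviceAndHostModuleDemo("fn_add");
      std::cout << host_module << "\n" << device_module << "\n";
      return 0;
    }
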
3 changes: 3 additions & 0 deletions paddle/cinn/common/arch.h
@@ -17,6 +17,7 @@
#include <functional>
#include <ostream>
#include <variant>
#include "paddle/common/overloaded.h"

namespace cinn {
namespace common {
@@ -45,6 +46,8 @@ struct Arch final : public ArchBase {
return static_cast<const ArchBase&>(*this);
}

DEFINE_MATCH_METHOD();

bool operator==(const auto& other) const {
return this->index() == other.index();
}
4 changes: 2 additions & 2 deletions paddle/cinn/common/cuda_test_helper.cc
@@ -16,7 +16,7 @@

#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
#include "paddle/cinn/runtime/cuda/cuda_module.h"
#include "paddle/cinn/runtime/cuda/cuda_util.h"
@@ -28,7 +28,7 @@ namespace common {
void CudaModuleTester::Compile(const ir::Module& m,
const std::string& rewrite_cuda_code) {
auto _host_module_device_module_ =
backends::SplitCudaAndHostModule(m); // NOLINT
backends::SplitDeviceAndHostModule(m); // NOLINT
auto& host_module = std::get<0>(_host_module_device_module_);
auto& device_module = std::get<1>(_host_module_device_module_);
CHECK(!host_module.functions().empty());
6 changes: 6 additions & 0 deletions paddle/cinn/common/target.cc
@@ -249,6 +249,12 @@ const Target &DefaultNVGPUTarget() {
return target;
}

const Target &DefaultDeviceTarget() {
#ifdef CINN_WITH_CUDA
return DefaultNVGPUTarget();
#endif
}

int GetMaxThreads() {
// cudaDeviceGetAttribute ( int* value, cudaDeviceAttr attr, int device )
int max_threads = 1;
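
Note that the new DefaultDeviceTarget() above only returns a value on the CINN_WITH_CUDA path. The snippet below is a hedged, self-contained illustration (not CINN code, and not what this PR does) of the same #ifdef-based selection with an explicit host fallback, so builds without CUDA still return something instead of falling off the end of the function; the names are stand-ins.

    #include <iostream>
    #include <string>

    const std::string &DefaultHostTargetName() {
      static const std::string name = "x86_64-host";
      return name;
    }

    const std::string &DefaultNVGPUTargetName() {
      static const std::string name = "nvgpu";
      return name;
    }

    const std::string &DefaultDeviceTargetName() {
    #ifdef CINN_WITH_CUDA
      return DefaultNVGPUTargetName();
    #else
      return DefaultHostTargetName();  // fallback keeps the function well-defined
    #endif
    }

    int main() {
      std::cout << "default device target: " << DefaultDeviceTargetName() << "\n";
      return 0;
    }
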
2 changes: 2 additions & 0 deletions paddle/cinn/common/target.h
@@ -100,6 +100,8 @@ const Target& DefaultHostTarget();

const Target& DefaultNVGPUTarget();

const Target& DefaultDeviceTarget();

const Target& DefaultTarget();

int GetMaxThreads();
2 changes: 1 addition & 1 deletion paddle/cinn/frontend/paddle/model_parser.cc
@@ -19,7 +19,7 @@

#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/cuda_util.h"
#include "paddle/cinn/common/common.h"
#include "paddle/cinn/frontend/paddle/compatible_pb.h"
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/dialect/operator/ir/ops.yaml
@@ -117,7 +117,7 @@
interfaces : paddle::dialect::InferSymbolicShapeInterface

- op : uniform_random
args : (int64_t[] shape, float min, float max, int seed, DataType dtype, int diag_num = 0, int diag_step=0, float diag_val=1.0)
args : (int64_t[] shape, float min, float max, int seed, DataType dtype, int diag_num = 0, int diag_step=0, float diag_val=1.0, Place place={})
output : Tensor(out)
infer_meta :
func : CreateVecShapeInferMeta
@@ -58,7 +58,6 @@ class AddAccuracyCheckPattern
builder.set_insertion_point(fusion_op);

const auto& InsertAccuaryCheckOp = [&](::pir::Operation* op) -> void {
rewriter.SetInsertionPointAfter(fusion_op);
for (size_t i = 0; i < op->num_operands(); ++i) {
rewriter.Build<paddle::dialect::AccuracyCheckOp>(
fusion_op.result(i),
@@ -67,6 +66,7 @@
i);
}
};

const auto& ConvertCinnOpToPdOp = [&](::pir::Operation* op) -> void {
rewriter.SetInsertionPointAfter(fusion_op);
for (size_t i = 0; i < op->num_operands(); ++i) {
@@ -86,6 +86,7 @@ class AddAccuracyCheckPattern
}
auto new_op = op->Clone(ir_mapping, clone_options);
rewriter.Insert(new_op);
rewriter.SetInsertionPointAfter(new_op);
};

for (auto& op : op_list) {
@@ -103,7 +104,7 @@

class AccuarcyCheckPass : public pir::Pass {
public:
AccuarcyCheckPass() : pir::Pass("accuracy_check_pass", /*opt_level=*/4) {}
AccuarcyCheckPass() : pir::Pass("accuracy_check_pass", /*opt_level=*/3) {}

bool Initialize(pir::IrContext* context) override {
pir::RewritePatternSet ps(context);