【Hackathon 6th Fundable Projects 3 No.97】fluid operator dgc #64285

Closed

wants to merge 35 commits into develop from fix_dgc

Changes from all commits (35 commits)
9e21acf
Fix
co63oc May 14, 2024
15b4a40
Fix
co63oc May 14, 2024
f45289c
Fix
co63oc May 14, 2024
7243625
Fix
co63oc May 14, 2024
1abb47a
Fix
co63oc May 14, 2024
4eabdb3
Fix
co63oc May 15, 2024
1ef4d71
Merge branch 'develop' into fix_dgc
co63oc May 15, 2024
f9e069a
ci
co63oc May 15, 2024
ab5fed6
Fix
co63oc May 16, 2024
e4f61b1
Fix
co63oc May 16, 2024
f3bd8c1
Merge branch 'develop' into fix_dgc
co63oc May 16, 2024
4508abf
ci
co63oc May 16, 2024
c197f8f
add index sample infer symbol (#64332)
phlrain May 17, 2024
b70e514
【Hackathon 6th Fundable Projects 3 No.109】Remove fluid operator dropo…
co63oc May 17, 2024
9fc4c56
Dump original pir code to original_programs.py (#64373)
tc20042008 May 17, 2024
f08017e
[Prim]Support mean_grad decompose in vjp (#64346)
Aurelius84 May 17, 2024
270b0a2
【Fix PIR Unittest No.66】refine pir unique_name and set_parameter (#64…
wanghuancoder May 17, 2024
3cf1d63
【Fix PIR Unittest No.74】Fix some test cast in PIR mode (#64350)
wanghuancoder May 17, 2024
4916033
[CINN] Add CreateConvertMEA2FA Pass (#64288)
jiahy0825 May 17, 2024
b4090d9
add dist branch for add_n op. (#64361)
winter-wang May 17, 2024
49332bb
【Fix PIR JIT SaveLoad Unittest No.12,22-23】modify test_se_resnet.py ,…
xiaoguoguo626807 May 17, 2024
bcf0625
【CINN】Modify gpu resource allocation config (#63138)
Vvsmile May 17, 2024
8321099
[CINN] Add more rules to cinn_to_pd_op pass (#64354)
chen2016013 May 17, 2024
806936f
add CUDA_BFLOAT16_AVALIABLE macro definition (#64372)
yuanlehome May 17, 2024
d3b786b
[CINN/Fusion] horizontal support dynamic shape and enhance fusion abi…
2742195759 May 17, 2024
39d680c
[CINN][New Hardware Update] extend SplitCudaAndHostModule (#64345)
DongBaiYue May 17, 2024
aa4847b
【Hackathon 6th Fundable Projects 3 No.102】fluid operator distributed_…
co63oc May 17, 2024
cf2be10
fix pir api 3 (#64319)
wanghuancoder May 17, 2024
70535a9
[XPU] update XHPC to 20240515 (#64326)
houj04 May 17, 2024
1a13d37
[CINN] Add constraint for while op (#64385)
zyfncg May 17, 2024
a35f2dd
Clean paddle/fluid/operators/elementwise/elementwise_xpu.h (#64394)
co63oc May 17, 2024
59aaeb8
【Hackathon 6th Fundable Projects 3 No.2】fluid operator adadelta (#64…
co63oc May 17, 2024
3fff1ec
【Hackathon 6th No.26】Enhance paddle.view functionality -part (#64205)
yinfan98 May 17, 2024
3d9f3f2
[AutoParallel] 3D parallel on MLP with PIR (#64369)
pkuzyc May 17, 2024
b65e881
Fix
co63oc May 18, 2024
13 changes: 13 additions & 0 deletions cmake/cuda.cmake
@@ -173,6 +173,8 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
message(STATUS "Add Define CUDA_BFLOAT16_AVALIABLE")
add_definitions("-DCUDA_BFLOAT16_AVALIABLE")
if(WITH_NV_JETSON)
set(cuda_arch_bin "87")
else()
@@ -183,6 +185,8 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
endif()
endif()
elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
message(STATUS "Add Define CUDA_BFLOAT16_AVALIABLE")
add_definitions("-DCUDA_BFLOAT16_AVALIABLE")
set(cuda_arch_bin "90")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
@@ -196,8 +200,17 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
to get a full wheel package to resolve this warning.
While, this version will still work on local GPU architecture.")
detect_installed_gpus(cuda_arch_bin)
if(${cuda_arch_bin} MATCHES "[ ]*(8\.0|8\.6|8\.9|9\.0)[ ]*")
message(STATUS "Add Define CUDA_BFLOAT16_AVALIABLE")
add_definitions("-DCUDA_BFLOAT16_AVALIABLE")
endif()
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(cuda_arch_bin ${CUDA_ARCH_BIN})

if(${CUDA_ARCH_BIN} MATCHES "[ ]*(80|86|89|90)[ ]*")
message(STATUS "Add Define CUDA_BFLOAT16_AVALIABLE")
add_definitions("-DCUDA_BFLOAT16_AVALIABLE")
endif()
endif()

if(NEW_RELEASE_JIT)
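
The cuda.cmake hunks above only toggle a preprocessor definition: builds targeting Ampere (SM 8.x) or Hopper (SM 9.0), whether selected by name, auto-detected, or listed manually, get -DCUDA_BFLOAT16_AVALIABLE. Below is a minimal, hedged C++ sketch (not part of this PR; the function name and messages are illustrative) of how downstream source code might key off that define.

    // Hypothetical consumer of the CUDA_BFLOAT16_AVALIABLE define added above.
    #include <cstdio>

    bool CudaBfloat16Available() {
    #ifdef CUDA_BFLOAT16_AVALIABLE
      return true;   // set by cmake for SM 8.0/8.6/8.9/9.0 builds
    #else
      return false;  // older architectures: fall back to fp16/fp32 kernels
    #endif
    }

    int main() {
      std::printf("CUDA bfloat16 path: %s\n",
                  CudaBfloat16Available() ? "enabled" : "disabled");
      return 0;
    }
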
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -32,7 +32,7 @@ if(NOT DEFINED XPU_XDNN_BASE_DATE)
set(XPU_XDNN_BASE_DATE "20240327")
endif()
if(NOT DEFINED XPU_XHPC_BASE_DATE)
set(XPU_XHPC_BASE_DATE "20240514")
set(XPU_XHPC_BASE_DATE "20240515")
endif()
set(XPU_XCCL_BASE_VERSION "1.2.0.5")
if(NOT DEFINED XPU_XFT_BASE_VERSION)
5 changes: 3 additions & 2 deletions paddle/cinn/backends/CMakeLists.txt
@@ -13,11 +13,12 @@ gather_srcs(
extern_func_protos.cc
extern_func_jit_register.cc
modular.cc
compiler.cc)
compiler.cc
codegen_device_util.cc)

if(WITH_CUDA)
add_subdirectory(nvrtc)
list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc codegen_cuda_util.cc)
list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc)
endif()

if(WITH_OPENMP)
2 changes: 1 addition & 1 deletion paddle/cinn/backends/codegen_cuda_generate_test.cc
@@ -21,7 +21,7 @@

#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/backends/llvm/execution_engine.h"
#include "paddle/cinn/backends/llvm/simple_jit.h"
2 changes: 1 addition & 1 deletion paddle/cinn/backends/codegen_cuda_host.cc
@@ -18,7 +18,7 @@
#include <string>
#include <unordered_map>

#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/extern_func_emitter_builtin.h"
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/backends/llvm/llvm_util.h"
paddle/cinn/backends/codegen_device_util.cc (renamed from paddle/cinn/backends/codegen_cuda_util.cc)
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"

#include "paddle/cinn/backends/cuda_util.h"
#include "paddle/cinn/common/cas.h"
@@ -22,7 +22,7 @@ PD_DECLARE_bool(cinn_bucket_compile);
namespace cinn {
namespace backends {

std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module) {
std::tuple<ir::Module, ir::Module> SplitDeviceAndHostModule(ir::Module module) {
if (FLAGS_cinn_bucket_compile) {
detail::CollectBucketStrategyHostFunctionVisitor visitor(module->name);
Expr expr(module);
@@ -91,7 +91,16 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
ir::Var kernel_ptr(GenDeviceKernelName(func_node->name, predicate),
type_of<std::string>());

Expr shared_mem_bytes = CalculateSharedMemory(func);
std::optional<Expr> shared_mem_bytes;
cinn::common::DefaultDeviceTarget().arch.Match(
[&](std::variant<common::UnknownArch, common::X86Arch, common::ARMArch>) {
CINN_NOT_IMPLEMENTED;
},
[&](common::NVGPUArch) {
#ifdef CINN_WITH_CUDA
shared_mem_bytes = CalculateSharedMemory(func);
#endif
});

VLOG(6) << "Add a call node for func_node->name " << func_node->name << "\n"
<< "grid_dim: (" << func_node->cuda_axis_info.grid_dim(0) << ", "
@@ -100,10 +109,18 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
<< "block_dim: (" << func_node->cuda_axis_info.block_dim(0) << ", "
<< func_node->cuda_axis_info.block_dim(1) << ", "
<< func_node->cuda_axis_info.block_dim(2) << "), "
<< "shared_mem: " << shared_mem_bytes;
<< "shared_mem: " << shared_mem_bytes.value();
std::optional<const char *> call_kernel;
cinn::common::DefaultDeviceTarget().arch.Match(
[&](std::variant<common::UnknownArch, common::X86Arch, common::ARMArch>) {
CINN_NOT_IMPLEMENTED;
},
[&](common::NVGPUArch) {
call_kernel = runtime::intrinsic::call_cuda_kernel;
});
ir::Expr call_extern_api =
ir::Call::Make(Void(),
runtime::intrinsic::call_cuda_kernel,
call_kernel.value(),
{kernel_ptr,
kernel_args_,
kernel_args_num_,
@@ -113,7 +130,7 @@ void detail::CollectBucketStrategyHostFunctionVisitor::ProcessLoweredFunc(
func_node->cuda_axis_info.block_dim(0), // block_x
func_node->cuda_axis_info.block_dim(1), // block_y
func_node->cuda_axis_info.block_dim(2), // block_z
shared_mem_bytes, // shared_mem
shared_mem_bytes.value(), // shared_mem
kernel_stream_},
{},
ir::CallType::Extern,
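
For readers unfamiliar with the Arch::Match dispatch used in the hunks above, here is a self-contained sketch of the pattern: a std::variant-backed architecture tag is visited with one lambda per architecture, and the result lands in a std::optional that stays empty on unsupported paths. The types and the Match helper below are simplified stand-ins, not the actual CINN definitions (DEFINE_MATCH_METHOD() presumably generates something along these lines).

    #include <iostream>
    #include <optional>
    #include <variant>

    // Simplified stand-ins for the cinn::common arch tags.
    struct X86Arch {};
    struct NVGPUArch {};
    using Arch = std::variant<X86Arch, NVGPUArch>;

    // Overload-set helper for std::visit (C++17).
    template <class... Fs>
    struct Overloaded : Fs... {
      using Fs::operator()...;
    };
    template <class... Fs>
    Overloaded(Fs...) -> Overloaded<Fs...>;

    // Match(arch, lambdas...) mirrors the arch.Match(...) calls above.
    template <class... Fs>
    void Match(const Arch& arch, Fs&&... fs) {
      std::visit(Overloaded{std::forward<Fs>(fs)...}, arch);
    }

    int main() {
      Arch arch = NVGPUArch{};
      std::optional<int> shared_mem_bytes;  // stays empty for unsupported arches
      Match(
          arch,
          [&](X86Arch) { /* host-only target: nothing to compute */ },
          [&](NVGPUArch) { shared_mem_bytes = 4096; });
      if (shared_mem_bytes.has_value()) {
        std::cout << "shared_mem: " << shared_mem_bytes.value() << "\n";
      }
      return 0;
    }
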
paddle/cinn/backends/codegen_device_util.h (renamed from paddle/cinn/backends/codegen_cuda_util.h)
@@ -19,12 +19,14 @@
#include <string>
#include <tuple>
#include <vector>

#ifdef CINN_WITH_CUDA
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#endif
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_mutator.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/runtime/flags.h"

namespace cinn {
namespace backends {
@@ -43,7 +45,7 @@ namespace backends {
* - replace the original kernel function with a Call node and add it to the
* first module, add a device kernel function to the second module.
*/
std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module);
std::tuple<ir::Module, ir::Module> SplitDeviceAndHostModule(ir::Module module);

namespace detail {

Expand All @@ -52,7 +54,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
: host_module_builder(module_name + "_host",
cinn::common::DefaultHostTarget()),
device_module_builder(module_name + "_gpu_device",
cinn::common::DefaultNVGPUTarget()) {}
cinn::common::DefaultDeviceTarget()) {}

std::tuple<ir::Module, ir::Module> operator()(Expr* expr) {
ir::IRMutator<>::Visit(expr, expr);
@@ -109,9 +111,18 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
// shared_mem_bytes Can be calculated after codegen_cuda_dev buffer creation
// however, this make CodeGenCUDA_Dev before spliting the host and device
// module Maybe we could reorder the process.
CodeGenCUDA_Dev codegen_dev(cinn::common::DefaultNVGPUTarget());
codegen_dev.Compile(ir::LoweredFunc(func));
Expr shared_mem_bytes = codegen_dev.GetDynSharedMemOffset();
std::optional<Expr> shared_mem_bytes;
cinn::common::DefaultDeviceTarget().arch.Match(
[&](std::variant<common::UnknownArch,
common::X86Arch,
common::ARMArch>) { CINN_NOT_IMPLEMENTED; },
[&](common::NVGPUArch) {
#ifdef CINN_WITH_CUDA
CodeGenCUDA_Dev codegen_dev(cinn::common::DefaultNVGPUTarget());
codegen_dev.Compile(ir::LoweredFunc(func));
shared_mem_bytes = codegen_dev.GetDynSharedMemOffset();
#endif
});

VLOG(6) << "Add a call node for func->name " << func->name << "\n"
<< "grid_dim: (" << func->cuda_axis_info.grid_dim(0) << ", "
@@ -120,10 +131,20 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
<< "block_dim: (" << func->cuda_axis_info.block_dim(0) << ", "
<< func->cuda_axis_info.block_dim(1) << ", "
<< func->cuda_axis_info.block_dim(2) << "), "
<< "shared_mem: " << shared_mem_bytes;
<< "shared_mem: " << shared_mem_bytes.value();

std::optional<const char*> call_kernel;
cinn::common::DefaultDeviceTarget().arch.Match(
[&](std::variant<common::UnknownArch,
common::X86Arch,
common::ARMArch>) { CINN_NOT_IMPLEMENTED; },
[&](common::NVGPUArch) {
call_kernel = runtime::intrinsic::call_cuda_kernel;
});

auto call_extern_api =
ir::Call::Make(Void(),
runtime::intrinsic::call_cuda_kernel,
call_kernel.value(),
{kernel_ptr,
kernel_args,
kernel_args_num,
@@ -133,7 +154,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
func->cuda_axis_info.block_dim(0), // block_x
func->cuda_axis_info.block_dim(1), // block_y
func->cuda_axis_info.block_dim(2), // block_z
shared_mem_bytes,
shared_mem_bytes.value(),
kernel_stream},
{},
ir::CallType::Extern,
7 changes: 4 additions & 3 deletions paddle/cinn/backends/compiler.cc
@@ -24,7 +24,7 @@
#ifdef CINN_WITH_CUDA
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
#include "paddle/cinn/runtime/cuda/cuda_module.h"
#include "paddle/cinn/runtime/cuda/cuda_util.h"
@@ -246,7 +246,7 @@ std::string Compiler::GetSourceCode(const ir::Module& module) {
[&](common::NVGPUArch) -> std::string {
#ifdef CINN_WITH_CUDA
auto _host_module_device_module_ =
SplitCudaAndHostModule(module); // NOLINT
SplitDeviceAndHostModule(module); // NOLINT
auto& host_module = std::get<0>(_host_module_device_module_);
auto& device_module = std::get<1>(_host_module_device_module_);
CodeGenCUDA_Dev codegen(target_);
@@ -270,7 +270,8 @@ void Compiler::BuildDefault(const Module& module) {
void Compiler::CompileCudaModule(const Module& module,
const std::string& code) {
#ifdef CINN_WITH_CUDA
auto _host_module_device_module_ = SplitCudaAndHostModule(module); // NOLINT
auto _host_module_device_module_ =
SplitDeviceAndHostModule(module); // NOLINT
auto& host_module = std::get<0>(_host_module_device_module_);
auto& device_module = std::get<1>(_host_module_device_module_);
VLOG(3) << "[CUDA] host module:\n" << host_module;
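
The compiler.cc call sites above unpack the (host module, device module) pair returned by the renamed SplitDeviceAndHostModule with std::get<0>/std::get<1>. A tiny stand-alone sketch of that tuple-splitting shape, using stand-in string "modules" instead of ir::Module (names below are illustrative, not CINN's):

    #include <iostream>
    #include <string>
    #include <tuple>

    // Stand-in for SplitDeviceAndHostModule: returns (host module, device module).
    std::tuple<std::string, std::string> SplitDeviceAndHostModuleDemo(
        const std::string& module) {
      return {module + "_host", module + "_gpu_device"};
    }

    int main() {
      // Same unpacking as the std::get<0>/std::get<1> pattern in compiler.cc,
      // written with C++17 structured bindings.
      auto [host_module, device_module] = SplitDeviceAndHostModuleDemo("fn_add");
      std::cout << host_module << "\n" << device_module << "\n";
      return 0;
    }
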
3 changes: 3 additions & 0 deletions paddle/cinn/common/arch.h
@@ -17,6 +17,7 @@
#include <functional>
#include <ostream>
#include <variant>
#include "paddle/common/overloaded.h"

namespace cinn {
namespace common {
@@ -45,6 +46,8 @@ struct Arch final : public ArchBase {
return static_cast<const ArchBase&>(*this);
}

DEFINE_MATCH_METHOD();

bool operator==(const auto& other) const {
return this->index() == other.index();
}
4 changes: 2 additions & 2 deletions paddle/cinn/common/cuda_test_helper.cc
@@ -16,7 +16,7 @@

#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
#include "paddle/cinn/runtime/cuda/cuda_module.h"
#include "paddle/cinn/runtime/cuda/cuda_util.h"
@@ -28,7 +28,7 @@ namespace common {
void CudaModuleTester::Compile(const ir::Module& m,
const std::string& rewrite_cuda_code) {
auto _host_module_device_module_ =
backends::SplitCudaAndHostModule(m); // NOLINT
backends::SplitDeviceAndHostModule(m); // NOLINT
auto& host_module = std::get<0>(_host_module_device_module_);
auto& device_module = std::get<1>(_host_module_device_module_);
CHECK(!host_module.functions().empty());
6 changes: 6 additions & 0 deletions paddle/cinn/common/target.cc
@@ -249,6 +249,12 @@ const Target &DefaultNVGPUTarget() {
return target;
}

const Target &DefaultDeviceTarget() {
#ifdef CINN_WITH_CUDA
return DefaultNVGPUTarget();
#endif
}

int GetMaxThreads() {
// cudaDeviceGetAttribute ( int* value, cudaDeviceAttr attr, int device )
int max_threads = 1;
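
Note that the new DefaultDeviceTarget() above only returns a value on the CINN_WITH_CUDA path. The snippet below is a hedged, self-contained illustration (not CINN code, and not what this PR does) of the same #ifdef-based selection with an explicit host fallback, so builds without CUDA still return something instead of falling off the end of the function; the names are stand-ins.

    #include <iostream>
    #include <string>

    const std::string &DefaultHostTargetName() {
      static const std::string name = "x86_64-host";
      return name;
    }

    const std::string &DefaultNVGPUTargetName() {
      static const std::string name = "nvgpu";
      return name;
    }

    const std::string &DefaultDeviceTargetName() {
    #ifdef CINN_WITH_CUDA
      return DefaultNVGPUTargetName();
    #else
      return DefaultHostTargetName();  // fallback keeps the function well-defined
    #endif
    }

    int main() {
      std::cout << "default device target: " << DefaultDeviceTargetName() << "\n";
      return 0;
    }
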
2 changes: 2 additions & 0 deletions paddle/cinn/common/target.h
@@ -100,6 +100,8 @@ const Target& DefaultHostTarget();

const Target& DefaultNVGPUTarget();

const Target& DefaultDeviceTarget();

const Target& DefaultTarget();

int GetMaxThreads();
2 changes: 1 addition & 1 deletion paddle/cinn/frontend/paddle/model_parser.cc
@@ -19,7 +19,7 @@

#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/codegen_device_util.h"
#include "paddle/cinn/backends/cuda_util.h"
#include "paddle/cinn/common/common.h"
#include "paddle/cinn/frontend/paddle/compatible_pb.h"
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/dialect/operator/ir/ops.yaml
@@ -117,7 +117,7 @@
interfaces : paddle::dialect::InferSymbolicShapeInterface

- op : uniform_random
args : (int64_t[] shape, float min, float max, int seed, DataType dtype, int diag_num = 0, int diag_step=0, float diag_val=1.0)
args : (int64_t[] shape, float min, float max, int seed, DataType dtype, int diag_num = 0, int diag_step=0, float diag_val=1.0, Place place={})
output : Tensor(out)
infer_meta :
func : CreateVecShapeInferMeta
@@ -58,7 +58,6 @@ class AddAccuracyCheckPattern
builder.set_insertion_point(fusion_op);

const auto& InsertAccuaryCheckOp = [&](::pir::Operation* op) -> void {
rewriter.SetInsertionPointAfter(fusion_op);
for (size_t i = 0; i < op->num_operands(); ++i) {
rewriter.Build<paddle::dialect::AccuracyCheckOp>(
fusion_op.result(i),
@@ -67,6 +66,7 @@
i);
}
};

const auto& ConvertCinnOpToPdOp = [&](::pir::Operation* op) -> void {
rewriter.SetInsertionPointAfter(fusion_op);
for (size_t i = 0; i < op->num_operands(); ++i) {
@@ -86,6 +86,7 @@ class AddAccuracyCheckPattern
}
auto new_op = op->Clone(ir_mapping, clone_options);
rewriter.Insert(new_op);
rewriter.SetInsertionPointAfter(new_op);
};

for (auto& op : op_list) {
@@ -103,7 +104,7 @@

class AccuarcyCheckPass : public pir::Pass {
public:
AccuarcyCheckPass() : pir::Pass("accuracy_check_pass", /*opt_level=*/4) {}
AccuarcyCheckPass() : pir::Pass("accuracy_check_pass", /*opt_level=*/3) {}

bool Initialize(pir::IrContext* context) override {
pir::RewritePatternSet ps(context);