From cce2b94df86b9df3a9bfed46ad07fdcd5fa8c894 Mon Sep 17 00:00:00 2001
From: HongyuJia <jiahongyu@baidu.com>
Date: Thu, 2 Mar 2023 14:08:28 +0800
Subject: [PATCH] [GetCurrentCUDAStream] Add C++ API GetCurrentCUDAStream
 (#51027)

* polish code according to #50813

* [getCurrentCUDAStream] Add C++ API getCurrentCUDAStream

* change get->Get

* wrap with macro

* use Get instead of get
---
 paddle/phi/api/include/context_pool.h    | 19 +++++++++++----
 paddle/phi/api/lib/context_pool.cc       | 27 +++++++++++++++++----
 paddle/phi/core/allocator.cc             |  3 ---
 paddle/phi/core/allocator.h              |  1 -
 paddle/phi/tests/api/CMakeLists.txt      |  8 +++++++
 paddle/phi/tests/api/test_allocator.cu   | 30 +++++++++++-------------
 paddle/phi/tests/api/test_cuda_stream.cu | 26 ++++++++++++++++++++
 7 files changed, 86 insertions(+), 28 deletions(-)
 create mode 100644 paddle/phi/tests/api/test_cuda_stream.cu
diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h
index f548696bf9999..7afe17ba8419d 100644
--- a/paddle/phi/api/include/context_pool.h
+++ b/paddle/phi/api/include/context_pool.h
@@ -25,6 +25,8 @@ namespace phi {
 class DeviceContext;
 class CPUContext;
 class GPUContext;
+class Allocator;
+class CUDAStream;
 }  // namespace phi
 
 namespace paddle {
@@ -88,9 +90,18 @@ class PADDLE_API DeviceContextPool {
 }  // namespace experimental
 }  // namespace paddle
 
-namespace phi {
-class Allocator;
+namespace paddle {
 
-PADDLE_API Allocator* GetAllocator(const Place& place);
+/**
+ * Get the Allocator for the passed place.
+ */
+PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place);
 
-}  // namespace phi
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+/**
+ * Get the current CUDA stream of a GPU place; other place types are rejected.
+ */
+PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc
index a17df04183e82..460622698b963 100644
--- a/paddle/phi/api/lib/context_pool.cc
+++ b/paddle/phi/api/lib/context_pool.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/phi/core/allocator.h"
 #include "paddle/phi/core/enforce.h"
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#include "paddle/phi/core/cuda_stream.h"
+#endif
+
 #include "paddle/fluid/platform/init.h"
 
 namespace paddle {
@@ -52,12 +56,27 @@ phi::DeviceContext* DeviceContextPool::GetMutable(const Place& place) {
 }  // namespace experimental
 }  // namespace paddle
 
-namespace phi {
+namespace paddle {
 
-PADDLE_API Allocator* GetAllocator(const Place& place) {
-  const DeviceContext* dev_ctx =
+PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) {
+  const phi::DeviceContext* dev_ctx =
       paddle::experimental::DeviceContextPool::Instance().Get(place);
   return const_cast<phi::Allocator*>(&dev_ctx->GetAllocator());
 }
 
-}  // namespace phi
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) {
+  PADDLE_ENFORCE(place.GetType() == phi::AllocationType::GPU,
+                 phi::errors::InvalidArgument(
+                     "GetCurrentCUDAStream only supports GPUPlace input. "
+                     "However, your input is place=%s",
+                     place));  // rejects every non-GPU place up front
+
+  auto& pool = paddle::experimental::DeviceContextPool::Instance();
+  const phi::GPUContext* dev_ctx =
+      static_cast<const phi::GPUContext*>(pool.Get(place));  // place is GPU
+  return dev_ctx->cuda_stream();  // stream of the pooled context
+}
+#endif
+
+}  // namespace paddle
diff --git a/paddle/phi/core/allocator.cc b/paddle/phi/core/allocator.cc
index 76e5c38c51ae1..4d766d7003f6b 100644
--- a/paddle/phi/core/allocator.cc
+++ b/paddle/phi/core/allocator.cc
@@ -14,7 +14,4 @@ limitations under the License. */
 
 #include "paddle/phi/core/allocator.h"
 
-#include "paddle/phi/api/include/context_pool.h"
-#include "paddle/phi/core/device_context.h"
-
 namespace phi {}  // namespace phi
diff --git a/paddle/phi/core/allocator.h b/paddle/phi/core/allocator.h
index 9595a51ec0316..849fc1548c7ec 100644
--- a/paddle/phi/core/allocator.h
+++ b/paddle/phi/core/allocator.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <functional>
 #include <memory>
 
-#include "paddle/phi/api/include/dll_decl.h"
 #include "paddle/phi/common/place.h"
 
 namespace phi {
diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt
index 7ee6bd692b7e5..d4f1ecaab66fd 100644
--- a/paddle/phi/tests/api/CMakeLists.txt
+++ b/paddle/phi/tests/api/CMakeLists.txt
@@ -9,6 +9,10 @@ if(WITH_GPU)
     test_allocator
     SRCS test_allocator.cu
     DEPS memory place device_context context_pool)
+  nv_test(
+    test_cuda_stream
+    SRCS test_cuda_stream.cu
+    DEPS context_pool)
 elseif(WITH_ROCM)
   hip_test(
     test_phi_tensor
@@ -18,6 +22,10 @@ elseif(WITH_ROCM)
     test_allocator
     SRCS test_allocator.cu
     DEPS memory place device_context context_pool)
+  hip_test(
+    test_cuda_stream
+    SRCS test_cuda_stream.cu
+    DEPS context_pool)
 else()
   cc_test(
     test_phi_tensor
diff --git a/paddle/phi/tests/api/test_allocator.cu b/paddle/phi/tests/api/test_allocator.cu
index 23738d9a5ff42..eafbe84019b5f 100644
--- a/paddle/phi/tests/api/test_allocator.cu
+++ b/paddle/phi/tests/api/test_allocator.cu
@@ -22,8 +22,20 @@ limitations under the License. */
 #include "paddle/phi/core/allocator.h"
 #include "paddle/phi/core/device_context.h"
 
+using paddle::memory::Copy;
+
+template <typename T>
+class Scale {  // unary functor: multiplies its argument by a fixed factor
+ public:
+  explicit Scale(const T& scale) : scale_(scale) {}
+  HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
+
+ private:
+  T scale_;  // factor captured at construction
+};
+
 TEST(Allocator, CPU) {
-  phi::Allocator* allocator = phi::GetAllocator(phi::CPUPlace());
+  phi::Allocator* allocator = paddle::GetAllocator(phi::CPUPlace());
   auto cpu_allocation = allocator->Allocate(sizeof(float) * 4);
   float* cpu_buf = static_cast<float*>(cpu_allocation->ptr());
   ASSERT_NE(cpu_buf, nullptr);
@@ -39,23 +51,10 @@ TEST(Allocator, CPU) {
   }
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-using paddle::memory::Copy;
-
-template <typename T>
-class Scale {
- public:
-  explicit Scale(const T& scale) : scale_(scale) {}
-  HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
-
- private:
-  T scale_;
-};
-
 TEST(Allocator, GPU) {
   phi::GPUPlace gpu0(0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
-  phi::Allocator* allocator = phi::GetAllocator(gpu0);
+  phi::Allocator* allocator = paddle::GetAllocator(gpu0);
   auto gpu_allocation = allocator->Allocate(sizeof(cpu_buf));
   float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
 
@@ -70,4 +69,3 @@ TEST(Allocator, GPU) {
     ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
   }
 }
-#endif
diff --git a/paddle/phi/tests/api/test_cuda_stream.cu b/paddle/phi/tests/api/test_cuda_stream.cu
new file mode 100644
index 0000000000000..698f161c356f1
--- /dev/null
+++ b/paddle/phi/tests/api/test_cuda_stream.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+
+#include "paddle/phi/api/include/context_pool.h"
+#include "paddle/phi/core/cuda_stream.h"
+
+TEST(CUDAStream, GPU) {  // GPU 0 must expose a non-null current stream
+  phi::GPUPlace gpu0(0);
+  phi::CUDAStream* stream = paddle::GetCurrentCUDAStream(gpu0);
+  ASSERT_NE(stream, nullptr);  // fatal: the next line dereferences stream
+  gpuStream_t raw_stream = stream->raw_stream();
+  EXPECT_TRUE(raw_stream != nullptr);
+}