Enhance executor interfaces (#222)

* Support to explicitly set executor's stream
* Support data copy between ARK tensors and external CUDA arrays
* Support non-loop execution mode
* Minor fixes & interface updates
microsoft · Aug 8, 2024 · eb300e6 · eb300e6
1 parent ee0895e
commit eb300e6
Show file tree

Hide file tree

Showing 31 changed files with 697 additions and 325 deletions.
diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
@@ -0,0 +1,192 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/executor.hpp"
+
+#include "gpu/gpu.hpp"
+#include "model/model_json.hpp"
+#include "unittest/unittest_utils.h"
+
+template <bool LoopMode>  // LoopMode is forwarded as the last DefaultExecutor constructor argument.
+ark::unittest::State test_executor() {
+    ark::gpuStream stream;
+    UNITTEST_EQ(
+        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
+        ark::gpuSuccess);
+
+    ark::Model empty;
+    {  // Case 1: one full compile/launch/run/wait/stop cycle with an explicit destroy().
+        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
+        UNITTEST_EQ(executor.device_id(), 0);
+        UNITTEST_EQ(executor.stream(), stream);  // must report the stream passed to the constructor
+
+        executor.compile();
+        executor.launch();
+        executor.run(1);
+        executor.wait();
+        executor.stop();
+        executor.destroy();
+
+        UNITTEST_TRUE(executor.destroyed());
+    }
+    {  // Case 2: launch/run/wait/stop twice on the same executor after a single compile().
+        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
+        executor.compile();
+        executor.launch();
+        executor.run(1);
+        executor.wait();
+        executor.stop();
+
+        executor.launch();  // relaunch after stop() without recompiling
+        executor.run(1);
+        executor.wait();
+        executor.stop();
+
+        executor.destroy();
+    }
+    {  // Case 3: misuse (launch before compile) and redundant calls.
+        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
+        UNITTEST_THROW(executor.launch(), ark::InvalidUsageError);  // compile() has not been called yet
+
+        executor.compile();
+        executor.launch();
+        executor.launch();  // Will be ignored with a warning.
+        executor.run(1);
+        executor.wait();
+        executor.wait();  // nothing to do
+
+        // No explicit stop()/destroy() here: both are expected to happen automatically at scope exit.
+    }
+
+    UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess);
+    return ark::unittest::SUCCESS;
+}
+
+ark::unittest::State test_executor_loop() { return test_executor<true>(); }  // LoopMode = true
+
+ark::unittest::State test_executor_no_loop() { return test_executor<false>(); }  // LoopMode = false
+
+ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
+                                                     ark::Dims stride,
+                                                     ark::Dims offset) {
+    // Allocate a CPU array holding 0, 1, 2, ... as floats (one per tensor element)
+    std::vector<float> host_data(shape.nelems());
+    for (size_t i = 0; i < host_data.size(); ++i) {
+        host_data[i] = static_cast<float>(i);
+    }
+
+    // Allocate a GPU array directly via gpuMalloc, same byte size as the CPU array
+    void *dev_ptr;
+    UNITTEST_EQ(ark::gpuMalloc(&dev_ptr, shape.nelems() * sizeof(float)),
+                ark::gpuSuccess);
+
+    // Create an FP32 ARK tensor with the given shape, stride, and offset
+    ark::Model m;
+    auto tensor = m.tensor(shape, ark::FP32, stride, offset);
+    m.noop(tensor);  // presumably so that the model contains an op using the tensor -- TODO confirm
+
+    ark::DefaultExecutor executor(m, 0);
+    executor.compile();
+    executor.launch();
+    UNITTEST_GT(executor.tensor_address(tensor), 0);  // a non-zero address is expected after launch()
+
+    // Copy data from CPU array to ARK tensor
+    executor.tensor_write(tensor, host_data.data(),
+                          shape.nelems() * sizeof(float));
+
+    // Copy data from ARK tensor to GPU array (stream = nullptr; the trailing `true` presumably marks a device pointer -- confirm)
+    executor.tensor_read(tensor, dev_ptr, shape.nelems() * sizeof(float),
+                         nullptr, true);
+
+    // Check the tensor contents by reading them into a second CPU array, then overwrite that array with -1
+    std::vector<float> dev_data(shape.nelems());
+    executor.tensor_read(tensor, dev_data.data(),
+                         shape.nelems() * sizeof(float));
+    for (size_t i = 0; i < dev_data.size(); ++i) {
+        UNITTEST_EQ(dev_data[i], static_cast<float>(i));
+        dev_data[i] = -1;
+    }
+
+    UNITTEST_EQ(  // Check the GPU array contents too, fetched with a plain device-to-host memcpy.
+        ark::gpuMemcpy(dev_data.data(), dev_ptr, shape.nelems() * sizeof(float),
+                       ark::gpuMemcpyDeviceToHost),
+        ark::gpuSuccess);
+    for (size_t i = 0; i < dev_data.size(); ++i) {
+        UNITTEST_EQ(dev_data[i], static_cast<float>(i));
+        dev_data[i] = -1;
+    }
+
+    // Copy -1s back to GPU array
+    UNITTEST_EQ(
+        ark::gpuMemcpy(dev_ptr, dev_data.data(), shape.nelems() * sizeof(float),
+                       ark::gpuMemcpyHostToDevice),
+        ark::gpuSuccess);
+
+    // Copy data from GPU array to ARK tensor (same trailing `nullptr, true` arguments as the GPU-array read above)
+    executor.tensor_write(tensor, dev_ptr, shape.nelems() * sizeof(float),
+                          nullptr, true);
+
+    // Copy data from ARK tensor to CPU array
+    executor.tensor_read(tensor, host_data.data(),
+                         shape.nelems() * sizeof(float));
+
+    // Check the data: every element must now be -1
+    for (size_t i = 0; i < host_data.size(); ++i) {
+        UNITTEST_EQ(host_data[i], -1);
+    }
+
+    // Provide a stream explicitly for both a read and a write (the copied data is not checked here)
+    ark::gpuStream stream;
+    UNITTEST_EQ(
+        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
+        ark::gpuSuccess);
+    executor.tensor_read(tensor, host_data.data(),
+                         shape.nelems() * sizeof(float), stream);
+    executor.tensor_write(tensor, host_data.data(),
+                          shape.nelems() * sizeof(float), stream);
+    UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess);
+
+    // Invalid copy size: one byte more than shape.nelems() * sizeof(float) must throw
+    UNITTEST_THROW(executor.tensor_read(tensor, host_data.data(),
+                                        shape.nelems() * sizeof(float) + 1),
+                   ark::InvalidUsageError);
+    UNITTEST_THROW(executor.tensor_write(tensor, host_data.data(),
+                                         shape.nelems() * sizeof(float) + 1),
+                   ark::InvalidUsageError);
+
+    executor.stop();
+
+    UNITTEST_EQ(ark::gpuFree(dev_ptr), ark::gpuSuccess);
+    return ark::unittest::SUCCESS;
+}
+
+ark::unittest::State test_executor_tensor_read_write_no_stride() {
+    return test_executor_tensor_read_write({1024}, {}, {});  // shape {1024}; stride and offset left empty
+}
+
+ark::unittest::State test_executor_tensor_read_write_stride_offset() {
+    return test_executor_tensor_read_write({4, 512}, {4, 1024}, {0, 512});  // shape, stride, offset
+}
+
+ark::unittest::State test_executor_invalid() {
+    // Invalid device ID: constructing an Executor with device ID -1 must throw.
+    UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""),
+                   ark::InvalidUsageError);
+
+    // Invalid rank: a plan declaring "Rank" = 1 must be rejected (presumably because this test has no rank 1 -- confirm).
+    ark::PlanJson plan;
+    plan["Rank"] = 1;
+    UNITTEST_THROW(ark::Executor(0, nullptr, "test", plan.dump(), true),
+                   ark::InvalidUsageError);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {  // Runs each executor test case defined in this file, in order.
+    UNITTEST(test_executor_loop);
+    UNITTEST(test_executor_no_loop);
+    UNITTEST(test_executor_tensor_read_write_no_stride);
+    UNITTEST(test_executor_tensor_read_write_stride_offset);
+    UNITTEST(test_executor_invalid);
+    return 0;
+}
diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
@@ -7,7 +7,7 @@
 #include "context_impl.hpp"
 #include "env.h"
 #include "file_io.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu_manager.hpp"
 #include "model/model_json.hpp"
 #include "model/model_node.hpp"
 #include "model/model_op.hpp"

diff --git a/ark/codegen.cpp b/ark/codegen.cpp
@@ -174,7 +174,7 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
         {"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)},
         {"@DEFINITIONS@", definitions_ss.str()},
         {"@BODY@", body_ss.str()},
-        {"@NAME@", name_},
+        {"@NAME@", (name_.empty() ? "" : "_" + name_)},
     };
     code_ = replace(template_code, replacements);
 }

diff --git a/ark/gpu/gpu.h → ark/gpu/gpu.hpp b/ark/gpu/gpu.h → ark/gpu/gpu.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_H_
-#define ARK_GPU_H_
+#ifndef ARK_GPU_HPP_
+#define ARK_GPU_HPP_
 
 #include <functional>
 
@@ -125,6 +125,7 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops,
 // runtime API
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString,
                           hipGetErrorString);
+ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute,
                           hipDeviceGetAttribute);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize,
@@ -183,4 +184,4 @@ ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerSetAttribute, cuPointerSetAttribute,
 
 }  // namespace ark
 
-#endif  // ARK_GPU_H_
+#endif  // ARK_GPU_HPP_
diff --git a/ark/gpu/gpu_compile.cpp b/ark/gpu/gpu_compile.cpp
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "gpu/gpu_compile.h"
+#include "gpu/gpu_compile.hpp"
 
 #include <sys/types.h>
 #include <sys/wait.h>
@@ -22,7 +22,7 @@
 #include "cpu_timer.h"
 #include "env.h"
 #include "file_io.h"
-#include "gpu/gpu_logging.h"
+#include "gpu/gpu_logging.hpp"
 #include "utils/utils_string.hpp"
 
 #define ARK_DEBUG_KERNEL 0

diff --git a/ark/gpu/gpu_compile.h → ark/gpu/gpu_compile.hpp b/ark/gpu/gpu_compile.h → ark/gpu/gpu_compile.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_COMPILE_H_
-#define ARK_GPU_COMPILE_H_
+#ifndef ARK_GPU_COMPILE_HPP_
+#define ARK_GPU_COMPILE_HPP_
 
 #include <string>
 #include <vector>
@@ -16,4 +16,4 @@ const std::string gpu_compile(const std::vector<std::string> &codes,
 
 }  // namespace ark
 
-#endif  // ARK_GPU_COMPILE_H_
+#endif  // ARK_GPU_COMPILE_HPP_
diff --git a/ark/gpu/gpu_event.cpp b/ark/gpu/gpu_event.cpp
@@ -1,11 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "gpu/gpu_event.h"
+#include "gpu/gpu_event.hpp"
 
-#include "gpu/gpu.h"
-#include "gpu/gpu_logging.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu_logging.hpp"
+#include "gpu/gpu_manager.hpp"
 
 namespace ark {
 class GpuEvent::Impl {
@@ -15,7 +14,7 @@ class GpuEvent::Impl {
     Impl(const Impl&) = delete;
     Impl& operator=(const Impl&) = delete;
 
-    void record(std::shared_ptr<GpuStream> stream);
+    void record(gpuStream stream);
     float elapsed_msec(const GpuEvent& other) const;
 
    private:
@@ -32,8 +31,8 @@ GpuEvent::Impl::Impl(bool disable_timing) {
 
 GpuEvent::Impl::~Impl() { GLOG(gpuEventDestroy(event_)); }
 
-void GpuEvent::Impl::record(std::shared_ptr<GpuStream> stream) {
-    GLOG(gpuEventRecord(event_, stream->get()));
+void GpuEvent::Impl::record(gpuStream stream) {
+    GLOG(gpuEventRecord(event_, stream));
 }
 
 float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
@@ -45,9 +44,7 @@ float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
 GpuEvent::GpuEvent(bool disable_timing)
     : pimpl_(std::make_shared<Impl>(disable_timing)) {}
 
-void GpuEvent::record(std::shared_ptr<GpuStream> stream) {
-    pimpl_->record(stream);
-}
+void GpuEvent::record(gpuStream stream) { pimpl_->record(stream); }
 
 float GpuEvent::elapsed_msec(const GpuEvent& other) const {
     return pimpl_->elapsed_msec(other);

diff --git a/ark/gpu/gpu_event.h → ark/gpu/gpu_event.hpp b/ark/gpu/gpu_event.h → ark/gpu/gpu_event.hpp
@@ -1,11 +1,13 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_EVENT_H_
-#define ARK_GPU_EVENT_H_
+#ifndef ARK_GPU_EVENT_HPP_
+#define ARK_GPU_EVENT_HPP_
 
 #include <memory>
 
+#include "gpu/gpu.hpp"
+
 namespace ark {
 
 class GpuStream;
@@ -17,7 +19,7 @@ class GpuEvent {
     GpuEvent(const GpuEvent &) = delete;
     GpuEvent &operator=(const GpuEvent &) = delete;
 
-    void record(std::shared_ptr<GpuStream> stream);
+    void record(gpuStream stream);
     float elapsed_msec(const GpuEvent &other) const;
 
    protected:
@@ -31,4 +33,4 @@ class GpuEvent {
 };
 }  // namespace ark
 
-#endif  // ARK_GPU_EVENT_H_
+#endif  // ARK_GPU_EVENT_HPP_