Skip to content

Commit

Permalink
Enhance executor interfaces (#222)
Browse files Browse the repository at this point in the history
* Support to explicitly set executor's stream
* Support data copy between ARK tensors and external CUDA arrays
* Support non-loop execution mode
* Minor fixes & interface updates
  • Loading branch information
chhwang authored Aug 8, 2024
1 parent ee0895e commit eb300e6
Show file tree
Hide file tree
Showing 31 changed files with 697 additions and 325 deletions.
413 changes: 268 additions & 145 deletions ark/api/executor.cpp

Large diffs are not rendered by default.

192 changes: 192 additions & 0 deletions ark/api/executor_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "ark/executor.hpp"

#include "gpu/gpu.hpp"
#include "model/model_json.hpp"
#include "unittest/unittest_utils.h"

// Exercises the executor lifecycle (compile / launch / run / wait / stop /
// destroy) on an empty model, using an explicitly provided GPU stream.
// `LoopMode` is forwarded as the last constructor argument of
// `ark::DefaultExecutor`, so the same scenarios run in both execution modes.
template <bool LoopMode>
ark::unittest::State test_executor() {
    // Stream created with the non-blocking flag; it is handed to every
    // executor constructed below and destroyed at the end of the test.
    ark::gpuStream test_stream;
    UNITTEST_EQ(
        ark::gpuStreamCreateWithFlags(&test_stream, ark::gpuStreamNonBlocking),
        ark::gpuSuccess);

    ark::Model empty_model;

    // One full launch -> run(1) -> wait -> stop cycle on a compiled executor.
    const auto run_one_cycle = [](ark::DefaultExecutor &exec) {
        exec.launch();
        exec.run(1);
        exec.wait();
        exec.stop();
    };

    // Scenario 1: accessors report the constructor arguments, and an explicit
    // destroy() is reflected by destroyed().
    {
        ark::DefaultExecutor exec(empty_model, 0, test_stream, {}, "test",
                                  LoopMode);
        UNITTEST_EQ(exec.device_id(), 0);
        UNITTEST_EQ(exec.stream(), test_stream);

        exec.compile();
        run_one_cycle(exec);
        exec.destroy();

        UNITTEST_TRUE(exec.destroyed());
    }

    // Scenario 2: a stopped executor can be launched and run again without
    // recompiling.
    {
        ark::DefaultExecutor exec(empty_model, 0, test_stream, {}, "test",
                                  LoopMode);
        exec.compile();
        run_one_cycle(exec);
        run_one_cycle(exec);
        exec.destroy();
    }

    // Scenario 3: misuse and redundant calls.
    {
        ark::DefaultExecutor exec(empty_model, 0, test_stream, {}, "test",
                                  LoopMode);
        // Launching before compile() must be rejected.
        UNITTEST_THROW(exec.launch(), ark::InvalidUsageError);

        exec.compile();
        exec.launch();
        exec.launch();  // Will be ignored with a warning.
        exec.run(1);
        exec.wait();
        exec.wait();  // nothing to do

        // No explicit stop()/destroy() here: the executor is expected to stop
        // and destroy itself automatically when it goes out of scope.
    }

    UNITTEST_EQ(ark::gpuStreamDestroy(test_stream), ark::gpuSuccess);
    return ark::unittest::SUCCESS;
}

// Runs the executor lifecycle test with the LoopMode template argument set to
// true.
ark::unittest::State test_executor_loop() {
    constexpr bool kLoopMode = true;
    return test_executor<kLoopMode>();
}

// Runs the executor lifecycle test with the LoopMode template argument set to
// false.
ark::unittest::State test_executor_no_loop() {
    constexpr bool kLoopMode = false;
    return test_executor<kLoopMode>();
}

// Round-trips data between a host array, an ARK tensor, and an externally
// allocated GPU array through Executor::tensor_read / tensor_write.
//
// @param shape  Shape of the FP32 tensor under test; also determines the
//               number of elements in the host and GPU scratch arrays.
// @param stride Stride passed to Model::tensor().
// @param offset Offset passed to Model::tensor().
// @return ark::unittest::SUCCESS when every check passes.
ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
                                                     ark::Dims stride,
                                                     ark::Dims offset) {
    // Size in bytes of the tensor payload; every copy below moves this much.
    const size_t num_bytes = shape.nelems() * sizeof(float);

    // Host-side source data: 0, 1, 2, ...
    std::vector<float> host_buf(shape.nelems());
    for (size_t idx = 0; idx < host_buf.size(); ++idx) {
        host_buf[idx] = static_cast<float>(idx);
    }

    // GPU array allocated outside of ARK.
    void *gpu_buf;
    UNITTEST_EQ(ark::gpuMalloc(&gpu_buf, num_bytes), ark::gpuSuccess);

    // Model with a single FP32 tensor; the noop keeps the tensor referenced by
    // an operator.
    ark::Model model;
    auto tensor = model.tensor(shape, ark::FP32, stride, offset);
    model.noop(tensor);

    ark::DefaultExecutor executor(model, 0);
    executor.compile();
    executor.launch();
    UNITTEST_GT(executor.tensor_address(tensor), 0);

    // Host array -> ARK tensor.
    executor.tensor_write(tensor, host_buf.data(), num_bytes);

    // ARK tensor -> external GPU array.
    // NOTE(review): the trailing `true` presumably tells the executor that the
    // raw pointer is device memory; confirm against Executor::tensor_read.
    executor.tensor_read(tensor, gpu_buf, num_bytes, nullptr, true);

    // Read the tensor back to the host and verify it; each verified element is
    // overwritten with -1 so that the next check cannot pass on stale data.
    std::vector<float> readback(shape.nelems());
    executor.tensor_read(tensor, readback.data(), num_bytes);
    for (size_t idx = 0; idx < readback.size(); ++idx) {
        UNITTEST_EQ(readback[idx], static_cast<float>(idx));
        readback[idx] = -1;
    }

    // Verify the external GPU array received the same values.
    UNITTEST_EQ(ark::gpuMemcpy(readback.data(), gpu_buf, num_bytes,
                               ark::gpuMemcpyDeviceToHost),
                ark::gpuSuccess);
    for (size_t idx = 0; idx < readback.size(); ++idx) {
        UNITTEST_EQ(readback[idx], static_cast<float>(idx));
        readback[idx] = -1;
    }

    // Fill the external GPU array with -1s.
    UNITTEST_EQ(ark::gpuMemcpy(gpu_buf, readback.data(), num_bytes,
                               ark::gpuMemcpyHostToDevice),
                ark::gpuSuccess);

    // External GPU array -> ARK tensor.
    executor.tensor_write(tensor, gpu_buf, num_bytes, nullptr, true);

    // ARK tensor -> host array; every element must now be -1.
    executor.tensor_read(tensor, host_buf.data(), num_bytes);
    for (size_t idx = 0; idx < host_buf.size(); ++idx) {
        UNITTEST_EQ(host_buf[idx], -1);
    }

    // Same read/write entry points with a caller-provided stream.
    ark::gpuStream copy_stream;
    UNITTEST_EQ(
        ark::gpuStreamCreateWithFlags(&copy_stream, ark::gpuStreamNonBlocking),
        ark::gpuSuccess);
    executor.tensor_read(tensor, host_buf.data(), num_bytes, copy_stream);
    executor.tensor_write(tensor, host_buf.data(), num_bytes, copy_stream);
    UNITTEST_EQ(ark::gpuStreamDestroy(copy_stream), ark::gpuSuccess);

    // A byte count that does not match the tensor size must be rejected.
    UNITTEST_THROW(executor.tensor_read(tensor, host_buf.data(), num_bytes + 1),
                   ark::InvalidUsageError);
    UNITTEST_THROW(
        executor.tensor_write(tensor, host_buf.data(), num_bytes + 1),
        ark::InvalidUsageError);

    executor.stop();

    UNITTEST_EQ(ark::gpuFree(gpu_buf), ark::gpuSuccess);
    return ark::unittest::SUCCESS;
}

// Tensor read/write round trip on a 1-D tensor of 1024 elements, with empty
// stride and offset arguments.
ark::unittest::State test_executor_tensor_read_write_no_stride() {
    const ark::Dims shape = {1024};
    const ark::Dims stride = {};
    const ark::Dims offset = {};
    return test_executor_tensor_read_write(shape, stride, offset);
}

// Tensor read/write round trip on a {4, 512} tensor created with stride
// {4, 1024} and offset {0, 512}.
ark::unittest::State test_executor_tensor_read_write_stride_offset() {
    const ark::Dims shape = {4, 512};
    const ark::Dims stride = {4, 1024};
    const ark::Dims offset = {0, 512};
    return test_executor_tensor_read_write(shape, stride, offset);
}

// Checks that constructing an ark::Executor with invalid arguments raises
// ark::InvalidUsageError.
ark::unittest::State test_executor_invalid() {
    // A negative device ID is rejected.
    UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""),
                   ark::InvalidUsageError);

    // A plan whose "Rank" field is 1 is rejected as an invalid rank.
    ark::PlanJson bad_rank_plan;
    bad_rank_plan["Rank"] = 1;
    const auto bad_rank_plan_str = bad_rank_plan.dump();
    UNITTEST_THROW(ark::Executor(0, nullptr, "test", bad_rank_plan_str, true),
                   ark::InvalidUsageError);

    return ark::unittest::SUCCESS;
}

// Test entry point: passes each executor test case to the UNITTEST macro, in
// order, and returns 0 once all invocations have completed.
// NOTE(review): how UNITTEST reports or aborts on a failing case is defined in
// unittest/unittest_utils.h and is not visible here.
int main() {
// Executor lifecycle, in both values of the LoopMode template argument.
UNITTEST(test_executor_loop);
UNITTEST(test_executor_no_loop);
// Tensor read/write, without and with explicit stride/offset.
UNITTEST(test_executor_tensor_read_write_no_stride);
UNITTEST(test_executor_tensor_read_write_stride_offset);
// Constructor argument validation.
UNITTEST(test_executor_invalid);
return 0;
}
2 changes: 1 addition & 1 deletion ark/api/planner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "context_impl.hpp"
#include "env.h"
#include "file_io.h"
#include "gpu/gpu_manager.h"
#include "gpu/gpu_manager.hpp"
#include "model/model_json.hpp"
#include "model/model_node.hpp"
#include "model/model_op.hpp"
Expand Down
2 changes: 1 addition & 1 deletion ark/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
{"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)},
{"@DEFINITIONS@", definitions_ss.str()},
{"@BODY@", body_ss.str()},
{"@NAME@", name_},
{"@NAME@", (name_.empty() ? "" : "_" + name_)},
};
code_ = replace(template_code, replacements);
}
Expand Down
7 changes: 4 additions & 3 deletions ark/gpu/gpu.h → ark/gpu/gpu.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef ARK_GPU_H_
#define ARK_GPU_H_
#ifndef ARK_GPU_HPP_
#define ARK_GPU_HPP_

#include <functional>

Expand Down Expand Up @@ -125,6 +125,7 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops,
// runtime API
ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString,
hipGetErrorString);
ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError);
ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute,
hipDeviceGetAttribute);
ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize,
Expand Down Expand Up @@ -183,4 +184,4 @@ ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerSetAttribute, cuPointerSetAttribute,

} // namespace ark

#endif // ARK_GPU_H_
#endif // ARK_GPU_HPP_
4 changes: 2 additions & 2 deletions ark/gpu/gpu_compile.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "gpu/gpu_compile.h"
#include "gpu/gpu_compile.hpp"

#include <sys/types.h>
#include <sys/wait.h>
Expand All @@ -22,7 +22,7 @@
#include "cpu_timer.h"
#include "env.h"
#include "file_io.h"
#include "gpu/gpu_logging.h"
#include "gpu/gpu_logging.hpp"
#include "utils/utils_string.hpp"

#define ARK_DEBUG_KERNEL 0
Expand Down
6 changes: 3 additions & 3 deletions ark/gpu/gpu_compile.h → ark/gpu/gpu_compile.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef ARK_GPU_COMPILE_H_
#define ARK_GPU_COMPILE_H_
#ifndef ARK_GPU_COMPILE_HPP_
#define ARK_GPU_COMPILE_HPP_

#include <string>
#include <vector>
Expand All @@ -16,4 +16,4 @@ const std::string gpu_compile(const std::vector<std::string> &codes,

} // namespace ark

#endif // ARK_GPU_COMPILE_H_
#endif // ARK_GPU_COMPILE_HPP_
17 changes: 7 additions & 10 deletions ark/gpu/gpu_event.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "gpu/gpu_event.h"
#include "gpu/gpu_event.hpp"

#include "gpu/gpu.h"
#include "gpu/gpu_logging.h"
#include "gpu/gpu_manager.h"
#include "gpu/gpu_logging.hpp"
#include "gpu/gpu_manager.hpp"

namespace ark {
class GpuEvent::Impl {
Expand All @@ -15,7 +14,7 @@ class GpuEvent::Impl {
Impl(const Impl&) = delete;
Impl& operator=(const Impl&) = delete;

void record(std::shared_ptr<GpuStream> stream);
void record(gpuStream stream);
float elapsed_msec(const GpuEvent& other) const;

private:
Expand All @@ -32,8 +31,8 @@ GpuEvent::Impl::Impl(bool disable_timing) {

GpuEvent::Impl::~Impl() { GLOG(gpuEventDestroy(event_)); }

void GpuEvent::Impl::record(std::shared_ptr<GpuStream> stream) {
GLOG(gpuEventRecord(event_, stream->get()));
void GpuEvent::Impl::record(gpuStream stream) {
GLOG(gpuEventRecord(event_, stream));
}

float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
Expand All @@ -45,9 +44,7 @@ float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
GpuEvent::GpuEvent(bool disable_timing)
: pimpl_(std::make_shared<Impl>(disable_timing)) {}

void GpuEvent::record(std::shared_ptr<GpuStream> stream) {
pimpl_->record(stream);
}
void GpuEvent::record(gpuStream stream) { pimpl_->record(stream); }

float GpuEvent::elapsed_msec(const GpuEvent& other) const {
return pimpl_->elapsed_msec(other);
Expand Down
10 changes: 6 additions & 4 deletions ark/gpu/gpu_event.h → ark/gpu/gpu_event.hpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef ARK_GPU_EVENT_H_
#define ARK_GPU_EVENT_H_
#ifndef ARK_GPU_EVENT_HPP_
#define ARK_GPU_EVENT_HPP_

#include <memory>

#include "gpu/gpu.hpp"

namespace ark {

class GpuStream;
Expand All @@ -17,7 +19,7 @@ class GpuEvent {
GpuEvent(const GpuEvent &) = delete;
GpuEvent &operator=(const GpuEvent &) = delete;

void record(std::shared_ptr<GpuStream> stream);
void record(gpuStream stream);
float elapsed_msec(const GpuEvent &other) const;

protected:
Expand All @@ -31,4 +33,4 @@ class GpuEvent {
};
} // namespace ark

#endif // ARK_GPU_EVENT_H_
#endif // ARK_GPU_EVENT_HPP_
Loading

0 comments on commit eb300e6

Please sign in to comment.