Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance executor interfaces #222

Merged
merged 33 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
b18bdb2
Enhance executor interfaces
chhwang Jul 10, 2024
2154690
Update lint workflow
chhwang Jul 10, 2024
705f9f8
Optimize operators
chhwang Jul 11, 2024
a3114e4
fix
chhwang Jul 11, 2024
6a0cd1a
Merge branch 'main' into chhwang/fix-kernels
chhwang Jul 11, 2024
d5bc723
Merge branch 'main' into chhwang/executor-interface
chhwang Jul 11, 2024
f01afaa
Merge branch 'chhwang/fix-kernels' into chhwang/executor-interface
chhwang Jul 11, 2024
67e3b26
update test
chhwang Jul 11, 2024
f293383
Merge branch 'chhwang/fix-kernels' into chhwang/executor-interface
chhwang Jul 11, 2024
ce1959e
Add `loop_mode` argument
chhwang Jul 12, 2024
55755bb
do not force noinline
chhwang Jul 14, 2024
cfabc2f
Merge branch 'chhwang/fix-kernels' into chhwang/executor-interface
chhwang Jul 14, 2024
a7a5d46
Fix CK tile indexing
chhwang Jul 14, 2024
14f3a97
Merge branch 'chhwang/fix-kernels' into chhwang/executor-interface
chhwang Jul 14, 2024
c72b907
Merge branch 'main' into chhwang/executor-interface
chhwang Jul 15, 2024
8192b90
Merge branch 'main' into chhwang/executor-interface
chhwang Aug 6, 2024
ffd43fd
Merge branch 'main' into chhwang/executor-interface
chhwang Aug 6, 2024
78ac0da
fix merge
chhwang Aug 6, 2024
afb518a
fix merge
chhwang Aug 6, 2024
01fa569
Merge branch 'main' into chhwang/executor-interface
chhwang Aug 6, 2024
762bf4a
fix merge
chhwang Aug 6, 2024
f654f0b
add a python method
chhwang Aug 6, 2024
498926c
submodule update
chhwang Aug 6, 2024
3e331a2
fix
chhwang Aug 6, 2024
10bfa75
Rename CMake environments
chhwang Aug 6, 2024
745fa1b
Merge branch 'chhwang/cmake' into chhwang/executor-interface
chhwang Aug 6, 2024
3dda44a
A few fixes & improved coverage
chhwang Aug 6, 2024
e96669b
Merge branch 'main' into chhwang/executor-interface
chhwang Aug 6, 2024
28b8395
Update runtime.py
chhwang Aug 6, 2024
11901c4
fix
chhwang Aug 7, 2024
44ecf1d
minor changes & improve coverage
chhwang Aug 7, 2024
550ed45
Merge branch 'main' into chhwang/executor-interface
chhwang Aug 7, 2024
d0b0432
debugging ut
chhwang Aug 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
413 changes: 268 additions & 145 deletions ark/api/executor.cpp

Large diffs are not rendered by default.

192 changes: 192 additions & 0 deletions ark/api/executor_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "ark/executor.hpp"

#include "gpu/gpu.hpp"
#include "model/model_json.hpp"
#include "unittest/unittest_utils.h"

// Lifecycle test for `ark::DefaultExecutor` built on an empty model and a
// caller-provided non-blocking GPU stream.
//
// `LoopMode` is forwarded as the last constructor argument of the executor.
// Three scenarios are exercised, each in its own scope:
//   1. a single compile/launch/run/wait/stop round followed by an explicit
//      destroy, after which `destroyed()` must be true;
//   2. two launch/run/wait/stop rounds on the same compiled executor;
//   3. misuse: launching before compiling must throw
//      `ark::InvalidUsageError`; redundant `launch()`/`wait()` calls are then
//      issued and the executor is left to go out of scope without an
//      explicit stop.
template <bool LoopMode>
ark::unittest::State test_executor() {
    ark::gpuStream stream;
    UNITTEST_EQ(
        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
        ark::gpuSuccess);

    // One full launch -> run(1) -> wait -> stop round on a compiled executor.
    auto run_round = [](ark::DefaultExecutor &target) {
        target.launch();
        target.run(1);
        target.wait();
        target.stop();
    };

    ark::Model empty;

    // Scenario 1: single round, then explicit destroy.
    {
        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
        UNITTEST_EQ(executor.device_id(), 0);
        UNITTEST_EQ(executor.stream(), stream);

        executor.compile();
        run_round(executor);
        executor.destroy();

        UNITTEST_TRUE(executor.destroyed());
    }

    // Scenario 2: the executor is relaunched after being stopped.
    {
        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
        executor.compile();

        run_round(executor);
        run_round(executor);

        executor.destroy();
    }

    // Scenario 3: invalid and redundant calls.
    {
        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);

        // Launching before compile() is a usage error.
        UNITTEST_THROW(executor.launch(), ark::InvalidUsageError);

        executor.compile();
        executor.launch();
        executor.launch();  // Will be ignored with a warning.
        executor.run(1);
        executor.wait();
        executor.wait();  // nothing to do

        // Stop & destroy automatically.
    }

    UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess);
    return ark::unittest::SUCCESS;
}

// Runs the executor lifecycle test with the `LoopMode` template argument set
// to true.
ark::unittest::State test_executor_loop() {
    constexpr bool kLoopMode = true;
    return test_executor<kLoopMode>();
}

// Runs the executor lifecycle test with the `LoopMode` template argument set
// to false.
ark::unittest::State test_executor_no_loop() {
    constexpr bool kLoopMode = false;
    return test_executor<kLoopMode>();
}

// Round-trips FP32 data through an ARK tensor using `tensor_write()` and
// `tensor_read()` of `ark::DefaultExecutor`.
//
// `shape`, `stride` and `offset` are forwarded to `ark::Model::tensor()`.
// The test covers, in order:
//   - host -> tensor write, then tensor -> GPU buffer and tensor -> host reads;
//   - GPU buffer -> tensor write, then tensor -> host read;
//   - read/write with an explicitly provided stream;
//   - rejection of a byte count that is one larger than the tensor data.
ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
                                                     ark::Dims stride,
                                                     ark::Dims offset) {
    // Byte size of the tensor payload; every copy below uses this size.
    const auto num_bytes = shape.nelems() * sizeof(float);

    // Host-side source buffer holding 0, 1, 2, ...
    std::vector<float> host_data(shape.nelems());
    for (size_t idx = 0; idx < host_data.size(); ++idx) {
        host_data[idx] = static_cast<float>(idx);
    }

    // Plain GPU buffer that is not managed by ARK.
    void *dev_ptr;
    UNITTEST_EQ(ark::gpuMalloc(&dev_ptr, num_bytes), ark::gpuSuccess);

    // Model with a single tensor; the no-op references the tensor.
    ark::Model m;
    auto tensor = m.tensor(shape, ark::FP32, stride, offset);
    m.noop(tensor);

    ark::DefaultExecutor executor(m, 0);
    executor.compile();
    executor.launch();
    UNITTEST_GT(executor.tensor_address(tensor), 0);

    // Host array -> ARK tensor.
    executor.tensor_write(tensor, host_data.data(), num_bytes);

    // ARK tensor -> GPU array (the trailing `true` selects the GPU-pointer
    // variant of the copy).
    executor.tensor_read(tensor, dev_ptr, num_bytes, nullptr, true);

    // ARK tensor -> fresh host array; verify, then poison each element with -1.
    std::vector<float> dev_data(shape.nelems());
    executor.tensor_read(tensor, dev_data.data(), num_bytes);
    for (size_t idx = 0; idx < dev_data.size(); ++idx) {
        UNITTEST_EQ(dev_data[idx], static_cast<float>(idx));
        dev_data[idx] = -1;
    }

    // GPU array -> host array; verify the GPU copy as well, then poison again.
    UNITTEST_EQ(ark::gpuMemcpy(dev_data.data(), dev_ptr, num_bytes,
                               ark::gpuMemcpyDeviceToHost),
                ark::gpuSuccess);
    for (size_t idx = 0; idx < dev_data.size(); ++idx) {
        UNITTEST_EQ(dev_data[idx], static_cast<float>(idx));
        dev_data[idx] = -1;
    }

    // Push the -1s into the GPU array.
    UNITTEST_EQ(ark::gpuMemcpy(dev_ptr, dev_data.data(), num_bytes,
                               ark::gpuMemcpyHostToDevice),
                ark::gpuSuccess);

    // GPU array -> ARK tensor.
    executor.tensor_write(tensor, dev_ptr, num_bytes, nullptr, true);

    // ARK tensor -> host array; every element must now be -1.
    executor.tensor_read(tensor, host_data.data(), num_bytes);
    for (size_t idx = 0; idx < host_data.size(); ++idx) {
        UNITTEST_EQ(host_data[idx], -1);
    }

    // Same read/write calls, this time with a caller-provided stream.
    ark::gpuStream stream;
    UNITTEST_EQ(
        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
        ark::gpuSuccess);
    executor.tensor_read(tensor, host_data.data(), num_bytes, stream);
    executor.tensor_write(tensor, host_data.data(), num_bytes, stream);
    UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess);

    // A byte count that does not match the tensor size must be rejected.
    UNITTEST_THROW(
        executor.tensor_read(tensor, host_data.data(), num_bytes + 1),
        ark::InvalidUsageError);
    UNITTEST_THROW(
        executor.tensor_write(tensor, host_data.data(), num_bytes + 1),
        ark::InvalidUsageError);

    executor.stop();

    UNITTEST_EQ(ark::gpuFree(dev_ptr), ark::gpuSuccess);
    return ark::unittest::SUCCESS;
}

// Tensor read/write round trip on a 1-D tensor of 1024 elements with empty
// stride and offset arguments.
ark::unittest::State test_executor_tensor_read_write_no_stride() {
    const ark::Dims shape = {1024};
    const ark::Dims stride = {};
    const ark::Dims offset = {};
    return test_executor_tensor_read_write(shape, stride, offset);
}

// Tensor read/write round trip on a {4, 512} tensor created with stride
// {4, 1024} and offset {0, 512}.
ark::unittest::State test_executor_tensor_read_write_stride_offset() {
    const ark::Dims shape = {4, 512};
    const ark::Dims stride = {4, 1024};
    const ark::Dims offset = {0, 512};
    return test_executor_tensor_read_write(shape, stride, offset);
}

// Checks that constructing `ark::Executor` with invalid arguments throws
// `ark::InvalidUsageError`.
ark::unittest::State test_executor_invalid() {
    // Case 1: invalid device ID.
    UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""),
                   ark::InvalidUsageError);

    // Case 2: invalid rank. The plan's "Rank" field is set to 1 and the
    // serialized plan is handed to an executor created for device 0.
    ark::PlanJson rank_one_plan;
    rank_one_plan["Rank"] = 1;
    const auto plan_text = rank_one_plan.dump();
    UNITTEST_THROW(ark::Executor(0, nullptr, "test", plan_text, true),
                   ark::InvalidUsageError);

    return ark::unittest::SUCCESS;
}

// Test entry point: runs every executor test case in this file through the
// UNITTEST macro, in the order listed, and returns 0 afterwards.
int main() {
// Executor lifecycle with the LoopMode template argument set to true / false.
UNITTEST(test_executor_loop);
UNITTEST(test_executor_no_loop);
// Tensor read/write round trips, without and with stride/offset.
UNITTEST(test_executor_tensor_read_write_no_stride);
UNITTEST(test_executor_tensor_read_write_stride_offset);
// Constructor argument validation.
UNITTEST(test_executor_invalid);
return 0;
}
2 changes: 1 addition & 1 deletion ark/api/planner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "context_impl.hpp"
#include "env.h"
#include "file_io.h"
#include "gpu/gpu_manager.h"
#include "gpu/gpu_manager.hpp"
#include "model/model_json.hpp"
#include "model/model_node.hpp"
#include "model/model_op.hpp"
Expand Down
2 changes: 1 addition & 1 deletion ark/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
{"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)},
{"@DEFINITIONS@", definitions_ss.str()},
{"@BODY@", body_ss.str()},
{"@NAME@", name_},
{"@NAME@", (name_.empty() ? "" : "_" + name_)},
};
code_ = replace(template_code, replacements);
}
Expand Down
7 changes: 4 additions & 3 deletions ark/gpu/gpu.h → ark/gpu/gpu.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef ARK_GPU_H_
#define ARK_GPU_H_
#ifndef ARK_GPU_HPP_
#define ARK_GPU_HPP_

#include <functional>

Expand Down Expand Up @@ -125,6 +125,7 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops,
// runtime API
ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString,
hipGetErrorString);
ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError);
ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute,
hipDeviceGetAttribute);
ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize,
Expand Down Expand Up @@ -183,4 +184,4 @@ ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerSetAttribute, cuPointerSetAttribute,

} // namespace ark

#endif // ARK_GPU_H_
#endif // ARK_GPU_HPP_
4 changes: 2 additions & 2 deletions ark/gpu/gpu_compile.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "gpu/gpu_compile.h"
#include "gpu/gpu_compile.hpp"

#include <sys/types.h>
#include <sys/wait.h>
Expand All @@ -22,7 +22,7 @@
#include "cpu_timer.h"
#include "env.h"
#include "file_io.h"
#include "gpu/gpu_logging.h"
#include "gpu/gpu_logging.hpp"
#include "utils/utils_string.hpp"

#define ARK_DEBUG_KERNEL 0
Expand Down
6 changes: 3 additions & 3 deletions ark/gpu/gpu_compile.h → ark/gpu/gpu_compile.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef ARK_GPU_COMPILE_H_
#define ARK_GPU_COMPILE_H_
#ifndef ARK_GPU_COMPILE_HPP_
#define ARK_GPU_COMPILE_HPP_

#include <string>
#include <vector>
Expand All @@ -16,4 +16,4 @@ const std::string gpu_compile(const std::vector<std::string> &codes,

} // namespace ark

#endif // ARK_GPU_COMPILE_H_
#endif // ARK_GPU_COMPILE_HPP_
17 changes: 7 additions & 10 deletions ark/gpu/gpu_event.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "gpu/gpu_event.h"
#include "gpu/gpu_event.hpp"

#include "gpu/gpu.h"
#include "gpu/gpu_logging.h"
#include "gpu/gpu_manager.h"
#include "gpu/gpu_logging.hpp"
#include "gpu/gpu_manager.hpp"

namespace ark {
class GpuEvent::Impl {
Expand All @@ -15,7 +14,7 @@ class GpuEvent::Impl {
Impl(const Impl&) = delete;
Impl& operator=(const Impl&) = delete;

void record(std::shared_ptr<GpuStream> stream);
void record(gpuStream stream);
float elapsed_msec(const GpuEvent& other) const;

private:
Expand All @@ -32,8 +31,8 @@ GpuEvent::Impl::Impl(bool disable_timing) {

GpuEvent::Impl::~Impl() { GLOG(gpuEventDestroy(event_)); }

void GpuEvent::Impl::record(std::shared_ptr<GpuStream> stream) {
GLOG(gpuEventRecord(event_, stream->get()));
void GpuEvent::Impl::record(gpuStream stream) {
GLOG(gpuEventRecord(event_, stream));
}

float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
Expand All @@ -45,9 +44,7 @@ float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
GpuEvent::GpuEvent(bool disable_timing)
: pimpl_(std::make_shared<Impl>(disable_timing)) {}

void GpuEvent::record(std::shared_ptr<GpuStream> stream) {
pimpl_->record(stream);
}
void GpuEvent::record(gpuStream stream) { pimpl_->record(stream); }

float GpuEvent::elapsed_msec(const GpuEvent& other) const {
return pimpl_->elapsed_msec(other);
Expand Down
10 changes: 6 additions & 4 deletions ark/gpu/gpu_event.h → ark/gpu/gpu_event.hpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef ARK_GPU_EVENT_H_
#define ARK_GPU_EVENT_H_
#ifndef ARK_GPU_EVENT_HPP_
#define ARK_GPU_EVENT_HPP_

#include <memory>

#include "gpu/gpu.hpp"

namespace ark {

class GpuStream;
Expand All @@ -17,7 +19,7 @@ class GpuEvent {
GpuEvent(const GpuEvent &) = delete;
GpuEvent &operator=(const GpuEvent &) = delete;

void record(std::shared_ptr<GpuStream> stream);
void record(gpuStream stream);
float elapsed_msec(const GpuEvent &other) const;

protected:
Expand All @@ -31,4 +33,4 @@ class GpuEvent {
};
} // namespace ark

#endif // ARK_GPU_EVENT_H_
#endif // ARK_GPU_EVENT_HPP_
Loading
Loading