Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for multiple OpenCL platforms #1345

Merged
merged 10 commits into from
Jul 9, 2018
3 changes: 3 additions & 0 deletions cmake/config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ set(USE_CUDA OFF)
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)

# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)

# Whether enable OpenCL runtime
set(USE_OPENCL OFF)

Expand Down
12 changes: 12 additions & 0 deletions cmake/modules/OpenCL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ if(OpenCL_FOUND)
include_directories(${OpenCL_INCLUDE_DIRS})
endif(OpenCL_FOUND)

# Xilinx SDAccel support is layered on top of the OpenCL runtime.
if(USE_SDACCEL)
  message(STATUS "Build with SDAccel support")
  # Pick up the SDAccel-specific runtime sources.
  file(GLOB RUNTIME_SDACCEL_SRCS src/runtime/opencl/sdaccel/*.cc)
  list(APPEND RUNTIME_SRCS ${RUNTIME_SDACCEL_SRCS})
  # SDAccel requires the OpenCL runtime; force it on if it was left off.
  if(NOT USE_OPENCL)
    message(STATUS "Enable OpenCL support required for SDAccel")
    set(USE_OPENCL ON)
  endif()
else()
  # Compile the stub SDAccelModuleCreate (returns a source-only module)
  # so codegen that targets SDAccel still links when it is disabled.
  list(APPEND COMPILER_SRCS src/codegen/opt/build_sdaccel_off.cc)
endif(USE_SDACCEL)

if(USE_OPENCL)
find_package(OpenCL REQUIRED)
message(STATUS "Build with OpenCL support")
Expand Down
1 change: 1 addition & 0 deletions include/tvm/runtime/c_runtime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ typedef int64_t tvm_index_t;

/*! \brief Extension device types in TVM */
typedef enum {
kDLSDAccel = 6,
kDLVulkan = 7,
kOpenGL = 11,
// Extension DRAM type, used for quickly test extension device
Expand Down
3 changes: 2 additions & 1 deletion python/tvm/_ffi/runtime_ctypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class TVMContext(ctypes.Structure):
1 : 'cpu',
2 : 'gpu',
4 : 'opencl',
6 : 'sdaccel',
7 : 'vulkan',
8 : 'metal',
9 : 'vpi',
Expand All @@ -111,7 +112,7 @@ class TVMContext(ctypes.Structure):
'nvptx': 2,
'cl': 4,
'opencl': 4,
'sdaccel': 4,
'sdaccel': 6,
'vulkan': 7,
'metal': 8,
'vpi': 9,
Expand Down
52 changes: 1 addition & 51 deletions python/tvm/contrib/sdaccel.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,10 @@
"""Utility for Interacting with SDAccel Tools"""
import subprocess
import os
import re
from . import util
from ..api import register_func


def _vhls_to_opencl(code):
"""Convert source code from Vivado HLS to OpenCL."""
out = ''
for line in code.split('\n'):
if re.match(r'#include', line):
# OpenCL doesn't support include.
continue
if re.match(r'#pragma', line):
# Remove Vivado HLS specific pragmas.
continue

if re.match(r'extern "C"', line):
line = re.sub(r'^extern "C"', "__kernel", line)
# Add __global to pointer parameters.
line = re.sub(r'(\w+)\s*\*', r"__global \1*", line)

out += line + '\n'

return out


def _fake_compile_vhls(code):
    """Fake-compile Vivado HLS code for SDAccel.

    Translates the Vivado HLS code to OpenCL and builds it for a GPU
    device, producing a program binary that can stand in for an xclbin
    during testing.

    Parameters
    ----------
    code : str
        The Vivado HLS code.

    Return
    ------
    binary : bytearray
        The program binary which can be passed to clCreateProgramWithBinary
    """
    try:
        import pyopencl as cl
    except ImportError:
        raise ImportError('PyOpenCL is required for testing SDAccel backend.')
    opencl_source = _vhls_to_opencl(code)
    gpu_context = cl.Context(dev_type=cl.device_type.GPU)
    built_program = cl.Program(gpu_context, opencl_source).build()
    return bytearray(built_program.binaries[0])


@register_func("tvm_callback_sdaccel_compile")
def compile_vhls(code, kernel):
"""Compile Vivado HLS code for SDAccel.
Expand Down Expand Up @@ -87,9 +39,7 @@ def compile_vhls(code, kernel):
platform = os.environ.get("XCL_PLATFORM", os.environ.get("AWS_PLATFORM"))

if platform is None:
# If we don't have the Xilinx toolchain, create a program binary for
# GPU and use it for testing.
return _fake_compile_vhls(code)
raise RuntimeError("No Xlinx device specified.")

# build xo
args = [xocc, "-c", "-t", target, "--platform", platform, "-o", tmp_xo, "-k", kernel] + \
Expand Down
4 changes: 2 additions & 2 deletions src/codegen/codegen_vhls.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include <string>
#include "./codegen_vhls.h"
#include "./build_common.h"
#include "../runtime/opencl/opencl_module.h"
#include "../runtime/opencl/sdaccel/sdaccel_module.h"

namespace tvm {
namespace codegen {
Expand Down Expand Up @@ -91,7 +91,7 @@ runtime::Module BuildSDAccel(Array<LoweredFunc> funcs) {
} else {
LOG(FATAL) << "Cannot compile Vivado HLS code.";
}
return OpenCLModuleCreate(xclbin, "xclbin", ExtractFuncInfo(funcs), code);
return SDAccelModuleCreate(xclbin, "xclbin", ExtractFuncInfo(funcs), code);
}

TVM_REGISTER_API("codegen.build_sdaccel")
Expand Down
21 changes: 21 additions & 0 deletions src/codegen/opt/build_sdaccel_off.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*!
 *  Copyright (c) 2018 by Contributors
 *  Optional module when build sdaccel is switched to off
 */
#include "../codegen_source_base.h"
#include "../../runtime/opencl/opencl_module.h"

namespace tvm {
namespace runtime {

/*!
 * \brief Fallback for SDAccelModuleCreate, compiled in when the SDAccel
 *  runtime is disabled (USE_SDACCEL=OFF).
 *
 * Returns a source-only module carrying the generated code instead of a
 * runnable module, so codegen.build_sdaccel still succeeds.
 *
 * \param data The compiled binary data (presumably xclbin contents —
 *             see the SDAccel codegen caller).
 * \param fmt The format of \p data.
 * \param fmap Map from function names to function metadata.
 * \param source The generated kernel source code.
 * \return A device source module wrapping the inputs.
 */
Module SDAccelModuleCreate(
    std::string data,
    std::string fmt,
    std::unordered_map<std::string, FunctionInfo> fmap,
    std::string source) {
  // This stub is selected when USE_SDACCEL is OFF (see cmake/modules/
  // OpenCL.cmake), independent of the OpenCL setting — so the message
  // names SDAccel, not OpenCL.
  LOG(WARNING) << "SDAccel runtime not enabled, return a source module...";
  return codegen::DeviceSourceModuleCreate(data, fmt, fmap, "sdaccel");
}

}  // namespace runtime
}  // namespace tvm
6 changes: 5 additions & 1 deletion src/pass/verify_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class MemoryAccessVerifier final : protected IRVisitor {

/// Interface to perform memory access verification
void Run() {
  // Memory-access verification only applies to device functions
  // (GPU or FPGA targets); host functions are exempt.
  if (!IsGPUDevice(dev_type_) && !IsFPGADevice(dev_type_)) return;
  IRVisitor::Visit(func_->body);
}

Expand Down Expand Up @@ -143,6 +143,10 @@ class MemoryAccessVerifier final : protected IRVisitor {
kDLVulkan == dev_type || kDLMetal == dev_type ||
kDLROCM == dev_type || kOpenGL == dev_type;
}
/// Check if a given DLDeviceType/TVMDeviceExtType value denotes FPGA device.
static bool IsFPGADevice(int dev_type) {
  // SDAccel is currently the only FPGA device type.
  return dev_type == kDLSDAccel;
}

private:
/// Status of visitor
Expand Down
1 change: 1 addition & 0 deletions src/runtime/c_runtime_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ inline std::string DeviceName(int type) {
case kDLCPU: return "cpu";
case kDLGPU: return "gpu";
case kDLOpenCL: return "opencl";
case kDLSDAccel: return "sdaccel";
case kDLVulkan: return "vulkan";
case kDLMetal: return "metal";
case kDLVPI: return "vpi";
Expand Down
107 changes: 101 additions & 6 deletions src/runtime/opencl/opencl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
#include <string>
#include <vector>
#include "../workspace_pool.h"
#include "../pack_args.h"
#include "../thread_storage_scope.h"
#include "../meta_data.h"
#include "../file_util.h"

namespace tvm {
namespace runtime {
Expand Down Expand Up @@ -97,17 +101,23 @@ inline const char* CLGetErrorString(cl_int error) {
OPENCL_CHECK_ERROR(e); \
}

class OpenCLThreadEntry;

/*!
* \brief Process global OpenCL workspace.
*/
class OpenCLWorkspace final : public DeviceAPI {
class OpenCLWorkspace : public DeviceAPI {
public:
// global platform id
cl_platform_id platform_id;
// global platform name
std::string platform_name;
// global context of this process
cl_context context{nullptr};
// whether the workspace it initialized.
bool initialized_{false};
// the device type
std::string device_type;
// the devices
std::vector<cl_device_id> devices;
// the queues
Expand All @@ -128,10 +138,17 @@ class OpenCLWorkspace final : public DeviceAPI {
}
}
// Initialize the device.
void Init();
void Init(const std::vector<std::string>& device_types, const std::string& platform_name = "");
virtual void Init() {
Init({"gpu", "cpu"});
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only initialize GPU, to be consistent with the existing setting

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This initializes only GPU basically, and falls back to CPU only when GPU is not available. The behavior is consistent with the original one.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main problem is that the current opencl schedules assumes GPU, so they may not work when falling back to CPUs

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I'm fine with removing the fallback feature in this PR.

}
// Check whether the context is OpenCL or not.
virtual bool IsOpenCLDevice(TVMContext ctx) {
return ctx.device_type == kDLOpenCL;
}
// get the queue of the context
cl_command_queue GetQueue(TVMContext ctx) {
CHECK_EQ(ctx.device_type, kDLOpenCL);
CHECK(IsOpenCLDevice(ctx));
this->Init();
CHECK(ctx.device_id >= 0 && static_cast<size_t>(ctx.device_id) < queues.size())
<< "Invalid OpenCL device_id=" << ctx.device_id;
Expand All @@ -157,6 +174,12 @@ class OpenCLWorkspace final : public DeviceAPI {
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
void FreeWorkspace(TVMContext ctx, void* data) final;

/*!
* \brief Get the thread local ThreadEntry
*/
virtual OpenCLThreadEntry* GetThreadEntry();

// get the global workspace
static const std::shared_ptr<OpenCLWorkspace>& Global();
};
Expand All @@ -179,15 +202,87 @@ class OpenCLThreadEntry {
/*! \brief workspace pool */
WorkspacePool pool;
// constructor
OpenCLThreadEntry()
: pool(kDLOpenCL, OpenCLWorkspace::Global()) {
OpenCLThreadEntry(DLDeviceType device_type, std::shared_ptr<DeviceAPI> device)
: pool(device_type, device) {
context.device_id = 0;
context.device_type = kDLOpenCL;
context.device_type = device_type;
}
OpenCLThreadEntry()
: OpenCLThreadEntry(kDLOpenCL, OpenCLWorkspace::Global()) {}

// get the global workspace
static OpenCLThreadEntry* ThreadLocal();
};
} // namespace cl

// Module to support thread-safe multi-device execution.
// OpenCL runtime is a bit tricky because clSetKernelArg is not thread-safe
// To make the call thread-safe, we create a thread-local kernel table
// and lazily install new kernels into the kernel table when the kernel is called.
// The kernels are recycled when the module get destructed.
class OpenCLModuleNode : public ModuleNode {
 public:
  // Kernel table reference entry: identifies one kernel slot in the
  // thread-local kernel table, with a version for lazy re-installation.
  struct KTRefEntry {
    size_t kernel_id;
    size_t version;
  };
  /*!
   * \brief Construct the module from serialized program data.
   * \param data The binary program data.
   * \param fmt The format of the binary data.
   * \param fmap Map from function names to function metadata.
   * \param source The OpenCL source code, if available.
   */
  explicit OpenCLModuleNode(std::string data,
                            std::string fmt,
                            std::unordered_map<std::string, FunctionInfo> fmap,
                            std::string source)
      : data_(data), fmt_(fmt), fmap_(fmap), source_(source) {}
  // destructor; recycles the kernels created by this module
  ~OpenCLModuleNode();

  /*!
   * \brief Get the global workspace
   */
  virtual std::shared_ptr<cl::OpenCLWorkspace> GetGlobalWorkspace();

  /*! \brief The type key of this module. */
  virtual const char* type_key() const;

  PackedFunc GetFunction(
      const std::string& name,
      const std::shared_ptr<ModuleNode>& sptr_to_self) final;
  void SaveToFile(const std::string& file_name,
                  const std::string& format) final;
  void SaveToBinary(dmlc::Stream* stream) final;
  std::string GetSource(const std::string& format) final;
  // Initialize the programs
  void Init();
  // install a new kernel to thread local entry (clSetKernelArg is not
  // thread-safe, hence the per-thread kernel table)
  cl_kernel InstallKernel(cl::OpenCLWorkspace* w,
                          cl::OpenCLThreadEntry* t,
                          const std::string& func_name,
                          const KTRefEntry& e);

 protected:
  // The workspace, need to keep reference to use it in destructor.
  // In case of static destruction order problem.
  std::shared_ptr<cl::OpenCLWorkspace> workspace_;
  // the binary data
  std::string data_;

 private:
  // The format
  std::string fmt_;
  // function information table.
  std::unordered_map<std::string, FunctionInfo> fmap_;
  // Module local mutex
  std::mutex build_lock_;
  // The OpenCL source.
  std::string source_;
  // the OpenCL program object
  cl_program program_{nullptr};
  // build info
  std::vector<bool> device_built_flag_;
  // kernel id cache
  std::unordered_map<std::string, KTRefEntry> kid_map_;
  // kernels built so far; recycled when the module is destructed.
  std::vector<cl_kernel> kernels_;
};

} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
Loading