diff --git a/Makefile b/Makefile index 514ffa665491..0cee2e36ed15 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,7 @@ endif export LDFLAGS = -pthread -lm export CFLAGS = -std=c++11 -Wall -O2\ -Iinclude -Idmlc-core/include -IHalideIR/src -fPIC +export FRAMEWORKS= ifneq ($(ADD_CFLAGS), NONE) CFLAGS += $(ADD_CFLAGS) @@ -43,6 +44,20 @@ else CFLAGS += -DTVM_CUDA_RUNTIME=0 endif + +ifeq ($(USE_OPENCL), 1) + CFLAGS += -DTVM_OPENCL_RUNTIME=1 + UNAME_S := $(shell uname -s) + ifeq ($(UNAME_S), Darwin) + FRAMEWORKS += -framework OpenCL + else + LDFLAGS += -lOpenCL + endif +else + CFLAGS += -DTVM_OPENCL_RUNTIME=0 +endif + + include tests/cpp/unittest.mk test: $(TEST) @@ -59,7 +74,7 @@ lib/libtvm.a: $(ALL_DEP) lib/libtvm.so: $(ALL_DEP) @mkdir -p $(@D) - $(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) + $(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) $(LIB_HALIDE_IR): LIBHALIDEIR diff --git a/include/tvm/c_runtime_api.h b/include/tvm/c_runtime_api.h index 7be198b122ea..1a21adc41cd6 100644 --- a/include/tvm/c_runtime_api.h +++ b/include/tvm/c_runtime_api.h @@ -150,6 +150,23 @@ typedef TVMArray* TVMArrayHandle; */ TVM_DLL const char *TVMGetLastError(void); +/*! + * \brief Initialize certain type of devices, this may + * not be necessary for all device types. But is needed for OpenCL. + * + * \param dev_mask The device mask of device type to be initialized + * \param option_keys Additional option keys to pass. + * \param option_vals Additional option values to pass + * \param num_options Number of options to be passed into it. + * \param out_code 1: success, 0: already initialized + * \return Whether the function is successful. + */ +TVM_DLL int TVMDeviceInit(int dev_mask, + const char** option_keys, + const char** option_vals, + int num_options, + int *out_code); + /*! * \brief Whether the specified context is enabled. 
* diff --git a/make/config.mk b/make/config.mk index 955bc6c8cb3c..26530827ea0e 100644 --- a/make/config.mk +++ b/make/config.mk @@ -37,6 +37,9 @@ ADD_CFLAGS = # whether use CUDA during compile USE_CUDA = 1 +# whether use OpenCL during compile +USE_OPENCL = 0 + # add the path to CUDA library to link and compile flag # if you have already add them to environment variable, leave it as NONE # USE_CUDA_PATH = /usr/local/cuda diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 00729bcf6a85..b3a376de3d9f 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -12,7 +12,7 @@ from . import schedule from . import ndarray as nd -from .ndarray import cpu, gpu, opencl +from .ndarray import cpu, gpu, opencl, init_opencl from ._base import TVMError from .function import * diff --git a/python/tvm/_ctypes/_runtime_api.py b/python/tvm/_ctypes/_runtime_api.py index d12f727f6f8a..5b3b81904465 100644 --- a/python/tvm/_ctypes/_runtime_api.py +++ b/python/tvm/_ctypes/_runtime_api.py @@ -7,7 +7,7 @@ import numpy as np from .._base import _LIB -from .._base import c_array +from .._base import c_array, c_str from .._base import check_call @@ -182,6 +182,30 @@ def sync(ctx): check_call(_LIB.TVMSynchronize(ctx, None)) +def init_opencl(**kwargs): + """Initialize the opencl with the options. 
+ + Parameters + ---------- + kwargs : dict + The options + """ + keys = [] + vals = [] + for k, v in kwargs.items(): + keys.append(c_str(k)) + vals.append(c_str(v)) + dev_mask = ctypes.c_int(4) + out_code = ctypes.c_int() + check_call(_LIB.TVMDeviceInit( + dev_mask, + c_array(ctypes.c_char_p, keys), + c_array(ctypes.c_char_p, vals), + ctypes.c_int(len(keys)), + ctypes.byref(out_code))) + return out_code.value != 0 + + class NDArrayBase(object): """A simple Device/CPU Array object in runtime.""" __slots__ = ["handle"] diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index fc74c28fde20..eafc065b1683 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -9,6 +9,7 @@ from ._ctypes._runtime_api import TVMContext, TVMDataType, NDArrayBase from ._ctypes._runtime_api import cpu, gpu, opencl, empty, sync from ._ctypes._runtime_api import _init_runtime_module +from ._ctypes._runtime_api import init_opencl class NDArray(NDArrayBase): diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py index a8ecb97bf27b..b46c5866082f 100644 --- a/python/tvm/schedule.py +++ b/python/tvm/schedule.py @@ -24,7 +24,7 @@ def __getitem__(self, k): k = k.op if not isinstance(k, _tensor.Operation): raise ValueError("Expect schedule key to be Tensor or Operation") - if not k in self.stage_map: + if k not in self.stage_map: raise ValueError("Cannot find the operation %s in schedule" % (str(k))) return self.stage_map[k] diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index efa355a3b3a1..e68790b583cf 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -64,6 +64,23 @@ inline size_t GetDataAlignment(TVMArray* arr) { using namespace tvm::runtime; +int TVMDeviceInit(int dev_mask, + const char** option_keys, + const char** option_vals, + int num_options, + int* out_code) { + API_BEGIN(); + *out_code = 1; + switch (dev_mask) { + case kOpenCL: { + *out_code = DeviceInit(option_keys, option_vals, num_options); + break; + } + 
default: break; + } + API_END(); +} + int TVMContextEnabled(TVMContext ctx, int* out_enabled) { API_BEGIN(); diff --git a/src/runtime/device_api.h b/src/runtime/device_api.h index b74b41ae245b..c2b163624786 100644 --- a/src/runtime/device_api.h +++ b/src/runtime/device_api.h @@ -1,6 +1,6 @@ /*! * Copyright (c) 2016 by Contributors - * \file device_api.hx + * \file device_api.h * \brief Device specific API */ #ifndef TVM_RUNTIME_DEVICE_API_H_ @@ -11,6 +11,21 @@ namespace tvm { namespace runtime { +/*! + * \brief Initialize the device. + * \param option_keys Additional option keys to pass. + * \param option_vals Additional option values to pass + * \param num_options Number of options to be passed into it. + * \return true if success, false if already initialized + * \tparam mask The device mask. + */ +template<TVMDeviceMask mask> +inline bool DeviceInit(const char** option_keys, + const char** option_vals, + int num_options) { + return true; +} + /*! * \brief Whether ctx is enabled. * \param ctx The device context to perform operation. @@ -93,7 +108,8 @@ inline void StreamSync(TVMContext ctx, TVMStreamHandle stream); } // namespace runtime } // namespace tvm -#include "./device_api_gpu.h" #include "./device_api_cpu.h" +#include "./device_api_gpu.h" +#include "./device_api_opencl.h" #endif // TVM_RUNTIME_DEVICE_API_H_ diff --git a/src/runtime/device_api_gpu.h b/src/runtime/device_api_gpu.h index 970450657b2e..b18a95dcb0a6 100644 --- a/src/runtime/device_api_gpu.h +++ b/src/runtime/device_api_gpu.h @@ -1,6 +1,6 @@ /*! * Copyright (c) 2016 by Contributors - * \file ctxice_api_gpu.h + * \file device_api_gpu.h * \brief GPU specific API */ #ifndef TVM_RUNTIME_DEVICE_API_GPU_H_ @@ -14,15 +14,6 @@ namespace tvm { namespace runtime { -/*! - * \brief Check CUDA error. - * \param msg Message to print if an error occured. - */ -#define CHECK_CUDA_ERROR(msg) \ - { \ - cudaError_t e = cudaGetLastError(); \ - CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \ - } /*! 
* \brief Protected CUDA call. diff --git a/src/runtime/device_api_opencl.h b/src/runtime/device_api_opencl.h new file mode 100644 index 000000000000..257262beb0d7 --- /dev/null +++ b/src/runtime/device_api_opencl.h @@ -0,0 +1,310 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file device_api_opencl.h + * \brief OpenCL specific API + */ +#ifndef TVM_RUNTIME_DEVICE_API_OPENCL_H_ +#define TVM_RUNTIME_DEVICE_API_OPENCL_H_ + +#if TVM_OPENCL_RUNTIME + +#ifdef __APPLE__ +#include <OpenCL/opencl.h> +#else +#include <CL/cl.h> +#endif + +#include <string> +#include <vector> +#include <mutex> + + +namespace tvm { +namespace runtime { +namespace cl { + +static_assert(sizeof(cl_mem) == sizeof(void*), + "Required to store cl_mem inside void*"); + +inline const char* CLGetErrorString(cl_int error) { + switch (error) { + case CL_SUCCESS: return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: return "CL_MAP_FAILURE"; + case CL_INVALID_VALUE: return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: 
return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL"; + default: return "Unknown OpenCL error code"; + } +} + +/*! + * \brief Protected OpenCL call + * \param func Expression to call. 
+ */ +#define OPENCL_CHECK_ERROR(e) \ + { \ + CHECK(e == CL_SUCCESS) \ + << "OpenCL Error, code=" << e << ": " << cl::CLGetErrorString(e); \ + } + +#define OPENCL_CALL(func) \ + { \ + cl_int e = (func); \ + OPENCL_CHECK_ERROR(e); \ + } + +// Process local opencl workspace +class OpenCLWorkspace { + public: + // global platform id + cl_platform_id platform_id; + // global context of this process + cl_context context{nullptr}; + // the devices + std::vector<cl_device_id> devices; + // the queues + std::vector<cl_command_queue> queues; + // the mutex for initialization + std::mutex mu; + // destructor + ~OpenCLWorkspace() { + if (context != nullptr) { + OPENCL_CALL(clReleaseContext(context)); + } + } + // whether the workspace is initialized. + inline bool initialized() const { + return context != nullptr; + } + // get the queue of the context + cl_command_queue GetQueue(TVMContext ctx) const { + CHECK_EQ(ctx.dev_mask, kOpenCL); + CHECK(initialized()) + << "The OpenCL is not initialized"; + CHECK(ctx.dev_id >= 0 && static_cast<size_t>(ctx.dev_id) < queues.size()) + << "Invalid OpenCL dev_id=" << ctx.dev_id; + return queues[ctx.dev_id]; + } + // get the global workspace + static OpenCLWorkspace* Global() { + static OpenCLWorkspace inst; + return &inst; + } +}; + +inline std::string GetPlatformInfo( + cl_platform_id pid, cl_platform_info param_name) { + size_t ret_size; + OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size)); + std::string ret; + ret.resize(ret_size); + OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr)); + return ret; +} + +inline std::string GetDeviceInfo( + cl_device_id pid, cl_device_info param_name) { + size_t ret_size; + OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size)); + std::string ret; + ret.resize(ret_size); + OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, &ret[0], nullptr)); + return ret; +} + +inline std::vector<cl_platform_id> GetPlatformIDs() { + cl_uint ret_size; + OPENCL_CALL(clGetPlatformIDs(0, nullptr, &ret_size)); + 
std::vector<cl_platform_id> ret; + ret.resize(ret_size); + OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr)); + return ret; +} + +inline std::vector<cl_device_id> GetDeviceIDs( + cl_platform_id pid, std::string device_type) { + cl_device_type dtype = CL_DEVICE_TYPE_ALL; + if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU; + if (device_type == "gpu") dtype = CL_DEVICE_TYPE_GPU; + if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR; + cl_uint ret_size; + OPENCL_CALL(clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size)); + std::vector<cl_device_id> ret; + ret.resize(ret_size); + OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, &ret[0], nullptr)); + return ret; +} + +inline bool MatchPlatformInfo( + cl_platform_id pid, + cl_platform_info param_name, + std::string value) { + if (value.length() == 0) return true; + std::string param_value = GetPlatformInfo(pid, param_name); + return param_value.find(value) != std::string::npos; +} + +} // namespace cl + +template<> +inline bool DeviceInit<kOpenCL>(const char** option_keys, + const char** option_vals, + int num_options) { + cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global(); + std::lock_guard<std::mutex> lock(w->mu); + if (w->initialized()) return false; + // matching conditions + std::string platform_name, device_type; + for (int i = 0; i < num_options; ++i) { + std::string key = option_keys[i]; + std::string val = option_vals[i]; + if (key == "platform_name") { + platform_name = val; + } else if (key == "device_type") { + device_type = val; + } else { + LOG(FATAL) << "unknown DeviceInit option " << key; + } + } + // matched platforms + std::vector<cl_platform_id> platform_matched; + for (cl_platform_id pid : cl::GetPlatformIDs()) { + bool matched = true; + if (!cl::MatchPlatformInfo(pid, CL_PLATFORM_NAME, platform_name)) matched = false; + if (matched) platform_matched.push_back(pid); + } + if (platform_matched.size() == 0) { + LOG(FATAL) << "No OpenCL platform matched given existing options ..."; + } + if (platform_matched.size() > 1) { + LOG(WARNING) << "Multiple OpenCL 
platforms matched, use the first one ... "; + } + w->platform_id = platform_matched[0]; + + LOG(INFO) << "Initialize OpenCL platform \'" + << cl::GetPlatformInfo(w->platform_id, CL_PLATFORM_NAME) << '\''; + std::vector<cl_device_id> devices_matched = + cl::GetDeviceIDs(w->platform_id, device_type); + CHECK_GT(devices_matched.size(), 0U) + << "No OpenCL device matched given the options"; + w->devices = devices_matched; + cl_int err_code; + w->context = clCreateContext( + nullptr, w->devices.size(), &(w->devices[0]), + nullptr, nullptr, &err_code); + OPENCL_CHECK_ERROR(err_code); + CHECK_EQ(w->queues.size(), 0U); + for (size_t i = 0; i < w->devices.size(); ++i) { + cl_device_id did = w->devices[i]; + w->queues.push_back( + clCreateCommandQueue(w->context, did, 0, &err_code)); + OPENCL_CHECK_ERROR(err_code); + LOG(INFO) << "opencl(" << i + << ")=\'" << cl::GetDeviceInfo(did, CL_DEVICE_NAME) + << "\' cl_device_id=" << did; + } + return true; +} + +template<> +inline void* AllocDataSpace<kOpenCL>(TVMContext ctx, size_t size, size_t alignment) { + cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global(); + cl_int err_code; + cl_mem mptr = clCreateBuffer( + w->context, CL_MEM_READ_WRITE, size, nullptr, &err_code); + OPENCL_CHECK_ERROR(err_code); + return mptr; +} + +template<> +inline void FreeDataSpace<kOpenCL>(TVMContext ctx, void* ptr) { + cl_mem mptr = static_cast<cl_mem>(ptr); + OPENCL_CALL(clReleaseMemObject(mptr)); +} + +template<> +inline void CopyDataFromTo<kOpenCL>(const void* from, + void* to, + size_t size, + TVMContext ctx_from, + TVMContext ctx_to, + TVMStreamHandle stream) { + CHECK(stream == nullptr); + cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global(); + if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kOpenCL) { + OPENCL_CALL(clEnqueueCopyBuffer( + w->GetQueue(ctx_to), + static_cast<cl_mem>((void*)from), // NOLINT(*) + static_cast<cl_mem>(to), + 0, 0, size, 0, nullptr, nullptr)); + } else if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kCPU) { + OPENCL_CALL(clEnqueueReadBuffer( + 
w->GetQueue(ctx_from), + static_cast<cl_mem>((void*)from), // NOLINT(*) + CL_FALSE, 0, size, to, + 0, nullptr, nullptr)); + OPENCL_CALL(clFinish(w->GetQueue(ctx_from))); + } else if (ctx_from.dev_mask == kCPU && ctx_to.dev_mask == kOpenCL) { + OPENCL_CALL(clEnqueueWriteBuffer( + w->GetQueue(ctx_to), + static_cast<cl_mem>(to), + CL_FALSE, 0, size, from, + 0, nullptr, nullptr)); + OPENCL_CALL(clFinish(w->GetQueue(ctx_to))); + } else { + LOG(FATAL) << "Expect copy from/to GPU or between GPU"; + } +} + +template<> +inline void StreamSync<kOpenCL>(TVMContext ctx, TVMStreamHandle stream) { + CHECK(stream == nullptr); + cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global(); + OPENCL_CALL(clFinish(w->GetQueue(ctx))); +} + +} // namespace runtime +} // namespace tvm +#endif // TVM_OPENCL_RUNTIME +#endif // TVM_RUNTIME_DEVICE_API_OPENCL_H_ diff --git a/tests/python/test_runtime_ndarray.py b/tests/python/test_runtime_ndarray.py index 602343b3efef..6731c8f2394a 100644 --- a/tests/python/test_runtime_ndarray.py +++ b/tests/python/test_runtime_ndarray.py @@ -2,6 +2,7 @@ import numpy as np def enabled_ctx_list(): + tvm.init_opencl() ctx_list = [tvm.cpu(0), tvm.gpu(0), tvm.opencl(0)] ctx_list = [ctx for ctx in ctx_list if ctx.enabled] return ctx_list diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index 02cd71308f16..f1a070b5ac1e 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -16,13 +16,15 @@ fi cp make/config.mk config.mk echo "USE_CUDA=0" >> config.mk -echo "USE_OPENCL=0" >> config.mk -if [ ! ${TRAVIS_OS_NAME} == "osx" ]; then +if [ ${TRAVIS_OS_NAME} == "osx" ]; then + echo "USE_OPENCL=1" >> config.mk +else # use g++-4.8 for linux if [ ${CXX} == "g++" ]; then export CXX=g++-4.8 fi + echo "USE_OPENCL=0" >> config.mk fi if [ ${TASK} == "cpp_test" ] || [ ${TASK} == "all_test" ]; then