From c29a2927f7f8f531a8cac73b5ad56d183b0563c6 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 17 Jan 2017 18:59:40 -0800
Subject: [PATCH] [RUNTIME] Enable OpenCL

---
 Makefile                             |  17 +-
 include/tvm/c_runtime_api.h          |  17 ++
 make/config.mk                       |   3 +
 python/tvm/__init__.py               |   2 +-
 python/tvm/_ctypes/_runtime_api.py   |  26 ++-
 python/tvm/ndarray.py                |   1 +
 python/tvm/schedule.py               |   2 +-
 src/runtime/c_runtime_api.cc         |  17 ++
 src/runtime/device_api.h             |  20 +-
 src/runtime/device_api_gpu.h         |  11 +-
 src/runtime/device_api_opencl.h      | 310 +++++++++++++++++++++++++++
 tests/python/test_runtime_ndarray.py |   1 +
 tests/travis/run_test.sh             |   6 +-
 13 files changed, 415 insertions(+), 18 deletions(-)
 create mode 100644 src/runtime/device_api_opencl.h

diff --git a/Makefile b/Makefile
index 514ffa665491..0cee2e36ed15 100644
--- a/Makefile
+++ b/Makefile
@@ -26,6 +26,7 @@ endif
 export LDFLAGS = -pthread -lm
 export CFLAGS =  -std=c++11 -Wall -O2\
 	 -Iinclude -Idmlc-core/include -IHalideIR/src  -fPIC
+export FRAMEWORKS=
 
 ifneq ($(ADD_CFLAGS), NONE)
 	CFLAGS += $(ADD_CFLAGS)
@@ -43,6 +44,20 @@ else
 	CFLAGS += -DTVM_CUDA_RUNTIME=0
 endif
 
+
+ifeq ($(USE_OPENCL), 1)  # enable the OpenCL runtime and link an OpenCL library
+	CFLAGS += -DTVM_OPENCL_RUNTIME=1
+	UNAME_S := $(shell uname -s)
+	ifeq ($(UNAME_S), Darwin)  # on Darwin OpenCL is linked via -framework
+		FRAMEWORKS += -framework OpenCL
+	else
+		LDFLAGS += -lOpenCL
+	endif
+else  # TVM_OPENCL_RUNTIME=0 compiles the body of device_api_opencl.h out
+	CFLAGS += -DTVM_OPENCL_RUNTIME=0
+endif
+
+
 include tests/cpp/unittest.mk
 
 test: $(TEST)
@@ -59,7 +74,7 @@ lib/libtvm.a: $(ALL_DEP)
 
 lib/libtvm.so: $(ALL_DEP)
 	@mkdir -p $(@D)
-	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
+	$(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
 
 $(LIB_HALIDE_IR): LIBHALIDEIR
 
diff --git a/include/tvm/c_runtime_api.h b/include/tvm/c_runtime_api.h
index 7be198b122ea..1a21adc41cd6 100644
--- a/include/tvm/c_runtime_api.h
+++ b/include/tvm/c_runtime_api.h
@@ -150,6 +150,23 @@ typedef TVMArray* TVMArrayHandle;
  */
 TVM_DLL const char *TVMGetLastError(void);
 
+/*!
+ * \brief Initialize devices of the given type. Not every device type
+ *  needs explicit initialization, but OpenCL does.
+ *
+ * \param dev_mask The device mask of the device type to be initialized.
+ * \param option_keys Additional option keys to pass.
+ * \param option_vals Additional option values to pass, parallel to option_keys.
+ * \param num_options Number of options passed in.
+ * \param out_code Set to 1 on success, 0 if the device was already initialized.
+ * \return Whether the function is successful.
+ */
+TVM_DLL int TVMDeviceInit(int dev_mask,
+                          const char** option_keys,
+                          const char** option_vals,
+                          int num_options,
+                          int *out_code);
+
 /*!
  * \brief Whether the specified context is enabled.
  *
diff --git a/make/config.mk b/make/config.mk
index 955bc6c8cb3c..26530827ea0e 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -37,6 +37,9 @@ ADD_CFLAGS =
 # whether use CUDA during compile
 USE_CUDA = 1
 
+# whether use OpenCL during compile
+USE_OPENCL = 0
+
 # add the path to CUDA library to link and compile flag
 # if you have already add them to environment variable, leave it as NONE
 # USE_CUDA_PATH = /usr/local/cuda
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 00729bcf6a85..b3a376de3d9f 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -12,7 +12,7 @@
 from . import schedule
 
 from . import ndarray as nd
-from .ndarray import cpu, gpu, opencl
+from .ndarray import cpu, gpu, opencl, init_opencl
 
 from ._base import TVMError
 from .function import *
diff --git a/python/tvm/_ctypes/_runtime_api.py b/python/tvm/_ctypes/_runtime_api.py
index d12f727f6f8a..5b3b81904465 100644
--- a/python/tvm/_ctypes/_runtime_api.py
+++ b/python/tvm/_ctypes/_runtime_api.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 from .._base import _LIB
-from .._base import c_array
+from .._base import c_array, c_str
 from .._base import check_call
 
 
@@ -182,6 +182,30 @@ def sync(ctx):
     check_call(_LIB.TVMSynchronize(ctx, None))
 
 
+def init_opencl(**kwargs):
+    """Initialize the OpenCL runtime with the given options.
+
+    Parameters
+    ----------
+    kwargs : dict
+        Option names and values; both are converted with c_str for TVMDeviceInit.
+
+    Returns
+    -------
+    bool
+        True when the out_code reported by TVMDeviceInit is non-zero.
+    """
+    keys = [c_str(k) for k in kwargs]
+    vals = [c_str(v) for v in kwargs.values()]
+    out_code = ctypes.c_int()
+    # 4 is presumably the kOpenCL device mask -- confirm against TVMDeviceMask
+    check_call(_LIB.TVMDeviceInit(
+        ctypes.c_int(4), c_array(ctypes.c_char_p, keys),
+        c_array(ctypes.c_char_p, vals), ctypes.c_int(len(keys)),
+        ctypes.byref(out_code)))
+    return out_code.value != 0
+
+
 class NDArrayBase(object):
     """A simple Device/CPU Array object in runtime."""
     __slots__ = ["handle"]
diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py
index fc74c28fde20..eafc065b1683 100644
--- a/python/tvm/ndarray.py
+++ b/python/tvm/ndarray.py
@@ -9,6 +9,7 @@
 from ._ctypes._runtime_api import TVMContext, TVMDataType, NDArrayBase
 from ._ctypes._runtime_api import cpu, gpu, opencl, empty, sync
 from ._ctypes._runtime_api import _init_runtime_module
+from ._ctypes._runtime_api import init_opencl
 
 
 class NDArray(NDArrayBase):
diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py
index a8ecb97bf27b..b46c5866082f 100644
--- a/python/tvm/schedule.py
+++ b/python/tvm/schedule.py
@@ -24,7 +24,7 @@ def __getitem__(self, k):
             k = k.op
         if not isinstance(k, _tensor.Operation):
             raise ValueError("Expect schedule key to be Tensor or Operation")
-        if not k in self.stage_map:
+        if k not in self.stage_map:
             raise ValueError("Cannot find the operation %s in schedule" % (str(k)))
         return self.stage_map[k]
 
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index efa355a3b3a1..e68790b583cf 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -64,6 +64,23 @@ inline size_t GetDataAlignment(TVMArray* arr) {
 
 using namespace tvm::runtime;
 
+int TVMDeviceInit(int dev_mask,
+                  const char** option_keys,
+                  const char** option_vals,
+                  int num_options,
+                  int* out_code) {
+  API_BEGIN();
+  // Device types without an init routine here simply report 1.
+  *out_code = 1;
+  if (dev_mask == kOpenCL) {
+    // DeviceInit returns bool, stored as 1 (true) or 0 (false)
+    // per the conversion to int.
+    *out_code = DeviceInit<kOpenCL>(
+        option_keys, option_vals, num_options);
+  }
+  API_END();
+}
+
 int TVMContextEnabled(TVMContext ctx,
                       int* out_enabled) {
   API_BEGIN();
diff --git a/src/runtime/device_api.h b/src/runtime/device_api.h
index b74b41ae245b..c2b163624786 100644
--- a/src/runtime/device_api.h
+++ b/src/runtime/device_api.h
@@ -1,6 +1,6 @@
 /*!
  *  Copyright (c) 2016 by Contributors
- * \file device_api.hx
+ * \file device_api.h
  * \brief Device specific API
  */
 #ifndef TVM_RUNTIME_DEVICE_API_H_
@@ -11,6 +11,21 @@
 
 namespace tvm {
 namespace runtime {
+/*!
+ * \brief Initialize the device.
+ * \param option_keys Additional option keys to pass.
+ * \param option_vals Additional option values to pass.
+ * \param num_options Number of options to be passed into it.
+ * \return true on success, false if already initialized (this default always returns true)
+ * \tparam xpu The device mask.
+ */
+template<TVMDeviceMask xpu>
+inline bool DeviceInit(const char** option_keys,
+                       const char** option_vals,
+                       int num_options) {
+  return true;
+}
+
 /*!
  * \brief Whether ctx is enabled.
  * \param ctx The device context to perform operation.
@@ -93,7 +108,8 @@ inline void StreamSync(TVMContext ctx, TVMStreamHandle stream);
 }  // namespace runtime
 }  // namespace tvm
 
-#include "./device_api_gpu.h"
 #include "./device_api_cpu.h"
+#include "./device_api_gpu.h"
+#include "./device_api_opencl.h"
 
 #endif  // TVM_RUNTIME_DEVICE_API_H_
diff --git a/src/runtime/device_api_gpu.h b/src/runtime/device_api_gpu.h
index 970450657b2e..b18a95dcb0a6 100644
--- a/src/runtime/device_api_gpu.h
+++ b/src/runtime/device_api_gpu.h
@@ -1,6 +1,6 @@
 /*!
  *  Copyright (c) 2016 by Contributors
- * \file ctxice_api_gpu.h
+ * \file device_api_gpu.h
  * \brief GPU specific API
  */
 #ifndef TVM_RUNTIME_DEVICE_API_GPU_H_
@@ -14,15 +14,6 @@
 
 namespace tvm {
 namespace runtime {
-/*!
- * \brief Check CUDA error.
- * \param msg Message to print if an error occured.
- */
-#define CHECK_CUDA_ERROR(msg)                                           \
-  {                                                                     \
-    cudaError_t e = cudaGetLastError();                                 \
-    CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \
-  }
 
 /*!
  * \brief Protected CUDA call.
diff --git a/src/runtime/device_api_opencl.h b/src/runtime/device_api_opencl.h
new file mode 100644
index 000000000000..257262beb0d7
--- /dev/null
+++ b/src/runtime/device_api_opencl.h
@@ -0,0 +1,310 @@
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file device_api_opencl.h
+ * \brief OpenCL specific API
+ */
+#ifndef TVM_RUNTIME_DEVICE_API_OPENCL_H_
+#define TVM_RUNTIME_DEVICE_API_OPENCL_H_
+
+#if TVM_OPENCL_RUNTIME
+
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include <CL/opencl.h>
+#endif
+
+#include <mutex>
+#include <string>
+#include <vector>
+
+
+namespace tvm {
+namespace runtime {
+namespace cl {
+
+static_assert(sizeof(cl_mem) ==sizeof(void*),
+              "Required to store cl_mem inside void*");
+
+inline const char* CLGetErrorString(cl_int error) {  // status code -> constant name
+  switch (error) {
+    case CL_SUCCESS: return "CL_SUCCESS";
+    case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND";
+    case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE";
+    case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE";
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+    case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES";
+    case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY";
+    case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE";
+    case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP";
+    case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH";
+    case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+    case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE";
+    case CL_MAP_FAILURE: return "CL_MAP_FAILURE";
+    case CL_INVALID_VALUE: return "CL_INVALID_VALUE";
+    case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE";
+    case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM";
+    case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE";
+    case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT";
+    case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES";
+    case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE";
+    case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR";
+    case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT";
+    case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+    case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE";
+    case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER";
+    case CL_INVALID_BINARY: return "CL_INVALID_BINARY";
+    case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS";
+    case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM";
+    case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE";
+    case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME";
+    case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION";
+    case CL_INVALID_KERNEL: return "CL_INVALID_KERNEL";
+    case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX";
+    case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE";
+    case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE";
+    case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS";
+    case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION";
+    case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE";
+    case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE";
+    case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET";
+    case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST";
+    case CL_INVALID_EVENT: return "CL_INVALID_EVENT";
+    case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION";
+    case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT";
+    case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE";
+    case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL";
+    default: return "Unknown OpenCL error code";  // any code without a case above
+  }
+}
+
+/*!
+ * \brief OPENCL_CHECK_ERROR(e) fails a CHECK unless status e is CL_SUCCESS;
+ *  OPENCL_CALL(func) evaluates func once and applies that check to its result.
+ */
+#define OPENCL_CHECK_ERROR(e)                                           \
+  {                                                                     \
+    CHECK(e == CL_SUCCESS)                                              \
+        << "OpenCL Error, code=" << e << ": " << cl::CLGetErrorString(e); \
+  }
+
+#define OPENCL_CALL(func)                                             \
+  {                                                                   \
+    cl_int e = (func);                                                \
+    OPENCL_CHECK_ERROR(e);                                            \
+  }
+
+// Process-local OpenCL workspace; Global() hands out the single static instance
+class OpenCLWorkspace {
+ public:
+  // platform selected when the workspace is initialized
+  cl_platform_id platform_id;
+  // context shared by this process; nullptr until initialized
+  cl_context context{nullptr};
+  // devices the context is created over
+  std::vector<cl_device_id> devices;
+  // one command queue per device, looked up by dev_id
+  std::vector<cl_command_queue> queues;
+  // mutex intended to guard initialization
+  std::mutex mu;
+  // releases the context if one exists; the queues are not released here
+  ~OpenCLWorkspace() {
+    if (context != nullptr) {
+      OPENCL_CALL(clReleaseContext(context));
+    }
+  }
+  // whether the workspace is initialized, i.e. a context has been created
+  inline bool initialized() const {
+    return context != nullptr;
+  }
+  // queue for ctx; CHECKs the device mask, initialization and dev_id range
+  cl_command_queue GetQueue(TVMContext ctx) const {
+    CHECK_EQ(ctx.dev_mask, kOpenCL);
+    CHECK(initialized())
+        << "The OpenCL is not initialized";
+    CHECK(ctx.dev_id >= 0  && static_cast<size_t>(ctx.dev_id) < queues.size())
+        << "Invalid OpenCL dev_id=" << ctx.dev_id;
+    return queues[ctx.dev_id];
+  }
+  // the global workspace, held as a function-local static
+  static OpenCLWorkspace* Global() {
+    static OpenCLWorkspace inst;
+    return &inst;
+  }
+};
+
+// Read a platform info string: query its size, then fetch into a buffer of that size.
+inline std::string GetPlatformInfo(
+    cl_platform_id pid, cl_platform_info param_name) {
+  size_t nbytes;
+  OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &nbytes));
+  std::string info(nbytes, '\0');
+  OPENCL_CALL(clGetPlatformInfo(pid, param_name, nbytes, &info[0], nullptr));
+  return info;
+}
+
+// Read a device info string: query its size, then fetch into a buffer of that size.
+inline std::string GetDeviceInfo(
+    cl_device_id pid, cl_device_info param_name) {
+  size_t nbytes;
+  OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &nbytes));
+  std::string info(nbytes, '\0');
+  OPENCL_CALL(clGetDeviceInfo(pid, param_name, nbytes, &info[0], nullptr));
+  return info;
+}
+
+// List all platform ids: query the count, then fetch that many entries.
+inline std::vector<cl_platform_id> GetPlatformIDs() {
+  cl_uint count;
+  OPENCL_CALL(clGetPlatformIDs(0, nullptr, &count));
+  std::vector<cl_platform_id> ids(count);
+  OPENCL_CALL(clGetPlatformIDs(count, &ids[0], nullptr));
+  return ids;
+}
+
+// Devices of platform pid filtered by device_type: "cpu", "gpu", "accelerator"; else all.
+inline std::vector<cl_device_id> GetDeviceIDs(
+    cl_platform_id pid, std::string device_type) {
+  cl_device_type dtype = CL_DEVICE_TYPE_ALL;
+  if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
+  if (device_type == "gpu") dtype = CL_DEVICE_TYPE_GPU;
+  if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
+  cl_uint ret_size;
+  OPENCL_CALL(clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size));
+  std::vector<cl_device_id> ret(ret_size);
+  OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, &ret[0], nullptr));
+  return ret;
+}
+
+// True when value is empty or occurs as a substring of the platform's info string.
+inline bool MatchPlatformInfo(
+    cl_platform_id pid,
+    cl_platform_info param_name,
+    std::string value) {
+  if (value.empty()) return true;
+  return GetPlatformInfo(pid, param_name).find(value) != std::string::npos;
+}
+
+}  // namespace cl
+
+// Initialize the process-wide OpenCL workspace: pick a platform, create one
+// context over its matching devices and one command queue per device.
+// Options: "platform_name" (substring match), "device_type" (see GetDeviceIDs).
+// Returns true when this call did the initialization, false if already done.
+template<>
+inline bool DeviceInit<kOpenCL>(const char** option_keys,
+                                const char** option_vals,
+                                int num_options) {
+  cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
+  // The guard must be a named object so the mutex stays held until return.
+  std::lock_guard<std::mutex> lock(w->mu);
+  if (w->initialized()) return false;
+  // matching conditions
+  std::string platform_name, device_type;
+  for (int i = 0; i < num_options; ++i) {
+    std::string key = option_keys[i];
+    std::string val = option_vals[i];
+    if (key == "platform_name") {
+      platform_name = val;
+    } else if (key == "device_type") {
+      device_type = val;
+    } else {
+      LOG(FATAL) << "unknown DeviceInit option " << key;
+    }
+  }
+  // matched platforms
+  std::vector<cl_platform_id> platform_matched;
+  for (cl_platform_id pid : cl::GetPlatformIDs()) {
+    if (!cl::MatchPlatformInfo(pid, CL_PLATFORM_NAME, platform_name)) continue;
+    platform_matched.push_back(pid);
+  }
+  if (platform_matched.size() == 0) {
+    LOG(FATAL) << "No OpenCL platform matched given existing options ...";
+  }
+  if (platform_matched.size() > 1) {
+    LOG(WARNING) << "Multiple OpenCL platforms matched, use the first one ... ";
+  }
+  w->platform_id = platform_matched[0];
+  LOG(INFO) << "Initialize OpenCL platform \'"
+            << cl::GetPlatformInfo(w->platform_id, CL_PLATFORM_NAME) << '\'';
+  std::vector<cl_device_id> devices_matched =
+      cl::GetDeviceIDs(w->platform_id, device_type);
+  CHECK_GT(devices_matched.size(), 0U) << "No OpenCL device matched the given options";
+  w->devices = devices_matched;
+  cl_int err_code;
+  w->context = clCreateContext(
+      nullptr, w->devices.size(), &(w->devices[0]), nullptr, nullptr, &err_code);
+  OPENCL_CHECK_ERROR(err_code);
+  CHECK_EQ(w->queues.size(), 0U);
+  for (size_t i = 0; i < w->devices.size(); ++i) {
+    cl_device_id did = w->devices[i];
+    w->queues.push_back(clCreateCommandQueue(w->context, did, 0, &err_code));
+    OPENCL_CHECK_ERROR(err_code);
+    LOG(INFO) << "opencl(" << i
+              << ")=\'" << cl::GetDeviceInfo(did, CL_DEVICE_NAME)
+              << "\' cl_device_id=" << did;
+  }
+  return true;
+}
+
+template<>
+inline void* AllocDataSpace<kOpenCL>(TVMContext ctx, size_t size, size_t alignment) {
+  // read-write buffer in the workspace context; ctx and alignment are not consulted
+  cl_int status;
+  cl_mem buffer = clCreateBuffer(
+      cl::OpenCLWorkspace::Global()->context, CL_MEM_READ_WRITE, size, nullptr, &status);
+  OPENCL_CHECK_ERROR(status);
+  return buffer;
+}
+
+template<>
+inline void FreeDataSpace<kOpenCL>(TVMContext ctx, void* ptr) {
+  // ptr carries a cl_mem handle stored as void*; ctx is not consulted
+  OPENCL_CALL(clReleaseMemObject(static_cast<cl_mem>(ptr)));
+}
+
+template<>
+inline void CopyDataFromTo<kOpenCL>(const void* from,
+                                    void* to,
+                                    size_t size,
+                                    TVMContext ctx_from,
+                                    TVMContext ctx_to,
+                                    TVMStreamHandle stream) {
+  // Streams are not supported. Host<->device copies wait on clFinish, while
+  // the device-to-device copy is only enqueued on the destination queue.
+  CHECK(stream == nullptr);
+  cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
+  if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kOpenCL) {
+    OPENCL_CALL(clEnqueueCopyBuffer(
+        w->GetQueue(ctx_to),
+        static_cast<cl_mem>(const_cast<void*>(from)),
+        static_cast<cl_mem>(to),
+        0, 0, size, 0, nullptr, nullptr));
+  } else if (ctx_from.dev_mask == kOpenCL && ctx_to.dev_mask == kCPU) {
+    OPENCL_CALL(clEnqueueReadBuffer(
+        w->GetQueue(ctx_from),
+        static_cast<cl_mem>(const_cast<void*>(from)),
+        CL_FALSE, 0, size, to, 0, nullptr, nullptr));
+    OPENCL_CALL(clFinish(w->GetQueue(ctx_from)));
+  } else if (ctx_from.dev_mask == kCPU && ctx_to.dev_mask == kOpenCL) {
+    OPENCL_CALL(clEnqueueWriteBuffer(
+        w->GetQueue(ctx_to),
+        static_cast<cl_mem>(to),
+        CL_FALSE, 0, size, from, 0, nullptr, nullptr));
+    OPENCL_CALL(clFinish(w->GetQueue(ctx_to)));
+  } else {
+    LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL devices";
+  }
+}
+
+template<>
+inline void StreamSync<kOpenCL>(TVMContext ctx, TVMStreamHandle stream) {
+  CHECK(stream == nullptr);  // only the default (null) stream is accepted
+  // block until every command queued on ctx's device queue has finished
+  OPENCL_CALL(clFinish(cl::OpenCLWorkspace::Global()->GetQueue(ctx)));
+}
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_OPENCL_RUNTIME
+#endif  // TVM_RUNTIME_DEVICE_API_OPENCL_H_
diff --git a/tests/python/test_runtime_ndarray.py b/tests/python/test_runtime_ndarray.py
index 602343b3efef..6731c8f2394a 100644
--- a/tests/python/test_runtime_ndarray.py
+++ b/tests/python/test_runtime_ndarray.py
@@ -2,6 +2,7 @@
 import numpy as np
 
 def enabled_ctx_list():
+    tvm.init_opencl()
     ctx_list = [tvm.cpu(0), tvm.gpu(0), tvm.opencl(0)]
     ctx_list = [ctx for ctx in ctx_list if ctx.enabled]
     return ctx_list
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index 02cd71308f16..f1a070b5ac1e 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -16,13 +16,15 @@ fi
 
 cp make/config.mk config.mk
 echo "USE_CUDA=0" >> config.mk
-echo "USE_OPENCL=0" >> config.mk
 
-if [ ! ${TRAVIS_OS_NAME} == "osx" ]; then
+if [ ${TRAVIS_OS_NAME} == "osx" ]; then
+    echo "USE_OPENCL=1" >> config.mk
+else
     # use g++-4.8 for linux
     if [ ${CXX} == "g++" ]; then
         export CXX=g++-4.8
     fi
+    echo "USE_OPENCL=0" >> config.mk
 fi
 
 if [ ${TASK} == "cpp_test" ] || [ ${TASK} == "all_test" ]; then