Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for multiple OpenCL platforms #1345

Merged
merged 10 commits into from
Jul 9, 2018
3 changes: 3 additions & 0 deletions cmake/config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ set(USE_CUDA OFF)
# - /path/to/rocm: use specific path to rocm
set(USE_ROCM OFF)

# Whether enable SDAccel runtime
set(USE_SDACCEL OFF)

# Whether enable OpenCL runtime
set(USE_OPENCL OFF)

Expand Down
12 changes: 12 additions & 0 deletions cmake/modules/OpenCL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ if(OpenCL_FOUND)
include_directories(${OpenCL_INCLUDE_DIRS})
endif(OpenCL_FOUND)

# Xilinx SDAccel support is layered on top of the OpenCL runtime.
if(USE_SDACCEL)
  message(STATUS "Build with SDAccel support")
  # Pick up the SDAccel-specific runtime sources.
  file(GLOB RUNTIME_SDACCEL_SRCS src/runtime/opencl/sdaccel/*.cc)
  list(APPEND RUNTIME_SRCS ${RUNTIME_SDACCEL_SRCS})
  # SDAccel requires the OpenCL runtime; force it on if it was left off.
  if(NOT USE_OPENCL)
    message(STATUS "Enable OpenCL support required for SDAccel")
    set(USE_OPENCL ON)
  endif()
else()
  # Compile the stub SDAccelModuleCreate (returns a source-only module)
  # so codegen that targets SDAccel still links when it is disabled.
  list(APPEND COMPILER_SRCS src/codegen/opt/build_sdaccel_off.cc)
endif(USE_SDACCEL)

if(USE_OPENCL)
find_package(OpenCL REQUIRED)
message(STATUS "Build with OpenCL support")
Expand Down
1 change: 1 addition & 0 deletions include/tvm/runtime/c_runtime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ typedef int64_t tvm_index_t;

/*! \brief Extension device types in TVM */
typedef enum {
kDLSDAccel = 6,
kDLVulkan = 7,
kOpenGL = 11,
// Extension DRAM type, used for quickly test extension device
Expand Down
3 changes: 2 additions & 1 deletion python/tvm/_ffi/runtime_ctypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class TVMContext(ctypes.Structure):
1 : 'cpu',
2 : 'gpu',
4 : 'opencl',
6 : 'sdaccel',
7 : 'vulkan',
8 : 'metal',
9 : 'vpi',
Expand All @@ -111,7 +112,7 @@ class TVMContext(ctypes.Structure):
'nvptx': 2,
'cl': 4,
'opencl': 4,
'sdaccel': 4,
'sdaccel': 6,
'vulkan': 7,
'metal': 8,
'vpi': 9,
Expand Down
52 changes: 1 addition & 51 deletions python/tvm/contrib/sdaccel.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,10 @@
"""Utility for Interacting with SDAccel Tools"""
import subprocess
import os
import re
from . import util
from ..api import register_func


def _vhls_to_opencl(code):
"""Convert source code from Vivado HLS to OpenCL."""
out = ''
for line in code.split('\n'):
if re.match(r'#include', line):
# OpenCL doesn't support include.
continue
if re.match(r'#pragma', line):
# Remove Vivado HLS specific pragmas.
continue

if re.match(r'extern "C"', line):
line = re.sub(r'^extern "C"', "__kernel", line)
# Add __global to pointer parameters.
line = re.sub(r'(\w+)\s*\*', r"__global \1*", line)

out += line + '\n'

return out


def _fake_compile_vhls(code):
    """Fake-compile Vivado HLS code for SDAccel.

    Translates the Vivado HLS code to OpenCL and builds it for a GPU
    device, producing a program binary that can stand in for an xclbin
    during testing.

    Parameters
    ----------
    code : str
        The Vivado HLS code.

    Return
    ------
    binary : bytearray
        The program binary which can be passed to clCreateProgramWithBinary
    """
    try:
        import pyopencl as cl
    except ImportError:
        raise ImportError('PyOpenCL is required for testing SDAccel backend.')
    opencl_source = _vhls_to_opencl(code)
    gpu_context = cl.Context(dev_type=cl.device_type.GPU)
    built_program = cl.Program(gpu_context, opencl_source).build()
    return bytearray(built_program.binaries[0])


@register_func("tvm_callback_sdaccel_compile")
def compile_vhls(code, kernel):
"""Compile Vivado HLS code for SDAccel.
Expand Down Expand Up @@ -87,9 +39,7 @@ def compile_vhls(code, kernel):
platform = os.environ.get("XCL_PLATFORM", os.environ.get("AWS_PLATFORM"))

if platform is None:
# If we don't have the Xilinx toolchain, create a program binary for
# GPU and use it for testing.
return _fake_compile_vhls(code)
raise RuntimeError("No Xlinx device specified.")

# build xo
args = [xocc, "-c", "-t", target, "--platform", platform, "-o", tmp_xo, "-k", kernel] + \
Expand Down
4 changes: 2 additions & 2 deletions src/codegen/codegen_vhls.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include <string>
#include "./codegen_vhls.h"
#include "./build_common.h"
#include "../runtime/opencl/opencl_module.h"
#include "../runtime/opencl/sdaccel/sdaccel_module.h"

namespace tvm {
namespace codegen {
Expand Down Expand Up @@ -91,7 +91,7 @@ runtime::Module BuildSDAccel(Array<LoweredFunc> funcs) {
} else {
LOG(FATAL) << "Cannot compile Vivado HLS code.";
}
return OpenCLModuleCreate(xclbin, "xclbin", ExtractFuncInfo(funcs), code);
return SDAccelModuleCreate(xclbin, "xclbin", ExtractFuncInfo(funcs), code);
}

TVM_REGISTER_API("codegen.build_sdaccel")
Expand Down
21 changes: 21 additions & 0 deletions src/codegen/opt/build_sdaccel_off.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*!
 *  Copyright (c) 2018 by Contributors
 *  Optional module when build sdaccel is switched to off
 */
#include "../codegen_source_base.h"
#include "../../runtime/opencl/opencl_module.h"

namespace tvm {
namespace runtime {

/*!
 * \brief Fallback for SDAccelModuleCreate, compiled in when the SDAccel
 *  runtime is disabled (USE_SDACCEL=OFF).
 *
 * Returns a source-only module carrying the generated code instead of a
 * runnable module, so codegen.build_sdaccel still succeeds.
 *
 * \param data The compiled binary data (presumably xclbin contents —
 *             see the SDAccel codegen caller).
 * \param fmt The format of \p data.
 * \param fmap Map from function names to function metadata.
 * \param source The generated kernel source code.
 * \return A device source module wrapping the inputs.
 */
Module SDAccelModuleCreate(
    std::string data,
    std::string fmt,
    std::unordered_map<std::string, FunctionInfo> fmap,
    std::string source) {
  // This stub is selected when USE_SDACCEL is OFF (see cmake/modules/
  // OpenCL.cmake), independent of the OpenCL setting — so the message
  // names SDAccel, not OpenCL.
  LOG(WARNING) << "SDAccel runtime not enabled, return a source module...";
  return codegen::DeviceSourceModuleCreate(data, fmt, fmap, "sdaccel");
}

}  // namespace runtime
}  // namespace tvm
6 changes: 5 additions & 1 deletion src/pass/verify_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class MemoryAccessVerifier final : protected IRVisitor {

/// Interface to perform memory access verification
void Run() {
  // Memory-access verification only applies to device functions
  // (GPU or FPGA targets); host functions are exempt.
  if (!IsGPUDevice(dev_type_) && !IsFPGADevice(dev_type_)) return;
  IRVisitor::Visit(func_->body);
}

Expand Down Expand Up @@ -143,6 +143,10 @@ class MemoryAccessVerifier final : protected IRVisitor {
kDLVulkan == dev_type || kDLMetal == dev_type ||
kDLROCM == dev_type || kOpenGL == dev_type;
}
/// Check if a given DLDeviceType/TVMDeviceExtType value denotes FPGA device.
static bool IsFPGADevice(int dev_type) {
  // SDAccel is currently the only FPGA device type.
  return dev_type == kDLSDAccel;
}

private:
/// Status of visitor
Expand Down
1 change: 1 addition & 0 deletions src/runtime/c_runtime_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ inline std::string DeviceName(int type) {
case kDLCPU: return "cpu";
case kDLGPU: return "gpu";
case kDLOpenCL: return "opencl";
case kDLSDAccel: return "sdaccel";
case kDLVulkan: return "vulkan";
case kDLMetal: return "metal";
case kDLVPI: return "vpi";
Expand Down
107 changes: 101 additions & 6 deletions src/runtime/opencl/opencl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
#include <string>
#include <vector>
#include "../workspace_pool.h"
#include "../pack_args.h"
#include "../thread_storage_scope.h"
#include "../meta_data.h"
#include "../file_util.h"

namespace tvm {
namespace runtime {
Expand Down Expand Up @@ -97,17 +101,23 @@ inline const char* CLGetErrorString(cl_int error) {
OPENCL_CHECK_ERROR(e); \
}

class OpenCLThreadEntry;

/*!
* \brief Process global OpenCL workspace.
*/
class OpenCLWorkspace final : public DeviceAPI {
class OpenCLWorkspace : public DeviceAPI {
public:
// global platform id
cl_platform_id platform_id;
// global platform name
std::string platform_name;
// global context of this process
cl_context context{nullptr};
// whether the workspace it initialized.
bool initialized_{false};
// the device type
std::string device_type;
// the devices
std::vector<cl_device_id> devices;
// the queues
Expand All @@ -128,10 +138,17 @@ class OpenCLWorkspace final : public DeviceAPI {
}
}
// Initialize the device.
void Init();
void Init(const std::vector<std::string>& device_types, const std::string& platform_name = "");
virtual void Init() {
Init({"gpu", "cpu"});
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only initialize GPU, to be consistent with the existing setting

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This initializes only GPU basically, and falls back to CPU only when GPU is not available. The behavior is consistent with the original one.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main problem is that the current opencl schedules assumes GPU, so they may not work when falling back to CPUs

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I'm fine with removing the fallback feature in this PR.

}
// Check whether the context is OpenCL or not.
virtual bool IsOpenCLDevice(TVMContext ctx) {
return ctx.device_type == kDLOpenCL;
}
// get the queue of the context
cl_command_queue GetQueue(TVMContext ctx) {
CHECK_EQ(ctx.device_type, kDLOpenCL);
CHECK(IsOpenCLDevice(ctx));
this->Init();
CHECK(ctx.device_id >= 0 && static_cast<size_t>(ctx.device_id) < queues.size())
<< "Invalid OpenCL device_id=" << ctx.device_id;
Expand All @@ -157,6 +174,12 @@ class OpenCLWorkspace final : public DeviceAPI {
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
void FreeWorkspace(TVMContext ctx, void* data) final;

/*!
* \brief Get the thread local ThreadEntry
*/
virtual OpenCLThreadEntry* GetThreadEntry();

// get the global workspace
static const std::shared_ptr<OpenCLWorkspace>& Global();
};
Expand All @@ -179,15 +202,87 @@ class OpenCLThreadEntry {
/*! \brief workspace pool */
WorkspacePool pool;
// constructor
OpenCLThreadEntry()
: pool(kDLOpenCL, OpenCLWorkspace::Global()) {
OpenCLThreadEntry(DLDeviceType device_type, std::shared_ptr<DeviceAPI> device)
: pool(device_type, device) {
context.device_id = 0;
context.device_type = kDLOpenCL;
context.device_type = device_type;
}
OpenCLThreadEntry()
: OpenCLThreadEntry(kDLOpenCL, OpenCLWorkspace::Global()) {}

// get the global workspace
static OpenCLThreadEntry* ThreadLocal();
};
} // namespace cl

// Module to support thread-safe multi-device execution.
// OpenCL runtime is a bit tricky because clSetKernelArg is not thread-safe
// To make the call thread-safe, we create a thread-local kernel table
// and lazily install new kernels into the kernel table when the kernel is called.
// The kernels are recycled when the module get destructed.
class OpenCLModuleNode : public ModuleNode {
 public:
  // Kernel table reference entry: identifies one kernel slot in the
  // thread-local kernel table, with a version for lazy re-installation.
  struct KTRefEntry {
    size_t kernel_id;
    size_t version;
  };
  /*!
   * \brief Construct the module from serialized program data.
   * \param data The binary program data.
   * \param fmt The format of the binary data.
   * \param fmap Map from function names to function metadata.
   * \param source The OpenCL source code, if available.
   */
  explicit OpenCLModuleNode(std::string data,
                            std::string fmt,
                            std::unordered_map<std::string, FunctionInfo> fmap,
                            std::string source)
      : data_(data), fmt_(fmt), fmap_(fmap), source_(source) {}
  // destructor; recycles the kernels created by this module
  ~OpenCLModuleNode();

  /*!
   * \brief Get the global workspace
   */
  virtual std::shared_ptr<cl::OpenCLWorkspace> GetGlobalWorkspace();

  /*! \brief The type key of this module. */
  virtual const char* type_key() const;

  PackedFunc GetFunction(
      const std::string& name,
      const std::shared_ptr<ModuleNode>& sptr_to_self) final;
  void SaveToFile(const std::string& file_name,
                  const std::string& format) final;
  void SaveToBinary(dmlc::Stream* stream) final;
  std::string GetSource(const std::string& format) final;
  // Initialize the programs
  void Init();
  // install a new kernel to thread local entry (clSetKernelArg is not
  // thread-safe, hence the per-thread kernel table)
  cl_kernel InstallKernel(cl::OpenCLWorkspace* w,
                          cl::OpenCLThreadEntry* t,
                          const std::string& func_name,
                          const KTRefEntry& e);

 protected:
  // The workspace, need to keep reference to use it in destructor.
  // In case of static destruction order problem.
  std::shared_ptr<cl::OpenCLWorkspace> workspace_;
  // the binary data
  std::string data_;

 private:
  // The format
  std::string fmt_;
  // function information table.
  std::unordered_map<std::string, FunctionInfo> fmap_;
  // Module local mutex
  std::mutex build_lock_;
  // The OpenCL source.
  std::string source_;
  // the OpenCL program object
  cl_program program_{nullptr};
  // build info
  std::vector<bool> device_built_flag_;
  // kernel id cache
  std::unordered_map<std::string, KTRefEntry> kid_map_;
  // kernels built so far; recycled when the module is destructed.
  std::vector<cl_kernel> kernels_;
};

} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
Loading