diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index f3b03570db33..55eb5c0ac9c4 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -723,6 +723,7 @@ build_ubuntu_gpu_mkldnn() {
     CC=gcc-7 CXX=g++-7 cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_TVM_OP=ON \
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_CPP_PACKAGE=ON \
@@ -736,6 +737,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
     CC=gcc-7 CXX=g++-7 cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_TVM_OP=ON \
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_CUDNN=OFF \
@@ -750,6 +752,7 @@ build_ubuntu_gpu_cuda101_cudnn7() {
     CC=gcc-7 CXX=g++-7 cmake \
         -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
         -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_TVM_OP=ON \
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
         -DUSE_CUDNN=ON \
@@ -789,6 +792,7 @@ build_ubuntu_gpu_cuda101_cudnn7_make() {
         USE_CUDA=1 \
         USE_CUDA_PATH=/usr/local/cuda \
         USE_CUDNN=1 \
+        USE_TVM_OP=1 \
         USE_CPP_PACKAGE=1 \
         USE_DIST_KVSTORE=1 \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
@@ -808,6 +812,7 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() {
         USE_CUDA=1 \
         USE_CUDA_PATH=/usr/local/cuda \
         USE_CUDNN=1 \
+        USE_TVM_OP=0 \
         USE_CPP_PACKAGE=1 \
         USE_DIST_KVSTORE=1 \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
@@ -817,6 +822,23 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() {
     make cython PYTHON=python3
 }
 
+build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() {
+    set -ex
+    cd /work/build
+    CC=gcc-7 CXX=g++-7 cmake \
+        -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_TVM_OP=OFF \
+        -DUSE_CUDA=ON \
+        -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
+        -DUSE_CUDNN=ON \
+        -DUSE_MKLDNN=OFF \
+        -DBUILD_CYTHON_MODULES=ON \
+        -DUSE_DIST_KVSTORE=ON \
+        -G Ninja /work/mxnet
+    ninja
+}
+
 build_ubuntu_amalgamation() {
     set -ex
     # Amalgamation can not be run with -j nproc
@@ -847,6 +869,7 @@ build_ubuntu_gpu_cmake() {
         -DUSE_SIGNAL_HANDLER=ON \
         -DUSE_CUDA=ON \
         -DUSE_CUDNN=ON \
+        -DUSE_TVM_OP=ON \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLML_MKL=OFF \
         -DUSE_MKLDNN=OFF \
@@ -867,6 +890,7 @@ build_ubuntu_gpu_cmake_no_rtc() {
         -DUSE_SIGNAL_HANDLER=ON \
         -DUSE_CUDA=ON \
         -DUSE_CUDNN=ON \
+        -DUSE_TVM_OP=ON \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLML_MKL=OFF \
         -DUSE_MKLDNN=ON \
@@ -881,6 +905,27 @@ build_ubuntu_gpu_cmake_no_rtc() {
     ninja
 }
 
+build_ubuntu_gpu_cmake_no_tvm_op() {
+    set -ex
+    cd /work/build
+    CC=gcc-7 CXX=g++-7 cmake \
+        -DUSE_SIGNAL_HANDLER=ON \
+        -DUSE_CUDA=ON \
+        -DUSE_CUDNN=ON \
+        -DUSE_TVM_OP=OFF \
+        -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_MKLML_MKL=OFF \
+        -DUSE_MKLDNN=OFF \
+        -DUSE_DIST_KVSTORE=ON \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
+        -DBUILD_CYTHON_MODULES=1 \
+        -G Ninja \
+        /work/mxnet
+
+    ninja
+}
+
 build_ubuntu_cpu_large_tensor() {
     set -ex
     cd /work/build
@@ -903,6 +948,7 @@ build_ubuntu_gpu_large_tensor() {
         -DUSE_SIGNAL_HANDLER=ON \
         -DUSE_CUDA=ON \
         -DUSE_CUDNN=ON \
+        -DUSE_TVM_OP=ON \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_MKLML_MKL=OFF \
         -DUSE_MKLDNN=OFF \
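Every GPU build variant above now compiles the TVM-generated operators explicitly (`USE_TVM_OP=ON` / `USE_TVM_OP=1`), while the new `*_no_tvm_op` functions build the same configurations with the feature disabled. A quick way to confirm which path a given binary actually took is to query the runtime feature list; the snippet below is a hypothetical sanity check, not part of this diff, and assumes the `TVM_OP` flag is exposed through `mxnet.runtime` as in recent 1.x builds:

```python
# Hedged sanity check (not part of this diff): report whether the MXNet
# library under test was compiled with USE_TVM_OP. Assumes 'TVM_OP' appears
# in the libinfo feature list, as in recent MXNet 1.x releases.
from mxnet.runtime import Features

features = Features()
print('TVM_OP enabled:', features.is_enabled('TVM_OP'))
```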
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 8ac63ef5a9d6..61640182b3ca 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -310,6 +310,20 @@ def compile_unix_full_gpu_mkldnn_cpp_test(lib_name) {
   }]
 }
 
+def compile_unix_full_gpu_no_tvm_op() {
+  return ['GPU: CUDA10.1+cuDNN7 TVM_OP OFF': {
+    node(NODE_LINUX_CPU) {
+      ws('workspace/build-gpu-no-tvm-op') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.init_git()
+          utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op', false)
+          utils.pack_lib('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op)
+        }
+      }
+    }
+  }]
+}
+
 def compile_unix_cmake_gpu(lib_name) {
   return ['GPU: CMake': {
     node(NODE_LINUX_CPU) {
@@ -324,6 +338,19 @@ def compile_unix_cmake_gpu(lib_name) {
   }]
 }
 
+def compile_unix_cmake_gpu_no_tvm_op() {
+  return ['GPU: CMake TVM_OP OFF': {
+    node(NODE_LINUX_CPU) {
+      ws('workspace/build-cmake-gpu-no-tvm-op') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.init_git()
+          utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_tvm_op', false)
+        }
+      }
+    }
+  }]
+}
+
 def compile_unix_cmake_gpu_no_rtc(lib_name) {
   return ['GPU: CMake CUDA RTC OFF': {
     node(NODE_LINUX_CPU) {
@@ -338,6 +365,22 @@ def compile_unix_cmake_gpu_no_rtc(lib_name) {
   }]
 }
 
+def test_unix_python3_gpu_no_tvm_op() {
+  return ['Python3: GPU TVM_OP OFF': {
+    node(NODE_LINUX_GPU) {
+      ws('workspace/ut-python3-gpu-no-tvm-op') {
+        try {
+          utils.unpack_and_init('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op)
+          python3_gpu_ut_cython('ubuntu_gpu_cu101')
+          utils.publish_test_coverage()
+        } finally {
+          utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_gpu.xml')
+        }
+      }
+    }
+  }]
+}
+
 def compile_unix_tensorrt_gpu(lib_name) {
   return ['TensorRT': {
     node(NODE_LINUX_CPU) {
diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu
index 84ac2bcb4623..c35af02598f2 100644
--- a/ci/jenkins/Jenkinsfile_unix_gpu
+++ b/ci/jenkins/Jenkinsfile_unix_gpu
@@ -42,6 +42,8 @@ core_logic: {
     custom_steps.compile_unix_cmake_gpu('cmake_gpu'),
     custom_steps.compile_unix_tensorrt_gpu('tensorrt'),
     custom_steps.compile_unix_int64_gpu('gpu_int64'),
+    custom_steps.compile_unix_full_gpu_no_tvm_op(),
+    custom_steps.compile_unix_cmake_gpu_no_tvm_op(),
     custom_steps.compile_unix_cmake_gpu_no_rtc('gpu_no_rtc'),
     custom_steps.compile_unix_full_gpu_mkldnn_cpp_test('gpu_mkldnn_cpp_test_make')
   ])
@@ -61,6 +63,7 @@ core_logic: {
     // TODO(szha): fix and reenable the hanging issue. tracked in #18098
     // custom_steps.test_unix_distributed_kvstore_gpu('gpu'),
     custom_steps.test_unix_byteps_gpu('gpu'),
+    custom_steps.test_unix_python3_gpu_no_tvm_op(),
     custom_steps.test_unix_capi_cpp_package('gpu_mkldnn_cpp_test_make'),
   ])
 }
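The new `test_unix_python3_gpu_no_tvm_op` stage runs the Python GPU unit tests against the `gpu_no_tvm_op` artifact, so any operator that would normally dispatch to a TVM-generated kernel has to take its native fallback. A minimal smoke test of that fallback path could look like the sketch below; `rad2deg` is only assumed here to be one of the TVM-backed operators, since the exact operator set is not stated in this diff:

```python
# Hedged smoke test (illustrative only): run an operator that is assumed to
# have a TVM-generated GPU kernel when USE_TVM_OP=ON and a native fallback
# otherwise. Both builds should produce the same numerical result.
import math
import mxnet as mx
from mxnet import np, npx

npx.set_np()  # enable NumPy-compatible semantics
x = np.array([0.0, math.pi / 2, math.pi], ctx=mx.gpu(0))
print(np.rad2deg(x))  # expected: [0. 90. 180.] with or without TVM ops
```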
diff --git a/contrib/tvmop/compile.py b/contrib/tvmop/compile.py
index 6341e70fe766..a104f0861f2d 100644
--- a/contrib/tvmop/compile.py
+++ b/contrib/tvmop/compile.py
@@ -152,6 +152,12 @@ def get_cuda_arch(arch):
     # we create libtvmop.o first, which gives us chance to link tvm_runtime together with the libtvmop
     # to allow mxnet find external helper functions in libtvm_runtime
     func_binary.save(arguments.target_path + "/libtvmop.o")
+    try:
+        func_binary.imported_modules
+    except AttributeError:
+        func_binary.imported_modules = []
+    if len(func_binary.imported_modules):
+        func_binary.imported_modules[0].save(arguments.target_path + "/libtvmop.cubin")
     ld_path = arguments.target_path if arguments.ld_path is None else arguments.ld_path
     create_shared(arguments.target_path + "/libtvmop.so",
                   arguments.target_path + "/libtvmop.o",
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index ea39d9ac6e5b..a1e5027ce5f1 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1363,7 +1363,15 @@ int MXGetVersion(int *out) {
 
 #if MXNET_USE_TVM_OP
 int MXLoadTVMOp(const char *libpath) {
   API_BEGIN();
-  tvm::runtime::TVMOpModule::Get()->Load(libpath);
+  tvm::runtime::TVMOpModule *libpath_module = tvm::runtime::TVMOpModule::Get();
+  libpath_module->Load(libpath);
+#if MXNET_USE_CUDA
+  std::string libpathstr(libpath);
+  std::string cubinpath = libpathstr.substr(0, libpathstr.size() - 11) + "libtvmop.cubin";
+  tvm::runtime::TVMOpModule cubin_module;
+  cubin_module.Load(cubinpath);
+  libpath_module->Import(cubin_module);
+#endif
   API_END();
 }
diff --git a/src/operator/tvmop/op_module.cc b/src/operator/tvmop/op_module.cc
index c75e5a990086..d833ae0585c2 100644
--- a/src/operator/tvmop/op_module.cc
+++ b/src/operator/tvmop/op_module.cc
@@ -46,6 +46,12 @@ void TVMOpModule::Load(const std::string &filepath) {
   *module_ptr_ = module;
 }
 
+void TVMOpModule::Import(const TVMOpModule& module) {
+  CHECK(module_ptr_ != nullptr) << "module_ptr_ is not initialized.";
+  std::lock_guard<std::mutex> lock(mutex_);
+  module_ptr_->Import(*(module.module_ptr_));
+}
+
 PackedFunc GetFunction(const std::shared_ptr<Module> &module,
                        const std::string &op_name,
                        const std::vector<mxnet::TBlob> &args) {
diff --git a/src/operator/tvmop/op_module.h b/src/operator/tvmop/op_module.h
index 269a0aa50c11..8a25b3b9951e 100644
--- a/src/operator/tvmop/op_module.h
+++ b/src/operator/tvmop/op_module.h
@@ -44,6 +44,8 @@ class TVMOpModule {
   // Load TVM operators binary
   void Load(const std::string& filepath);
 
+  void Import(const TVMOpModule& module);
+
   void Call(const std::string& func_name,
             const mxnet::OpContext& ctx,
             const std::vector<mxnet::TBlob>& args) const;
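For background on the `libtvmop.cubin` handling: when TVM builds CUDA kernels, the module returned by `tvm.build` contains only host-side code, and the generated device code lives in `imported_modules`, so saving the top-level module drops the cubin unless it is written out separately. On the MXNet side, `MXLoadTVMOp` strips the trailing 11 characters (the length of `"libtvmop.so"`) from `libpath` to locate `libtvmop.cubin` in the same directory, loads it as a second `TVMOpModule`, and re-attaches it via `Import` so host functions can find their device kernels. A minimal sketch of the host/device split, assuming a TVM version with the `te` schedule API and a local CUDA toolchain:

```python
# Minimal sketch (assumes TVM with CUDA support and the te API): tvm.build
# returns a host module; the generated CUDA code is a separate imported
# module that must be saved on its own, as compile.py does above.
import tvm
from tvm import te

n = te.var('n')
A = te.placeholder((n,), name='A')
B = te.compute((n,), lambda i: A[i] + 1.0, name='B')
s = te.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, te.thread_axis('blockIdx.x'))
s[B].bind(tx, te.thread_axis('threadIdx.x'))

mod = tvm.build(s, [A, B], target='cuda')
mod.save('/tmp/libtvmop.o')                  # host-side object code only
if mod.imported_modules:                     # device code lives here
    mod.imported_modules[0].save('/tmp/libtvmop.cubin')
```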