diff --git a/3rdparty/mshadow/mshadow/dot_engine-inl.h b/3rdparty/mshadow/mshadow/dot_engine-inl.h index 93273154b429..1adfdf600326 100644 --- a/3rdparty/mshadow/mshadow/dot_engine-inl.h +++ b/3rdparty/mshadow/mshadow/dot_engine-inl.h @@ -314,12 +314,12 @@ struct BLASEngine<cpu, float> { #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000) // since same m/n/k is used for all single gemms, so we put all gemms into one group const int GROUP_SIZE = 1; - MKL_INT p_m[GROUP_SIZE] = {m}; - MKL_INT p_n[GROUP_SIZE] = {n}; - MKL_INT p_k[GROUP_SIZE] = {k}; - MKL_INT p_lda[GROUP_SIZE] = {lda}; - MKL_INT p_ldb[GROUP_SIZE] = {ldb}; - MKL_INT p_ldc[GROUP_SIZE] = {ldc}; + MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)}; + MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)}; + MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)}; + MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)}; + MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)}; + MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)}; float p_alpha[GROUP_SIZE] = {alpha}; float p_beta[GROUP_SIZE] = {beta}; @@ -327,7 +327,7 @@ struct BLASEngine<cpu, float> { CBLAS_TRANSPOSE cblas_a_trans = GetT(transa); CBLAS_TRANSPOSE cblas_b_trans = GetT(transb); - MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count}; + MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)}; CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans}; CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans}; @@ -423,12 +423,12 @@ struct BLASEngine<cpu, double> { #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000) // since same m/n/k is used for all single gemms, so we put all gemms into one group const int GROUP_SIZE = 1; - MKL_INT p_m[GROUP_SIZE] = {m}; - MKL_INT p_n[GROUP_SIZE] = {n}; - MKL_INT p_k[GROUP_SIZE] = {k}; - MKL_INT p_lda[GROUP_SIZE] = {lda}; - MKL_INT p_ldb[GROUP_SIZE] = {ldb}; - MKL_INT p_ldc[GROUP_SIZE] = {ldc}; + MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)}; + MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)}; + MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)}; + MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)}; + MKL_INT 
p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)}; + MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)}; double p_alpha[GROUP_SIZE] = {alpha}; double p_beta[GROUP_SIZE] = {beta}; @@ -436,7 +436,7 @@ struct BLASEngine<cpu, double> { CBLAS_TRANSPOSE cblas_a_trans = GetT(transa); CBLAS_TRANSPOSE cblas_b_trans = GetT(transb); - MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count}; + MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)}; CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans}; CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans}; diff --git a/CMakeLists.txt b/CMakeLists.txt index 30839b45c339..2413c5679e95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,7 +88,6 @@ option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON) option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF) option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) cmake_dependent_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF "NOT MSVC" OFF) -option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) option(BUILD_EXTENSION_PATH "Path to extension to build" "") option(BUILD_CYTHON_MODULES "Build cython modules." 
OFF) option(LOG_FATAL_THROW "Log exceptions but do not abort" ON) @@ -306,6 +305,8 @@ endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) +cmake_dependent_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" ON "CMAKE_SIZEOF_VOID_P EQUAL 8" OFF) + include(cmake/ChooseBlas.cmake) if(USE_ASAN) @@ -984,3 +985,4 @@ if(BUILD_CYTHON_MODULES) message(FATAL_ERROR "No python interpreter found to build cython modules") endif() endif() + diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index f5ee9ba6f0ab..08f10dbca90d 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -268,6 +268,7 @@ build_centos7_cpu() { -DUSE_DIST_KVSTORE=ON \ -DUSE_CUDA=OFF \ -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \ + -DUSE_INT64_TENSOR_SIZE=OFF \ -G Ninja /work/mxnet ninja } @@ -282,6 +283,7 @@ build_centos7_mkldnn() { -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLDNN=ON \ -DUSE_CUDA=OFF \ + -DUSE_INT64_TENSOR_SIZE=OFF \ -G Ninja /work/mxnet ninja } @@ -298,8 +300,9 @@ build_centos7_gpu() { -DUSE_MKLDNN=ON \ -DUSE_CUDA=ON \ -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ - -DUSE_DIST_KVSTORE=ON\ + -DUSE_DIST_KVSTORE=ON \ -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \ + -DUSE_INT64_TENSOR_SIZE=OFF \ -G Ninja /work/mxnet ninja } @@ -694,7 +697,6 @@ build_ubuntu_cpu_large_tensor() { -DUSE_CUDA=OFF \ -DUSE_CUDNN=OFF \ -DUSE_MKLDNN=ON \ - -DUSE_INT64_TENSOR_SIZE=ON \ -G Ninja \ /work/mxnet @@ -714,7 +716,6 @@ build_ubuntu_gpu_large_tensor() { -DUSE_DIST_KVSTORE=ON \ -DCMAKE_BUILD_TYPE=Release \ -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ - -DUSE_INT64_TENSOR_SIZE=ON \ -G Ninja \ /work/mxnet diff --git a/config/darwin.cmake b/config/darwin.cmake index 65e93efb7373..5a7899e018e7 100644 --- a/config/darwin.cmake +++ b/config/darwin.cmake @@ -122,7 +122,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ 
Package") # This will cause performance degradation reported in issue #14496 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 # Note: the size of each dimension is still bounded by INT32_MAX -set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") +set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) diff --git a/config/linux.cmake b/config/linux.cmake index 8881402ede8e..55c1d0810d81 100644 --- a/config/linux.cmake +++ b/config/linux.cmake @@ -121,7 +121,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package") # This will cause performance degradation reported in issue #14496 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 # Note: the size of each dimension is still bounded by INT32_MAX -set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") +set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake index 933857ce6739..cedcbac9c5f9 100644 --- a/config/linux_gpu.cmake +++ b/config/linux_gpu.cmake @@ -125,7 +125,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package") # This will cause performance degradation reported in issue #14496 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 
2147483647 # Note: the size of each dimension is still bounded by INT32_MAX -set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") +set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) diff --git a/src/operator/contrib/transformer.cc b/src/operator/contrib/transformer.cc index 43c322e9ca21..e85e1d22b6b0 100644 --- a/src/operator/contrib/transformer.cc +++ b/src/operator/contrib/transformer.cc @@ -140,12 +140,12 @@ void strided_batch_sgemm(bool transA, bool transB, #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000) const int GROUP_SIZE = 1; - MKL_INT p_m[GROUP_SIZE] = {m}; - MKL_INT p_n[GROUP_SIZE] = {n}; - MKL_INT p_k[GROUP_SIZE] = {k}; - MKL_INT p_lda[GROUP_SIZE] = {lda}; - MKL_INT p_ldb[GROUP_SIZE] = {ldb}; - MKL_INT p_ldc[GROUP_SIZE] = {ldc}; + MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)}; + MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)}; + MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)}; + MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)}; + MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)}; + MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)}; float p_alpha[GROUP_SIZE] = {alpha}; float p_beta[GROUP_SIZE] = {beta}; @@ -153,7 +153,7 @@ void strided_batch_sgemm(bool transA, bool transB, CBLAS_TRANSPOSE cblas_a_trans = transA ? CblasTrans : CblasNoTrans; CBLAS_TRANSPOSE cblas_b_trans = transB ? 
CblasTrans : CblasNoTrans; - MKL_INT p_group_sizeb[GROUP_SIZE] = {batchCount}; + MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batchCount)}; CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans}; CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans}; diff --git a/src/operator/numpy/np_insert_op_slice-inl.h b/src/operator/numpy/np_insert_op_slice-inl.h index 8fd8e0bfc9e5..fd32d9bc7bc1 100644 --- a/src/operator/numpy/np_insert_op_slice-inl.h +++ b/src/operator/numpy/np_insert_op_slice-inl.h @@ -154,7 +154,7 @@ void NumpyInsertSliceCompute(const nnvm::NodeAttrs& attrs, CHECK((values.shape_[i] == 1) || (values.shape_[i] == sz)); } size_t temp_storage_bytes, temp_mem_size; - temp_storage_bytes = SortByKeyWorkspaceSize<int64_t, int64_t, xpu>(indices_len, false, true); + temp_storage_bytes = SortByKeyWorkspaceSize<int64_t, index_t, xpu>(indices_len, false, true); temp_mem_size = indices_len * sizeof(int64_t) * 2 + indices_len * sizeof(index_t) + outshape[axis] * sizeof(index_t) * 2 + diff --git a/src/operator/numpy/np_polynomial_op.cc b/src/operator/numpy/np_polynomial_op.cc index 3fc94395946a..4018964263dd 100644 --- a/src/operator/numpy/np_polynomial_op.cc +++ b/src/operator/numpy/np_polynomial_op.cc @@ -53,7 +53,8 @@ struct polyval_backward_p { DType igrad_p = 0; index_t j = x_size - 1; while (j >= 0) { - igrad_p += pow(x_dptr[j], p_size - i - 1) * ograd_dptr[j]; + igrad_p += pow(x_dptr[j], static_cast<DType>(p_size) - + static_cast<DType>(i + 1)) * ograd_dptr[j]; j--; } KERNEL_ASSIGN(igrad_p_dptr[i], req, igrad_p);