diff --git a/3rdparty/mshadow/mshadow/dot_engine-inl.h b/3rdparty/mshadow/mshadow/dot_engine-inl.h index 93273154b429..1adfdf600326 100644 --- a/3rdparty/mshadow/mshadow/dot_engine-inl.h +++ b/3rdparty/mshadow/mshadow/dot_engine-inl.h @@ -314,12 +314,12 @@ struct BLASEngine<cpu, float> { #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000) // since same m/n/k is used for all single gemms, so we put all gemms into one group const int GROUP_SIZE = 1; - MKL_INT p_m[GROUP_SIZE] = {m}; - MKL_INT p_n[GROUP_SIZE] = {n}; - MKL_INT p_k[GROUP_SIZE] = {k}; - MKL_INT p_lda[GROUP_SIZE] = {lda}; - MKL_INT p_ldb[GROUP_SIZE] = {ldb}; - MKL_INT p_ldc[GROUP_SIZE] = {ldc}; + MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)}; + MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)}; + MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)}; + MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)}; + MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)}; + MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)}; float p_alpha[GROUP_SIZE] = {alpha}; float p_beta[GROUP_SIZE] = {beta}; @@ -327,7 +327,7 @@ struct BLASEngine<cpu, float> { CBLAS_TRANSPOSE cblas_a_trans = GetT(transa); CBLAS_TRANSPOSE cblas_b_trans = GetT(transb); - MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count}; + MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)}; CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans}; CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans}; @@ -423,12 +423,12 @@ struct BLASEngine<cpu, double> { #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000) // since same m/n/k is used for all single gemms, so we put all gemms into one group const int GROUP_SIZE = 1; - MKL_INT p_m[GROUP_SIZE] = {m}; - MKL_INT p_n[GROUP_SIZE] = {n}; - MKL_INT p_k[GROUP_SIZE] = {k}; - MKL_INT p_lda[GROUP_SIZE] = {lda}; - MKL_INT p_ldb[GROUP_SIZE] = {ldb}; - MKL_INT p_ldc[GROUP_SIZE] = {ldc}; + MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)}; + MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)}; + MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)}; + MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)}; + MKL_INT 
p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)}; + MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)}; double p_alpha[GROUP_SIZE] = {alpha}; double p_beta[GROUP_SIZE] = {beta}; @@ -436,7 +436,7 @@ struct BLASEngine<cpu, double> { CBLAS_TRANSPOSE cblas_a_trans = GetT(transa); CBLAS_TRANSPOSE cblas_b_trans = GetT(transb); - MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count}; + MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)}; CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans}; CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans}; diff --git a/CMakeLists.txt b/CMakeLists.txt index 30839b45c339..2413c5679e95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,7 +88,6 @@ option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON) option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF) option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) cmake_dependent_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF "NOT MSVC" OFF) -option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) option(BUILD_EXTENSION_PATH "Path to extension to build" "") option(BUILD_CYTHON_MODULES "Build cython modules." 
OFF) option(LOG_FATAL_THROW "Log exceptions but do not abort" ON) @@ -306,6 +305,8 @@ endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) +cmake_dependent_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" ON "CMAKE_SIZEOF_VOID_P EQUAL 8" OFF) + include(cmake/ChooseBlas.cmake) if(USE_ASAN) @@ -984,3 +985,4 @@ if(BUILD_CYTHON_MODULES) message(FATAL_ERROR "No python interpreter found to build cython modules") endif() endif() + diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index f5ee9ba6f0ab..08f10dbca90d 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -268,6 +268,7 @@ build_centos7_cpu() { -DUSE_DIST_KVSTORE=ON \ -DUSE_CUDA=OFF \ -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \ + -DUSE_INT64_TENSOR_SIZE=OFF \ -G Ninja /work/mxnet ninja } @@ -282,6 +283,7 @@ build_centos7_mkldnn() { -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLDNN=ON \ -DUSE_CUDA=OFF \ + -DUSE_INT64_TENSOR_SIZE=OFF \ -G Ninja /work/mxnet ninja } @@ -298,8 +300,9 @@ build_centos7_gpu() { -DUSE_MKLDNN=ON \ -DUSE_CUDA=ON \ -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ - -DUSE_DIST_KVSTORE=ON\ + -DUSE_DIST_KVSTORE=ON \ -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \ + -DUSE_INT64_TENSOR_SIZE=OFF \ -G Ninja /work/mxnet ninja } @@ -694,7 +697,6 @@ build_ubuntu_cpu_large_tensor() { -DUSE_CUDA=OFF \ -DUSE_CUDNN=OFF \ -DUSE_MKLDNN=ON \ - -DUSE_INT64_TENSOR_SIZE=ON \ -G Ninja \ /work/mxnet @@ -714,7 +716,6 @@ build_ubuntu_gpu_large_tensor() { -DUSE_DIST_KVSTORE=ON \ -DCMAKE_BUILD_TYPE=Release \ -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ - -DUSE_INT64_TENSOR_SIZE=ON \ -G Ninja \ /work/mxnet diff --git a/config/darwin.cmake b/config/darwin.cmake index 65e93efb7373..5a7899e018e7 100644 --- a/config/darwin.cmake +++ b/config/darwin.cmake @@ -122,7 +122,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ 
Package") # This will cause performance degradation reported in issue #14496 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 # Note: the size of each dimension is still bounded by INT32_MAX -set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") +set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) diff --git a/config/linux.cmake b/config/linux.cmake index 8881402ede8e..55c1d0810d81 100644 --- a/config/linux.cmake +++ b/config/linux.cmake @@ -121,7 +121,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package") # This will cause performance degradation reported in issue #14496 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 # Note: the size of each dimension is still bounded by INT32_MAX -set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") +set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake index 933857ce6739..cedcbac9c5f9 100644 --- a/config/linux_gpu.cmake +++ b/config/linux_gpu.cmake @@ -125,7 +125,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package") # This will cause performance degradation reported in issue #14496 # Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 
2147483647 # Note: the size of each dimension is still bounded by INT32_MAX -set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") +set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor") # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) diff --git a/src/operator/contrib/transformer.cc b/src/operator/contrib/transformer.cc index 43c322e9ca21..e85e1d22b6b0 100644 --- a/src/operator/contrib/transformer.cc +++ b/src/operator/contrib/transformer.cc @@ -140,12 +140,12 @@ void strided_batch_sgemm(bool transA, bool transB, #if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000) const int GROUP_SIZE = 1; - MKL_INT p_m[GROUP_SIZE] = {m}; - MKL_INT p_n[GROUP_SIZE] = {n}; - MKL_INT p_k[GROUP_SIZE] = {k}; - MKL_INT p_lda[GROUP_SIZE] = {lda}; - MKL_INT p_ldb[GROUP_SIZE] = {ldb}; - MKL_INT p_ldc[GROUP_SIZE] = {ldc}; + MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)}; + MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)}; + MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)}; + MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)}; + MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)}; + MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)}; float p_alpha[GROUP_SIZE] = {alpha}; float p_beta[GROUP_SIZE] = {beta}; @@ -153,7 +153,7 @@ void strided_batch_sgemm(bool transA, bool transB, CBLAS_TRANSPOSE cblas_a_trans = transA ? CblasTrans : CblasNoTrans; CBLAS_TRANSPOSE cblas_b_trans = transB ? 
CblasTrans : CblasNoTrans; - MKL_INT p_group_sizeb[GROUP_SIZE] = {batchCount}; + MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batchCount)}; CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans}; CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans}; diff --git a/src/operator/numpy/np_insert_op_slice-inl.h b/src/operator/numpy/np_insert_op_slice-inl.h index 8fd8e0bfc9e5..fd32d9bc7bc1 100644 --- a/src/operator/numpy/np_insert_op_slice-inl.h +++ b/src/operator/numpy/np_insert_op_slice-inl.h @@ -154,7 +154,7 @@ void NumpyInsertSliceCompute(const nnvm::NodeAttrs& attrs, CHECK((values.shape_[i] == 1) || (values.shape_[i] == sz)); } size_t temp_storage_bytes, temp_mem_size; - temp_storage_bytes = SortByKeyWorkspaceSize<int64_t, int64_t, xpu>(indices_len, false, true); + temp_storage_bytes = SortByKeyWorkspaceSize<int64_t, index_t, xpu>(indices_len, false, true); temp_mem_size = indices_len * sizeof(int64_t) * 2 + indices_len * sizeof(index_t) + outshape[axis] * sizeof(index_t) * 2 + diff --git a/src/operator/numpy/np_polynomial_op.cc b/src/operator/numpy/np_polynomial_op.cc index 3fc94395946a..4018964263dd 100644 --- a/src/operator/numpy/np_polynomial_op.cc +++ b/src/operator/numpy/np_polynomial_op.cc @@ -53,7 +53,8 @@ struct polyval_backward_p { DType igrad_p = 0; index_t j = x_size - 1; while (j >= 0) { - igrad_p += pow(x_dptr[j], p_size - i - 1) * ograd_dptr[j]; + igrad_p += pow(x_dptr[j], static_cast<DType>(p_size) - + static_cast<DType>(i + 1)) * ograd_dptr[j]; j--; } KERNEL_ASSIGN(igrad_p_dptr[i], req, igrad_p);