[BACKPORT] Enable CUDA 11.0 on nightly + CUDA 11.2 on pip (apache#19295) (apache#19764) (apache#19930)

* Enable CUDA 11.0 on nightly development builds (apache#19295)
  Remove CUDA 9.2 and CUDA 10.0
* [PIP] add build variant for cuda 11.2 (apache#19764)
* adding ci docker files for cu111 and cu112
* removing previous CUDA make versions and adding support for cuda11.2

Co-authored-by: waytrue17 <[email protected]>
Co-authored-by: Sheng Zha <[email protected]>
Co-authored-by: Rohit Kumar Srivastava <[email protected]>
access2rohit · Mar 12, 2021 · 408471a · 408471a
1 parent eccba71
commit 408471a
Show file tree

Hide file tree

Showing 29 changed files with 130 additions and 863 deletions.
diff --git a/cd/Jenkinsfile_cd_pipeline b/cd/Jenkinsfile_cd_pipeline
@@ -36,7 +36,7 @@ pipeline {
 
   parameters {
     // Release parameters
-    string(defaultValue: "cpu,native,cu92,cu100,cu101,cu102,cu110", description: "Comma separated list of variants", name: "MXNET_VARIANTS")
+    string(defaultValue: "cpu,native,cu100,cu101,cu102,cu110,cu112", description: "Comma separated list of variants", name: "MXNET_VARIANTS")
     booleanParam(defaultValue: false, description: 'Whether this is a release build or not', name: "RELEASE_BUILD")
   }
 

diff --git a/cd/Jenkinsfile_release_job b/cd/Jenkinsfile_release_job
@@ -42,8 +42,8 @@ pipeline {
     // Using string instead of choice parameter to keep the changes to the parameters minimal to avoid
     // any disruption caused by different COMMIT_ID values changing the job parameter configuration on
     // Jenkins.
-    string(defaultValue: "mxnet_lib", description: "Pipeline to build", name: "RELEASE_JOB_TYPE")
-    string(defaultValue: "cpu,native,cu100,cu101,cu102,cu110", description: "Comma separated list of variants", name: "MXNET_VARIANTS")
+    string(defaultValue: "mxnet_lib/static", description: "Pipeline to build", name: "RELEASE_JOB_TYPE")
+    string(defaultValue: "cpu,native,cu100,cu101,cu102,cu110,cu112", description: "Comma separated list of variants", name: "MXNET_VARIANTS")
     booleanParam(defaultValue: false, description: 'Whether this is a release build or not', name: "RELEASE_BUILD")
     string(defaultValue: "nightly_v1.x", description: "String used for naming docker images", name: "VERSION")
   }

diff --git a/cd/README.md b/cd/README.md
@@ -25,7 +25,7 @@ MXNet aims to support a variety of frontends, e.g. Python, Java, Perl, R, etc. a
 
 The CD process is driven by the [CD pipeline job](Jenkinsfile_cd_pipeline), which orchestrates the order in which the artifacts are delivered. For instance, first publish the libmxnet library before publishing the pip package. It does this by triggering the [release job](Jenkinsfile_release_job) with a specific set of parameters for each delivery channel. The release job executes the specific release pipeline for a delivery channel across all MXNet *variants*.
 
-A variant is a specific environment or features for which MXNet is compiled. For instance CPU, GPU with CUDA v10.0, CUDA v9.0 with MKL-DNN support, etc. 
+A variant is a specific environment or features for which MXNet is compiled. For instance CPU, GPU with CUDA v10.0, CUDA v11.0 with MKL-DNN support, etc.
 
 Currently, below variants are supported. All of these variants except native have MKL-DNN backend enabled.
 
@@ -36,6 +36,7 @@ Currently, below variants are supported. All of these variants except native hav
 * *cu101*: CUDA 10.1
 * *cu102*: CUDA 10.2
 * *cu110*: CUDA 11.0
+* *cu112*: CUDA 11.2
 
 *For more on variants, see [here](https://github.com/apache/incubator-mxnet/issues/8671)*
 
@@ -121,7 +122,7 @@ The "first mile" of the CD process is posting the mxnet binaries to the [artifac
 
 ##### Timeout
 
-We shouldn't set global timeouts for the pipelines. Rather, the `step` being executed should be rapped with a `timeout` function (as in the pipeline example above). The `max_time` is a global variable set at the [release job](Jenkinsfile_release_job) level. 
+We shouldn't set global timeouts for the pipelines. Rather, the `step` being executed should be wrapped with a `timeout` function (as in the pipeline example above). The `max_time` is a global variable set at the [release job](Jenkinsfile_release_job) level.
 
 ##### Node of execution
 

diff --git a/cd/python/pypi/pypi_package.sh b/cd/python/pypi/pypi_package.sh
@@ -18,7 +18,7 @@
 
 set -ex
 
-# variant = cpu, native, cu80, cu100, etc.
+# variant = cpu, native, cu100, cu101, cu102, cu110, cu112 etc.
 export mxnet_variant=${1:?"Please specify the mxnet variant"}
 
 # Due to this PR: https://github.com/apache/incubator-mxnet/pull/14899

diff --git a/cd/utils/artifact_repository.md b/cd/utils/artifact_repository.md
@@ -17,7 +17,7 @@
 
 # Artifact Repository - Pushing and Pulling libmxnet
 
-The artifact repository is an S3 bucket accessible only to restricted Jenkins nodes. It is used to store compiled MXNet artifacts that can be used by downstream CD pipelines to package the compiled libraries for different delivery channels (e.g. DockerHub, PyPI, Maven, etc.). The S3 object keys for the files being posted will be prefixed with the following distinguishing characteristics of the binary: branch, commit id, operating system, variant and dependency linking strategy (static or dynamic). For instance, s3://bucket/73b29fa90d3eac0b1fae403b7583fdd1529942dc/ubuntu16.04/cu92mkl/static/libmxnet.so
+The artifact repository is an S3 bucket accessible only to restricted Jenkins nodes. It is used to store compiled MXNet artifacts that can be used by downstream CD pipelines to package the compiled libraries for different delivery channels (e.g. DockerHub, PyPI, Maven, etc.). The S3 object keys for the files being posted will be prefixed with the following distinguishing characteristics of the binary: branch, commit id, operating system, variant and dependency linking strategy (static or dynamic). For instance, s3://bucket/73b29fa90d3eac0b1fae403b7583fdd1529942dc/ubuntu16.04/cu100/static/libmxnet.so
 
 An MXNet artifact is defined as the following set of files:
 
@@ -53,13 +53,13 @@ If not set, derived through the value of sys.platform (https://docs.python.org/3
 
 **Variant**
 
-Manually configured through the --variant argument. The current variants are: cpu, native, cu92, cu100, cu101, cu102 and cu110.
+Manually configured through the --variant argument. The current variants are: cpu, native, cu100, cu101, cu102, cu110 and cu112.
 
 As long as the tool is being run from the MXNet code base, the runtime feature detection tool (https://github.com/larroy/mxnet/blob/dd432b7f241c9da2c96bcb877c2dc84e6a1f74d4/docs/api/python/libinfo/libinfo.md) can be used to detect whether the library has been compiled with MKL (library has MKL-DNN feature enabled) and/or CUDA support (compiled with CUDA feature enabled).
 
-If it has been compiled with CUDA support, the output of /usr/local/cuda/bin/nvcc --version can be mined for the exact CUDA version (eg. 8.0, 9.0, etc.).
+If it has been compiled with CUDA support, the output of /usr/local/cuda/bin/nvcc --version can be mined for the exact CUDA version (eg. 10.0, 11.0, etc.).
 
-By knowing which features are enabled on the binary, and if necessary, which CUDA version is installed on the machine, the value for the variant argument can be calculated. Eg. if CUDA features are enabled, and nvcc reports cuda version 10, then the variant would be cu100. If neither MKL-DNN nor CUDA features are enabled, the variant would be native. 
+By knowing which features are enabled on the binary, and if necessary, which CUDA version is installed on the machine, the value for the variant argument can be calculated. Eg. if CUDA features are enabled, and nvcc reports cuda version 10.0, then the variant would be cu100. If neither MKL-DNN nor CUDA features are enabled, the variant would be native. 
 
 **Dependency Linking**
 

diff --git a/cd/utils/mxnet_base_image.sh b/cd/utils/mxnet_base_image.sh
@@ -33,6 +33,9 @@ case ${mxnet_variant} in
     cu110*)
     echo "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04"
     ;;
+    cu112*)
+    echo "nvidia/cuda:11.2.1-cudnn8-runtime-ubuntu18.04"
+    ;;
     cpu)
     echo "ubuntu:18.04"
     ;;

diff --git a/cd/utils/test_artifact_repository.py b/cd/utils/test_artifact_repository.py
@@ -144,9 +144,9 @@ def test_get_cuda_version(self, mock):
         cuda_version = get_cuda_version()
         self.assertEqual(cuda_version, '100')
 
-        mock.return_value = b'Cuda compilation tools, release 9.2, V9.2.148'
+        mock.return_value = b'Cuda compilation tools, release 11.0, V11.0.148'
         cuda_version = get_cuda_version()
-        self.assertEqual(cuda_version, '92')
+        self.assertEqual(cuda_version, '110')
 
     @patch('artifact_repository.check_output')
     def test_get_cuda_version_not_found(self, mock):

diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu112 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu112
@@ -0,0 +1,44 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to run MXNet on Ubuntu 16.04 for GPU
+
+FROM nvidia/cuda:11.2-cudnn8-devel-ubuntu16.04
+
+WORKDIR /work/deps
+
+COPY install/ubuntu_core.sh /work/
+RUN /work/ubuntu_core.sh
+
+COPY install/deb_ubuntu_ccache.sh /work/
+RUN /work/deb_ubuntu_ccache.sh
+
+COPY install/ubuntu_python.sh /work/
+COPY install/requirements /work/
+RUN /work/ubuntu_python.sh
+
+# Always last
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
+COPY runtime_functions.sh /work/
+
+WORKDIR /work/mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/compat
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
@@ -224,7 +224,7 @@ build_ubuntu_gpu_mkldnn_release() {
 
 # Compiles the dynamic mxnet library
 # Parameters:
-# $1 -> mxnet_variant: the mxnet variant to build, e.g. cpu, cu100, cu92mkl, etc.
+# $1 -> mxnet_variant: the mxnet variant to build, e.g. cpu, native, cu100, cu101, cu102, cu110, cu112, etc.
 build_dynamic_libmxnet() {
     set -ex
 

diff --git a/config/distribution/linux_cu92.cmake → config/distribution/linux_cu112.cmake b/config/distribution/linux_cu92.cmake → config/distribution/linux_cu112.cmake
@@ -30,5 +30,5 @@ set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.")
 set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support")
 set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support")
 
-set(CUDACXX "/usr/local/cuda-9.2/bin/nvcc" CACHE STRING "Cuda compiler")
-set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.2" CACHE STRING "Cuda architectures")
+set(CUDACXX "/usr/local/cuda-11.2/bin/nvcc" CACHE STRING "Cuda compiler")
+set(MXNET_CUDA_ARCH "5.0;6.0;7.0;8.0;8.6" CACHE STRING "Cuda architectures")
diff --git a/config/distribution/linux_cu75.cmake b/config/distribution/linux_cu75.cmake
diff --git a/config/distribution/linux_cu80.cmake b/config/distribution/linux_cu80.cmake
diff --git a/config/distribution/linux_cu90.cmake b/config/distribution/linux_cu90.cmake
diff --git a/config/distribution/linux_cu91.cmake b/config/distribution/linux_cu91.cmake
diff --git a/make/staticbuild/linux_cu92.mk → make/staticbuild/linux_cu112.mk b/make/staticbuild/linux_cu92.mk → make/staticbuild/linux_cu112.mk
@@ -66,7 +66,7 @@ USE_CUDA = 1
 # add the path to CUDA library to link and compile flag
 # if you have already added them to the environment variables, leave it as NONE
 # USE_CUDA_PATH = /usr/local/cuda
-USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.2
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-11.2
 
 # whether to use CuDNN library
 USE_CUDNN = 1
@@ -170,3 +170,4 @@ EXTRA_OPERATORS =
 # [email protected]:dato-code/SFrame.git
 # SFRAME_PATH = $(HOME)/SFrame
 # MXNET_PLUGINS += plugin/sframe/plugin.mk
+