[v1.x] CI fixes to make more stable and upgradable (apache#19895)
* Test moving pipelines from p3 to g4.

* Remove the fallback codecov command - the existing (first) command works, and the fallback always fails a few times before finally succeeding (it also doesn't support the -P parameter, which causes an error).

* Stop using the docker Python client, since it still doesn't support the latest nvidia 'gpus' attribute. Switch to subprocess calls with list arguments to avoid shell injection (a minimal sketch of this pattern follows the changed-files summary below).

See docker/docker-py#2395

* Remove old files.

* Fix comment

* Set default environment variables

* Fix GPU syntax.

* Use subprocess.run and redirect output to stdout; don't run docker in interactive mode.

* Check if codecov works without providing parameters now.

* Send docker stderr to sys.stderr

* Support both nvidia-docker configurations: first try '--gpus all', and if that fails, fall back to '--runtime nvidia' (a standalone sketch of this fallback follows the ci/build.py diff below).

Co-authored-by: Joe Evans <[email protected]>
josephevans and Joe Evans committed Feb 24, 2021
1 parent e35eb2e commit fe354b5
Showing 5 changed files with 34 additions and 752 deletions.
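
A note on the subprocess pattern described in the commit message: list arguments, non-interactive mode, and output redirection are all visible in the ci/build.py diff further down. The snippet below is only a minimal, standalone sketch of that pattern, assuming a hypothetical image name and command; it is not code from this commit.

import subprocess
import sys

def run_in_docker(image, command):
    # Build the argument vector as a list; subprocess.run() with a list never
    # invokes a shell, so caller-supplied strings cannot be used for shell injection.
    cmd = ['docker', 'run', '--rm', image] + list(command)
    # Stream container output straight through to this process instead of
    # capturing it; check=True raises CalledProcessError on a non-zero exit.
    subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, check=True)

if __name__ == '__main__':
    # Hypothetical usage: run a throwaway command in a public base image.
    run_in_docker('ubuntu:20.04', ['echo', 'hello from the container'])
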
15 changes: 1 addition & 14 deletions ci/Jenkinsfile_utils.groovy
@@ -112,20 +112,7 @@ def get_git_commit_hash() {
 }
 
 def publish_test_coverage() {
-  // CodeCovs auto detection has trouble with our CIs PR validation due the merging strategy
-  git_commit_hash = get_git_commit_hash()
-
-  if (env.CHANGE_ID) {
-    // PR execution
-    codecovArgs = "-B ${env.CHANGE_TARGET} -C ${git_commit_hash} -P ${env.CHANGE_ID}"
-  } else {
-    // Branch execution
-    codecovArgs = "-B ${env.BRANCH_NAME} -C ${git_commit_hash}"
-  }
-
-  // To make sure we never fail because test coverage reporting is not available
-  // Fall back to our own copy of the bash helper if it failed to download the public version
-  sh "(curl --retry 10 -s https://codecov.io/bash | bash -s - ${codecovArgs}) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s - ${codecovArgs}) || true"
+  sh "curl -s https://codecov.io/bash | bash"
 }
 
 def collect_test_results_unix(original_file_name, new_file_name) {
85 changes: 30 additions & 55 deletions ci/build.py
@@ -38,7 +38,6 @@
 from subprocess import check_call, check_output
 from typing import *
 
-from safe_docker_run import SafeDockerClient
 from util import *
 
 
@@ -187,8 +186,7 @@ def default_ccache_dir() -> str:
     return os.path.join(os.path.expanduser("~"), ".ccache")
 
 
-def container_run(docker_client: SafeDockerClient,
-                  platform: str,
+def container_run(platform: str,
                   nvidia_runtime: bool,
                   docker_registry: str,
                   shared_memory_size: str,
@@ -197,17 +195,12 @@ def container_run(docker_client: SafeDockerClient,
                   environment: Dict[str, str],
                   dry_run: bool = False) -> int:
     """Run command in a container"""
-    container_wait_s = 600
-    #
-    # Environment setup
-    #
+    # set default environment variables
     environment.update({
         'CCACHE_MAXSIZE': '500G',
         'CCACHE_TEMPDIR': '/tmp/ccache', # temp dir should be local and not shared
-        'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is
-                                      # mounted
-        'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache
-                                             # verification.
+        'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is mounted
+        'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache verification.
     })
     environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ})
 
@@ -219,13 +212,9 @@ def container_run(docker_client: SafeDockerClient,
     os.makedirs(local_ccache_dir, exist_ok=True)
     logging.info("Using ccache directory: %s", local_ccache_dir)
 
-    # Equivalent command
-    docker_cmd_list = [
-        "docker",
-        'run',
-        "--gpus all" if nvidia_runtime else "",
-        "--cap-add",
-        "SYS_PTRACE", # Required by ASAN
+    # Build docker command
+    docker_arg_list = [
+        "--cap-add", "SYS_PTRACE", # Required by ASAN
         '--rm',
         '--shm-size={}'.format(shared_memory_size),
         # mount mxnet root
@@ -241,40 +230,27 @@ def container_run(docker_client: SafeDockerClient,
         '-e', "CCACHE_DIR={}".format(environment['CCACHE_DIR']),
         # a container-scoped log, useful for ccache verification.
         '-e', "CCACHE_LOGFILE={}".format(environment['CCACHE_LOGFILE']),
-        '-ti',
-        tag]
-    docker_cmd_list.extend(command)
-    docker_cmd = ' \\\n\t'.join(docker_cmd_list)
-    logging.info("Running %s in container %s", command, tag)
-    logging.info("Executing the equivalent of:\n%s\n", docker_cmd)
+    ]
+    docker_arg_list += [tag]
+    docker_arg_list.extend(command)
+
+    def docker_run_cmd(cmd):
+        logging.info("Running %s in container %s", command, tag)
+        logging.info("Executing command:\n%s\n", ' \\\n\t'.join(cmd))
+        subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, check=True)
+
     if not dry_run:
-        #############################
-        #
-        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM})
-        # noinspection PyShadowingNames
-        runtime = None
-        if nvidia_runtime:
-            # noinspection PyShadowingNames
-            # runc is default (docker info | grep -i runtime)
-            runtime = 'nvidia'
-
-        return docker_client.run(
-            tag,
-            runtime=runtime,
-            command=command,
-            shm_size=shared_memory_size,
-            user='{}:{}'.format(os.getuid(), os.getgid()),
-            cap_add='SYS_PTRACE',
-            volumes={
-                mx_root:
-                    {'bind': '/work/mxnet', 'mode': 'rw'},
-                local_build_folder:
-                    {'bind': '/work/build', 'mode': 'rw'},
-                local_ccache_dir:
-                    {'bind': '/work/ccache', 'mode': 'rw'},
-            },
-            environment=environment)
+        if not nvidia_runtime:
+            docker_run_cmd(['docker', 'run'] + docker_arg_list)
+        else:
+            try:
+                docker_run_cmd(['docker', 'run', '--gpus', 'all'] + docker_arg_list)
+            except subprocess.CalledProcessError as e:
+                if e.returncode == 125:
+                    docker_run_cmd(['docker', 'run', '--runtime', 'nvidia'] + docker_arg_list)
+                else:
+                    raise
 
     return 0
 
 
@@ -378,7 +354,6 @@ def main() -> int:
     args = parser.parse_args()
 
     command = list(chain(*args.command))
-    docker_client = SafeDockerClient()
 
     environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
                         for e in args.environment])
@@ -405,21 +380,21 @@ def main() -> int:
     ret = 0
     if command:
         ret = container_run(
-            docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
+            platform=platform, nvidia_runtime=args.nvidiadocker,
             shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
             local_ccache_dir=args.ccache_dir, environment=environment)
     elif args.print_docker_run:
         command = []
         ret = container_run(
-            docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
+            platform=platform, nvidia_runtime=args.nvidiadocker,
             shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
             local_ccache_dir=args.ccache_dir, dry_run=True, environment=environment)
     else:
         # With no commands, execute a build function for the target platform
         command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
         logging.info("No command specified, trying default build: %s", ' '.join(command))
         ret = container_run(
-            docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
+            platform=platform, nvidia_runtime=args.nvidiadocker,
             shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
             local_ccache_dir=args.ccache_dir, environment=environment)
 
@@ -449,7 +424,7 @@ def main() -> int:
                 continue
             command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
             container_run(
-                docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
+                platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
                 local_ccache_dir=args.ccache_dir, environment=environment)
             shutil.move(buildir(), plat_buildir)
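
The container_run() changes above implement the fallback described in the commit message: try the newer '--gpus all' flag first and, if the Docker CLI rejects it with exit code 125, retry with the older '--runtime nvidia' option. Below is a standalone sketch of the same idea under stated assumptions (a host with Docker and the NVIDIA container runtime installed; the CUDA image and nvidia-smi command are placeholders), not the commit's exact code.

import subprocess
import sys

def docker_run_gpu(image, command):
    # Shared tail of the argument vector; built as a list so no shell is involved.
    args = ['--rm', image] + list(command)
    try:
        # Newer Docker versions expose GPUs via the '--gpus' device request.
        subprocess.run(['docker', 'run', '--gpus', 'all'] + args,
                       stdout=sys.stdout, stderr=sys.stderr, check=True)
    except subprocess.CalledProcessError as e:
        # Docker exits with status 125 when the daemon/CLI rejects the run
        # options (e.g. '--gpus' unsupported); only then fall back.
        if e.returncode != 125:
            raise
        subprocess.run(['docker', 'run', '--runtime', 'nvidia'] + args,
                       stdout=sys.stdout, stderr=sys.stderr, check=True)

if __name__ == '__main__':
    # Hypothetical usage on a GPU host.
    docker_run_gpu('nvidia/cuda:11.0-base', ['nvidia-smi'])
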
6 changes: 3 additions & 3 deletions ci/jenkins/Jenkins_steps.groovy
@@ -769,7 +769,7 @@ def test_unix_python3_gpu_cu110() {
 
 def test_unix_python3_quantize_gpu() {
   return ['Python3: Quantize GPU': {
-    node(NODE_LINUX_GPU_P3) {
+    node(NODE_LINUX_GPU_G4) {
       ws('workspace/ut-python3-quantize-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           try {
@@ -787,7 +787,7 @@ def test_unix_python3_quantize_gpu() {
 
 def test_unix_python3_quantize_gpu_cu110() {
   return ['Python3+CUDA11.0: Quantize GPU': {
-    node(NODE_LINUX_GPU_P3) {
+    node(NODE_LINUX_GPU_G4) {
       ws('workspace/ut-python3-quantize-gpu') {
         timeout(time: max_time, unit: 'MINUTES') {
           try {
@@ -903,7 +903,7 @@ def test_unix_python3_mkldnn_nocudnn_gpu() {
 
 def test_unix_python3_tensorrt_gpu() {
   return ['Python3: TensorRT GPU': {
-    node(NODE_LINUX_GPU_P3) {
+    node(NODE_LINUX_GPU_G4) {
       ws('workspace/build-tensorrt') {
         timeout(time: max_time, unit: 'MINUTES') {
           try {
