Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
CI Infra updates (#19903)
Browse files Browse the repository at this point in the history
This PR makes a number of changes to make it more stable:

    Remove SafeDocker client which uses python docker package to run containers. Change to use "docker run" command directly using subprocess.call(), because the python-docker client does not support a gpus parameter which newer docker versions use and we don't get timeout issues when using the docker command directly. This will allow us to update our AMIs to use newer docker versions.
    In order to support both docker variants simultaneously, we first try to use the --gpus all parameter to docker run, if it fails with error code 125 (which means docker run command itself failed,) then we retry using the old --runtime nvidia parameter.
    Remove the extra custom codecov calls (which usually fail and have to retry multiple times, even though the initial codecov command works.)
  • Loading branch information
josephevans authored and leezu committed Mar 30, 2021
1 parent 0f4be44 commit 9ed0582
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 740 deletions.
15 changes: 1 addition & 14 deletions ci/Jenkinsfile_utils.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -112,20 +112,7 @@ def get_git_commit_hash() {
}

def publish_test_coverage() {
// CodeCovs auto detection has trouble with our CIs PR validation due the merging strategy
git_commit_hash = get_git_commit_hash()

if (env.CHANGE_ID) {
// PR execution
codecovArgs = "-B ${env.CHANGE_TARGET} -C ${git_commit_hash} -P ${env.CHANGE_ID}"
} else {
// Branch execution
codecovArgs = "-B ${env.BRANCH_NAME} -C ${git_commit_hash}"
}

// To make sure we never fail because test coverage reporting is not available
// Fall back to our own copy of the bash helper if it failed to download the public version
sh "(curl --retry 10 -s https://codecov.io/bash | bash -s - ${codecovArgs}) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s - ${codecovArgs}) || true"
sh "curl -s https://codecov.io/bash | bash"
}

def collect_test_results_unix(original_file_name, new_file_name) {
Expand Down
85 changes: 29 additions & 56 deletions ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@

import yaml

from safe_docker_run import SafeDockerClient
from util import *


Expand Down Expand Up @@ -104,8 +103,7 @@ def default_ccache_dir() -> str:
return os.path.join(os.path.expanduser("~"), ".ccache")


def container_run(docker_client: SafeDockerClient,
platform: str,
def container_run(platform: str,
nvidia_runtime: bool,
docker_registry: str,
shared_memory_size: str,
Expand All @@ -114,17 +112,12 @@ def container_run(docker_client: SafeDockerClient,
environment: Dict[str, str],
dry_run: bool = False) -> int:
"""Run command in a container"""
container_wait_s = 600
#
# Environment setup
#
# set default environment variables
environment.update({
'CCACHE_MAXSIZE': '500G',
'CCACHE_TEMPDIR': '/tmp/ccache', # temp dir should be local and not shared
'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is
# mounted
'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache
# verification.
'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is mounted
'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache verification.
})
environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ})

Expand All @@ -136,13 +129,9 @@ def container_run(docker_client: SafeDockerClient,
os.makedirs(local_ccache_dir, exist_ok=True)
logging.info("Using ccache directory: %s", local_ccache_dir)

# Equivalent command
docker_cmd_list = [
"docker",
'run',
"--gpus all" if nvidia_runtime else "",
"--cap-add",
"SYS_PTRACE", # Required by ASAN
# Build docker command
docker_arg_list = [
"--cap-add", "SYS_PTRACE", # Required by ASAN
'--rm',
'--shm-size={}'.format(shared_memory_size),
# mount mxnet root
Expand All @@ -158,40 +147,27 @@ def container_run(docker_client: SafeDockerClient,
'-e', "CCACHE_DIR={}".format(environment['CCACHE_DIR']),
# a container-scoped log, useful for ccache verification.
'-e', "CCACHE_LOGFILE={}".format(environment['CCACHE_LOGFILE']),
'-ti',
tag]
docker_cmd_list.extend(command)
docker_cmd = ' \\\n\t'.join(docker_cmd_list)
logging.info("Running %s in container %s", command, tag)
logging.info("Executing the equivalent of:\n%s\n", docker_cmd)
]
docker_arg_list += [tag]
docker_arg_list.extend(command)

def docker_run_cmd(cmd):
logging.info("Running %s in container %s", command, tag)
logging.info("Executing command:\n%s\n", ' \\\n\t'.join(cmd))
subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, check=True)

if not dry_run:
#############################
#
signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM})
# noinspection PyShadowingNames
runtime = None
if nvidia_runtime:
# noinspection PyShadowingNames
# runc is default (docker info | grep -i runtime)
runtime = 'nvidia'

return docker_client.run(
tag,
runtime=runtime,
command=command,
shm_size=shared_memory_size,
user='{}:{}'.format(os.getuid(), os.getgid()),
cap_add='SYS_PTRACE',
volumes={
mx_root:
{'bind': '/work/mxnet', 'mode': 'rw'},
local_build_folder:
{'bind': '/work/build', 'mode': 'rw'},
local_ccache_dir:
{'bind': '/work/ccache', 'mode': 'rw'},
},
environment=environment)
if not nvidia_runtime:
docker_run_cmd(['docker', 'run'] + docker_arg_list)
else:
try:
docker_run_cmd(['docker', 'run', '--gpus', 'all'] + docker_arg_list)
except subprocess.CalledProcessError as e:
if e.returncode == 125:
docker_run_cmd(['docker', 'run', '--runtime', 'nvidia'] + docker_arg_list)
else:
raise

return 0


Expand Down Expand Up @@ -292,8 +268,6 @@ def main() -> int:
args = parser.parse_args()

command = list(chain.from_iterable(args.command))
docker_client = SafeDockerClient()

environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
for e in args.environment])

Expand All @@ -318,29 +292,28 @@ def main() -> int:
ret = 0
if command:
ret = container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, environment=environment)
elif args.print_docker_run:
command = []
ret = container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, dry_run=True, environment=environment)
else:
# With no commands, execute a build function for the target platform
command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
logging.info("No command specified, trying default build: %s", ' '.join(command))
ret = container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, environment=environment)

if ret != 0:
logging.critical("Execution of %s failed with status: %d", command, ret)
return ret


else:
parser.print_help()
list_platforms()
Expand Down
Loading

0 comments on commit 9ed0582

Please sign in to comment.