This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[V1.8.x] Attempt to fix cd for v1.8.x #19947

Merged
merged 13 commits on Mar 2, 2021
Changes from 11 commits
14 changes: 9 additions & 5 deletions cd/python/docker/Dockerfile
@@ -23,11 +23,15 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ARG PYTHON_CMD=python3
RUN apt-get update && \
    apt-get install -y wget ${PYTHON_CMD}-dev gcc && \
    wget https://bootstrap.pypa.io/get-pip.py && \
    ${PYTHON_CMD} get-pip.py
RUN apt-get update || true
Contributor


Could you combine all these RUN commands into one? Each RUN command creates a separate Docker layer, so it's more efficient to do:

RUN apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y python3.7-dev python3.7-distutils virtualenv wget && \
    ln -sf /usr/bin/python3.7 /usr/local/bin/python3 && \
    wget -nv https://bootstrap.pypa.io/get-pip.py && \
    python3 get-pip.py

Contributor Author


Fixed!

RUN apt-get install -y software-properties-common
RUN add-apt-repository -y ppa:deadsnakes/ppa
RUN apt-get update || true
RUN apt-get install -y python3.7-dev python3.7-distutils virtualenv wget
RUN ln -sf /usr/bin/python3.7 /usr/local/bin/python3

RUN wget -nv https://bootstrap.pypa.io/get-pip.py
RUN python3 get-pip.py

ARG MXNET_COMMIT_ID
ENV MXNET_COMMIT_ID=${MXNET_COMMIT_ID}
2 changes: 1 addition & 1 deletion ci/Jenkinsfile_docker_cache
@@ -37,7 +37,7 @@ core_logic: {
ws('workspace/docker_cache') {
timeout(time: total_timeout, unit: 'MINUTES') {
utils.init_git()
sh "ci/docker_cache.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}"
sh "ci/docker_cache.py --docker-registry ${env.DOCKER_ECR_REGISTRY}"
}
}
}
17 changes: 2 additions & 15 deletions ci/Jenkinsfile_utils.groovy
@@ -112,20 +112,7 @@ def get_git_commit_hash() {
}

def publish_test_coverage() {
// CodeCovs auto detection has trouble with our CIs PR validation due the merging strategy
git_commit_hash = get_git_commit_hash()

if (env.CHANGE_ID) {
// PR execution
codecovArgs = "-B ${env.CHANGE_TARGET} -C ${git_commit_hash} -P ${env.CHANGE_ID}"
} else {
// Branch execution
codecovArgs = "-B ${env.BRANCH_NAME} -C ${git_commit_hash}"
}

// To make sure we never fail because test coverage reporting is not available
// Fall back to our own copy of the bash helper if it failed to download the public version
sh "(curl --retry 10 -s https://codecov.io/bash | bash -s - ${codecovArgs}) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s - ${codecovArgs}) || true"
sh "curl -s https://codecov.io/bash | bash"
}

def collect_test_results_unix(original_file_name, new_file_name) {
@@ -160,7 +147,7 @@ def collect_test_results_windows(original_file_name, new_file_name) {


def docker_run(platform, function_name, use_nvidia, shared_mem = '500m', env_vars = "") {
def command = "ci/build.py %ENV_VARS% --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
def command = "ci/build.py %ENV_VARS% --docker-registry ${env.DOCKER_ECR_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
command = command.replaceAll('%ENV_VARS%', env_vars.length() > 0 ? "-e ${env_vars}" : '')
command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '')
command = command.replaceAll('%PLATFORM%', platform)
122 changes: 64 additions & 58 deletions ci/build.py
@@ -27,6 +27,8 @@

import argparse
import glob
import hashlib
import os
import pprint
import re
import shutil
@@ -36,7 +38,6 @@
from subprocess import check_call, check_output
from typing import *

from safe_docker_run import SafeDockerClient
from util import *


@@ -52,13 +53,41 @@ def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:
    platforms = list(map(lambda x: os.path.split(x)[1], sorted(files)))
    return platforms

def _find_copied_files(dockerfile):
    """
    Creates a list of files copied into given dockerfile.
    """
    copied_files = []
    basedir = os.path.dirname(dockerfile)
    with open(dockerfile, "r") as f:
        for line in f.readlines():
            if line.startswith("COPY "):
                copied_files.append(os.path.join(basedir, line.split(" ")[1]))
    return copied_files

def _hash_file(ctx, filename):
    """
    Add contents of passed file into passed hash context.
    """
    bufsiz = 16384
    with open(filename,"rb") as f:
        while True:
            d = f.read(bufsiz)
            if not d:
                break
            ctx.update(d)

def get_docker_tag(platform: str, registry: str) -> str:
    """:return: docker tag to be used for the container"""
    platform = platform if any(x in platform for x in ['build.', 'publish.']) else 'build.{}'.format(platform)
    if not registry:
        registry = "mxnet_local"
    return "{0}/{1}".format(registry, platform)
    dockerfile = get_dockerfile(platform)
    sha256 = hashlib.sha256()
    _hash_file(sha256, dockerfile)
    for f in _find_copied_files(dockerfile):
        _hash_file(sha256, f)
    return "{0}:{1}-{2}".format(registry, platform, sha256.hexdigest()[:12])


def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
@@ -67,7 +96,7 @@ def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:


def build_docker(platform: str, registry: str, num_retries: int, no_cache: bool,
cache_intermediate: bool) -> str:
cache_intermediate: bool=False) -> str:
"""
Build a container for the given platform
:param platform: Platform
@@ -157,8 +186,7 @@ def default_ccache_dir() -> str:
    return os.path.join(os.path.expanduser("~"), ".ccache")


def container_run(docker_client: SafeDockerClient,
platform: str,
def container_run(platform: str,
nvidia_runtime: bool,
docker_registry: str,
shared_memory_size: str,
@@ -167,17 +195,12 @@ def container_run(docker_client: SafeDockerClient,
environment: Dict[str, str],
dry_run: bool = False) -> int:
"""Run command in a container"""
container_wait_s = 600
#
# Environment setup
#
# set default environment variables
environment.update({
'CCACHE_MAXSIZE': '500G',
'CCACHE_TEMPDIR': '/tmp/ccache', # temp dir should be local and not shared
'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is
# mounted
'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache
# verification.
'CCACHE_DIR': '/work/ccache', # this path is inside the container as /work/ccache is mounted
'CCACHE_LOGFILE': '/tmp/ccache.log', # a container-scoped log, useful for ccache verification.
})
environment.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ})

@@ -189,13 +212,9 @@ def container_run(docker_client: SafeDockerClient,
os.makedirs(local_ccache_dir, exist_ok=True)
logging.info("Using ccache directory: %s", local_ccache_dir)

# Equivalent command
docker_cmd_list = [
"docker",
'run',
"--gpus all" if nvidia_runtime else "",
"--cap-add",
"SYS_PTRACE", # Required by ASAN
# Build docker command
docker_arg_list = [
"--cap-add", "SYS_PTRACE", # Required by ASAN
'--rm',
'--shm-size={}'.format(shared_memory_size),
# mount mxnet root
@@ -211,40 +230,27 @@ def container_run(docker_client: SafeDockerClient,
'-e', "CCACHE_DIR={}".format(environment['CCACHE_DIR']),
# a container-scoped log, useful for ccache verification.
'-e', "CCACHE_LOGFILE={}".format(environment['CCACHE_LOGFILE']),
'-ti',
tag]
docker_cmd_list.extend(command)
docker_cmd = ' \\\n\t'.join(docker_cmd_list)
logging.info("Running %s in container %s", command, tag)
logging.info("Executing the equivalent of:\n%s\n", docker_cmd)
]
docker_arg_list += [tag]
docker_arg_list.extend(command)

def docker_run_cmd(cmd):
logging.info("Running %s in container %s", command, tag)
logging.info("Executing command:\n%s\n", ' \\\n\t'.join(cmd))
subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, check=True)

if not dry_run:
#############################
#
signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGINT, signal.SIGTERM})
# noinspection PyShadowingNames
runtime = None
if nvidia_runtime:
# noinspection PyShadowingNames
# runc is default (docker info | grep -i runtime)
runtime = 'nvidia'

return docker_client.run(
tag,
runtime=runtime,
command=command,
shm_size=shared_memory_size,
user='{}:{}'.format(os.getuid(), os.getgid()),
cap_add='SYS_PTRACE',
volumes={
mx_root:
{'bind': '/work/mxnet', 'mode': 'rw'},
local_build_folder:
{'bind': '/work/build', 'mode': 'rw'},
local_ccache_dir:
{'bind': '/work/ccache', 'mode': 'rw'},
},
environment=environment)
if not nvidia_runtime:
docker_run_cmd(['docker', 'run'] + docker_arg_list)
else:
try:
docker_run_cmd(['docker', 'run', '--gpus', 'all'] + docker_arg_list)
except subprocess.CalledProcessError as e:
if e.returncode == 125:
docker_run_cmd(['docker', 'run', '--runtime', 'nvidia'] + docker_arg_list)
else:
raise

return 0
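As an aside on the exit code checked above: `docker run` exits with 125 when the Docker CLI itself fails (for example, when an older daemon does not understand `--gpus`), which is why it is a reasonable trigger for falling back to the legacy nvidia runtime. A minimal standalone sketch of the same pattern, with a made-up helper name and only standard-library calls (illustrative, not the PR's code):

```python
import subprocess

def run_with_gpu_fallback(docker_args):
    """Try the modern --gpus flag first; fall back to --runtime nvidia on exit 125."""
    try:
        subprocess.run(["docker", "run", "--gpus", "all"] + docker_args, check=True)
    except subprocess.CalledProcessError as e:
        if e.returncode != 125:  # 125: the docker CLI itself failed (e.g. unknown flag)
            raise
        subprocess.run(["docker", "run", "--runtime", "nvidia"] + docker_args, check=True)
```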


@@ -348,7 +354,6 @@ def main() -> int:
args = parser.parse_args()

command = list(chain(*args.command))
docker_client = SafeDockerClient()

environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
for e in args.environment])
@@ -375,21 +380,21 @@
ret = 0
if command:
ret = container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, environment=environment)
elif args.print_docker_run:
command = []
ret = container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, dry_run=True, environment=environment)
else:
# With no commands, execute a build function for the target platform
command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
logging.info("No command specified, trying default build: %s", ' '.join(command))
ret = container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, environment=environment)

@@ -406,7 +411,8 @@ def main() -> int:
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, registry=args.docker_registry,
num_retries=args.docker_build_retries, no_cache=args.no_cache)
num_retries=args.docker_build_retries, no_cache=args.no_cache,
cache_intermediate=args.cache_intermediate)
if args.build_only:
continue
shutil.rmtree(buildir(), ignore_errors=True)
@@ -418,7 +424,7 @@
continue
command = ["/work/mxnet/ci/docker/runtime_functions.sh", build_platform]
container_run(
docker_client=docker_client, platform=platform, nvidia_runtime=args.nvidiadocker,
platform=platform, nvidia_runtime=args.nvidiadocker,
shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
local_ccache_dir=args.ccache_dir, environment=environment)
shutil.move(buildir(), plat_buildir)
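Taken together, the ci/build.py changes above switch the image tag from a plain `registry/platform` name to a content-addressed `registry:platform-<hash>` tag derived from the Dockerfile and every file it COPYs, presumably so a cached image is reused only when its inputs are unchanged. A minimal sketch of the resulting tag shape, with made-up registry and platform names (illustrative, not the PR's code):

```python
import hashlib

def content_tag(registry: str, platform: str, dockerfile_bytes: bytes, copied_blobs: list) -> str:
    # Hash the Dockerfile followed by the contents of every COPY'd file,
    # then keep the first 12 hex characters, mirroring get_docker_tag().
    digest = hashlib.sha256()
    digest.update(dockerfile_bytes)
    for blob in copied_blobs:
        digest.update(blob)
    return "{0}:{1}-{2}".format(registry, platform, digest.hexdigest()[:12])

# Prints something like "example-registry/mxnet-ci:build.ubuntu_cpu-<12 hex chars>"
print(content_tag("example-registry/mxnet-ci", "build.ubuntu_cpu",
                  b"FROM ubuntu:16.04\n", [b"#!/usr/bin/env bash\n"]))
```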
7 changes: 6 additions & 1 deletion ci/docker/install/ubuntu_python.sh
@@ -23,7 +23,12 @@
set -ex
# install libraries for mxnet's python package on ubuntu
apt-get update || true
apt-get install -y python-dev python3-dev virtualenv wget
apt-get install -y software-properties-common
add-apt-repository -y ppa:deadsnakes/ppa
apt-get update || true
apt-get install -y python3.6-dev virtualenv wget
# setup symlink in /usr/local/bin to override python3 version
ln -sf /usr/bin/python3.6 /usr/local/bin/python3

# the version of the pip shipped with ubuntu may be too lower, install a recent version here
wget -nv https://bootstrap.pypa.io/get-pip.py
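The symlink added here is what lets the CMake invocations later in this PR pass plain `-DPython3_EXECUTABLE=python3` instead of the hard-coded `/usr/bin/python3`: `/usr/local/bin` normally precedes `/usr/bin` on PATH, so `python3` now resolves to the deadsnakes interpreter. A tiny sanity check one could run inside the image (illustrative only, not part of the PR):

```python
import shutil
import subprocess

# Where does `python3` resolve on PATH?  Expected: /usr/local/bin/python3
print(shutil.which("python3"))

# Which interpreter does that symlink point at?  Expected: Python 3.6.x
out = subprocess.run(["python3", "--version"],
                     stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
print(out.stdout.decode().strip())
```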
18 changes: 9 additions & 9 deletions ci/docker/runtime_functions.sh
@@ -517,7 +517,7 @@ build_ubuntu_cpu_cmake_debug() {
cmake \
-DUSE_CUDA=OFF \
-DUSE_TVM_OP=ON \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DPython3_EXECUTABLE=python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_OPENMP=OFF \
-DUSE_OPENCV=ON \
@@ -538,7 +538,7 @@ build_ubuntu_cpu_cmake_no_tvm_op() {
cmake \
-DUSE_CUDA=OFF \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DPython3_EXECUTABLE=python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_OPENMP=OFF \
-DUSE_OPENCV=ON \
@@ -874,7 +874,7 @@ build_ubuntu_gpu_cmake_mkldnn() {
-DUSE_CUDA=1 \
-DUSE_CUDNN=1 \
-DUSE_TVM_OP=0 \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DPython3_EXECUTABLE=python3 \
-DUSE_MKLML_MKL=1 \
-DCMAKE_BUILD_TYPE=Release \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
@@ -892,8 +892,8 @@ build_ubuntu_gpu_cmake() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
-DUSE_MKLDNN=OFF \
@@ -915,8 +915,8 @@ build_ubuntu_gpu_cmake_no_rtc() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
-DUSE_MKLDNN=ON \
@@ -956,8 +956,8 @@ build_ubuntu_gpu_large_tensor() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
-DUSE_MKLDNN=OFF \