Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

CI: Consolidate Dockerfiles #18115

Merged
merged 1 commit into from
Apr 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cd/mxnet_lib/mxnet_lib_pipeline.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ def get_stash(mxnet_variant) {
// The environment corresponds to the docker files in the 'docker' directory
def get_environment(mxnet_variant) {
if (mxnet_variant.startsWith("cu")) {
return "publish.centos7_gpu_${mxnet_variant}"
return "centos7_gpu_${mxnet_variant}"
leezu marked this conversation as resolved.
Show resolved Hide resolved
}
return "publish.centos7_cpu"
return "centos7_cpu"
}

// Returns the variant appropriate jenkins node test in which
Expand Down
4 changes: 2 additions & 2 deletions cd/python/docker/Jenkins_pipeline.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def get_pipeline(mxnet_variant) {
// The environment corresponds to the docker files in the 'docker' directory
def get_environment(mxnet_variant) {
if (mxnet_variant.startsWith("cu")) {
return "publish.centos7_gpu_${mxnet_variant}"
return "centos7_gpu_${mxnet_variant}"
}
return "publish.centos7_cpu"
return "centos7_cpu"
}


Expand Down
6 changes: 3 additions & 3 deletions cd/python/pypi/Jenkins_pipeline.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ def get_pipeline(mxnet_variant) {

def get_environment(mxnet_variant) {
if (mxnet_variant.startsWith('cu')) {
return "publish.centos7_gpu_${mxnet_variant}"
return "centos7_gpu_${mxnet_variant}"
}
return "publish.centos7_cpu"
return "centos7_cpu"
}

def build(mxnet_variant) {
Expand All @@ -58,7 +58,7 @@ def test(mxnet_variant) {
// test wheel file
def environment = get_environment(mxnet_variant)
def nvidia_docker = mxnet_variant.startsWith('cu')
ci_utils.docker_run(environment, "cd_integration_test_pypi python3 ${nvidia_docker}", nvidia_docker)
ci_utils.docker_run(environment, "cd_integration_test_pypi ${nvidia_docker}", nvidia_docker)
}
}

Expand Down
6 changes: 4 additions & 2 deletions ci/Jenkinsfile_utils.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,11 @@ def collect_test_results_windows(original_file_name, new_file_name) {
}


def docker_run(platform, function_name, use_nvidia, shared_mem = '500m', env_vars = "") {
def command = "ci/build.py %ENV_VARS% --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
def docker_run(platform, function_name, use_nvidia = false, shared_mem = '500m', env_vars = "",
build_args = "") {
def command = "ci/build.py %ENV_VARS% %BUILD_ARGS% --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
command = command.replaceAll('%ENV_VARS%', env_vars.length() > 0 ? "-e ${env_vars}" : '')
command = command.replaceAll('%BUILD_ARGS%', env_vars.length() > 0 ? "${build_args}" : '')
command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '')
command = command.replaceAll('%PLATFORM%', platform)
command = command.replaceAll('%FUNCTION_NAME%', function_name)
Expand Down
109 changes: 64 additions & 45 deletions ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,23 @@
import glob
import pprint
import re
import os
import shutil
import signal
import subprocess
from itertools import chain
from subprocess import check_call, check_output
from typing import *

import yaml

from safe_docker_run import SafeDockerClient
from util import *

# NOTE: Temporary whitelist used until all Dockerfiles are refactored for docker compose
DOCKER_COMPOSE_WHITELIST = ('centos7_cpu', 'centos7_gpu_cu92', 'centos7_gpu_cu100',
'centos7_gpu_cu101', 'centos7_gpu_cu102')


def get_dockerfiles_path():
return "docker"
Expand All @@ -55,6 +62,11 @@ def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:

def get_docker_tag(platform: str, registry: str) -> str:
""":return: docker tag to be used for the container"""
if platform in DOCKER_COMPOSE_WHITELIST:
with open("docker/docker-compose.yml", "r") as f:
compose_config = yaml.load(f.read(), yaml.SafeLoader)
return compose_config["services"][platform]["image"]

platform = platform if any(x in platform for x in ['build.', 'publish.']) else 'build.{}'.format(platform)
if not registry:
registry = "mxnet_local"
Expand All @@ -66,72 +78,80 @@ def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
return os.path.join(path, "Dockerfile.{0}".format(platform))


def get_docker_binary(use_nvidia_docker: bool) -> str:
return "nvidia-docker" if use_nvidia_docker else "docker"


def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool,
def build_docker(platform: str, registry: str, num_retries: int, no_cache: bool,
cache_intermediate: bool = False) -> str:
"""
Build a container for the given platform
:param platform: Platform
:param docker_binary: docker binary to use (docker/nvidia-docker)
:param registry: Dockerhub registry name
:param num_retries: Number of retries to build the docker image
:param no_cache: pass no-cache to docker to rebuild the images
:return: Id of the top level image
"""
tag = get_docker_tag(platform=platform, registry=registry)
logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
#
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
#
# These variables are used in the docker files to create user and group with these ids.
# see: docker/install/ubuntu_adduser.sh
#
# cache-from is needed so we use the cached images tagged from the remote via
# docker pull see: docker_cache.load_docker_cache
#
# This also prevents using local layers for caching: https://github.com/moby/moby/issues/33002
# So to use local caching, we should omit the cache-from by using --no-dockerhub-cache argument to this
# script.
#
# This doesn't work with multi head docker files.
#
cmd = [docker_binary, "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
if no_cache:
cmd.append("--no-cache")
if cache_intermediate:
cmd.append("--rm=false")
elif registry:
cmd.extend(["--cache-from", tag])
cmd.extend(["-t", tag, get_dockerfiles_path()])

# Case 1: docker-compose
if platform in DOCKER_COMPOSE_WHITELIST:
logging.info('Building docker container tagged \'%s\' based on ci/docker/docker-compose.yml', tag)
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
cmd = ['docker-compose', '-f', 'docker/docker-compose.yml', 'build',
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
if cache_intermediate:
cmd.append('--no-rm')
cmd.append(platform)
else: # Case 2: Deprecated way, will be removed
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
#
# These variables are used in the docker files to create user and group with these ids.
# see: docker/install/ubuntu_adduser.sh
#
# cache-from is needed so we use the cached images tagged from the remote via
# docker pull see: docker_cache.load_docker_cache
#
# This also prevents using local layers for caching: https://github.com/moby/moby/issues/33002
# So to use local caching, we should omit the cache-from by using --no-dockerhub-cache argument to this
# script.
#
# This doesn't work with multi head docker files.
logging.info("Building docker container tagged '%s'", tag)
cmd = ["docker", "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
if no_cache:
cmd.append("--no-cache")
if cache_intermediate:
cmd.append("--rm=false")
elif registry:
cmd.extend(["--cache-from", tag])
cmd.extend(["-t", tag, get_dockerfiles_path()])


@retry(subprocess.CalledProcessError, tries=num_retries)
def run_cmd():
logging.info("Running command: '%s'", ' '.join(cmd))
check_call(cmd)

run_cmd()

# Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
# check_call would have failed
image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
image_id = _get_local_image_id(docker_tag=tag)
if not image_id:
raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag))
return image_id


def _get_local_image_id(docker_binary, docker_tag):
def _get_local_image_id(docker_tag):
"""
Get the image id of the local docker layer with the passed tag
:param docker_tag: docker tag
:return: Image id as string or None if tag does not exist
"""
cmd = [docker_binary, "images", "-q", docker_tag]
cmd = ["docker", "images", "-q", docker_tag]
image_id_b = check_output(cmd)
image_id = image_id_b.decode('utf-8').strip()
if not image_id:
Expand Down Expand Up @@ -196,7 +216,7 @@ def container_run(docker_client: SafeDockerClient,

# Equivalent command
docker_cmd_list = [
get_docker_binary(nvidia_runtime),
"nvidia-docker" if nvidia_runtime else "docker",
'run',
"--cap-add",
"SYS_PTRACE", # Required by ASAN
Expand Down Expand Up @@ -352,7 +372,6 @@ def main() -> int:
args = parser.parse_args()

command = list(chain(*args.command))
docker_binary = get_docker_binary(args.nvidiadocker)
docker_client = SafeDockerClient()

environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
Expand All @@ -363,12 +382,12 @@ def main() -> int:
elif args.platform:
platform = args.platform
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
if args.docker_registry:
if args.docker_registry and platform not in DOCKER_COMPOSE_WHITELIST:
# Caching logic for Dockerfiles not yet refactored with compose
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
if not args.run_only:
build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry,
num_retries=args.docker_build_retries, no_cache=args.no_cache,
cache_intermediate=args.cache_intermediate)
build_docker(platform=platform, registry=args.docker_registry, num_retries=args.docker_build_retries,
no_cache=args.no_cache, cache_intermediate=args.cache_intermediate)
else:
logging.info("Skipping docker build step.")

Expand Down Expand Up @@ -410,8 +429,8 @@ def main() -> int:
for platform in platforms:
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, docker_binary=docker_binary, registry=args.docker_registry,
num_retries=args.docker_build_retries, no_cache=args.no_cache)
build_docker(platform, registry=args.docker_registry, num_retries=args.docker_build_retries,
no_cache=args.no_cache)
if args.build_only:
continue
shutil.rmtree(buildir(), ignore_errors=True)
Expand Down
140 changes: 140 additions & 0 deletions ci/docker/Dockerfile.build.centos7
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# -*- mode: dockerfile -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Dockerfile declaring CentOS 7 related images.
# Via the CentOS 7 Dockerfiles, we ensure MXNet continues to run fine on older systems.
#
# See docker-compose.yml for supported BASE_IMAGE ARGs and targets.

####################################################################################################
# The Dockerfile uses a dynamic BASE_IMAGE (for example centos:7, nvidia/cuda:10.2-devel-centos7 etc)
# On top of BASE_IMAGE we install all dependencies shared by all MXNet build environments into a
# "base" target. At the end of this file, we specialize "base" for specific use cases.
# The target built by docker can be selected via "--target" option or docker-compose.yml
####################################################################################################
ARG BASE_IMAGE
FROM $BASE_IMAGE AS base

WORKDIR /work/deps

RUN yum -y check-update || true && \
yum -y install epel-release centos-release-scl && \
yum install -y \
# Utilities
wget \
unzip \
patchelf \
pandoc \
# Development tools
git \
make \
ninja-build \
automake \
autoconf \
libtool \
protobuf-compiler \
protobuf-devel \
# CentOS Software Collections https://www.softwarecollections.org
devtoolset-7 \
rh-python35 \
rh-maven35 \
# Libraries
# Provide cblas header files
atlas-devel \
openblas-devel \
lapack-devel \
opencv-devel \
openssl-devel \
zeromq-devel \
# Build-dependencies for ccache 3.7.9
gperf \
libb2-devel \
libzstd-devel && \
yum clean all && \
# Centos 7 only provides ninja-build
ln -s /usr/bin/ninja-build /usr/bin/ninja

# Make GCC7, Python 3.5 and Maven 3.5 Software Collections available by default
# during build and runtime of this container
SHELL [ "/usr/bin/scl", "enable", "devtoolset-7", "rh-python35", "rh-maven35" ]

# Install minimum required cmake version
RUN cd /usr/local/src && \
wget -nv https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.sh && \
sh cmake-3.13.5-Linux-x86_64.sh --prefix=/usr/local --skip-license && \
rm cmake-3.13.5-Linux-x86_64.sh

# ccache 3.7.9 has fixes for caching nvcc outputs
RUN cd /usr/local/src && \
git clone --recursive https://github.com/ccache/ccache.git && \
cd ccache && \
git checkout v3.7.9 && \
./autogen.sh && \
./configure --disable-man && \
make -j$(nproc) && \
make install && \
cd /usr/local/src && \
rm -rf ccache

# Python dependencies
RUN pip3 install --no-cache-dir --upgrade pip && \
pip3 install --no-cache-dir nose pylint cython numpy nose-timer requests h5py scipy==1.2.3 wheel


ARG USER_ID=0
# Add user in order to make sure the assumed user the container is running under
# actually exists inside the container to avoid problems like missing home dir
RUN if [[ "$USER_ID" -gt 0 ]]; then \
# --no-log-init required due to https://github.com/moby/moby/issues/5419
useradd -m --no-log-init --uid $USER_ID --system jenkins_slave; \
usermod -aG wheel jenkins_slave; \
# By default, docker creates all WORKDIRs with root owner
mkdir /work/mxnet; \
mkdir /work/build; \
chown -R jenkins_slave /work/; \
fi

ENV PYTHONPATH=./python/
WORKDIR /work/mxnet

COPY runtime_functions.sh /work/

####################################################################################################
# Specialize base image to install more gpu specific dependencies.
# The target built by docker can be selected via "--target" option or docker-compose.yml
####################################################################################################
FROM base as gpu
# Different Cuda versions require different NCCL versions
# https://wiki.bash-hackers.org/syntax/pe#search_and_replace
RUN export SHORT_CUDA_VERSION=${CUDA_VERSION%.*} && \
if [[ ${SHORT_CUDA_VERSION} == 9.2 ]]; then \
export NCCL_VERSION=2.4.8; \
elif [[ ${SHORT_CUDA_VERSION} == 10.* ]]; then \
export NCCL_VERSION=2.6.4; \
else \
echo "ERROR: Cuda ${SHORT_CUDA_VERSION} not yet supported in Dockerfile.build.centos7"; \
exit 1; \
fi && \
curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm -O && \
rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
yum -y check-update || true && \
yum -y install \
libnccl-${NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION} \
libnccl-devel-${NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION} \
libnccl-static-${NCCL_VERSION}-1+cuda${SHORT_CUDA_VERSION} && \
yum clean all
Loading