From 2d89d109e19d1e84c4ada3c9d5d48cfcf3d997ea Mon Sep 17 00:00:00 2001
From: Marcelo Vanzin
Date: Tue, 27 Nov 2018 09:09:16 -0800
Subject: [PATCH] [SPARK-26025][K8S] Speed up docker image build on dev repo.

The "build context" for a docker image - essentially the entire contents
of the directory where "docker" is invoked - can be huge in a dev build,
easily exceeding a couple of gigabytes. Copying that much data three
times, once per image, severely slows down the build.

This patch creates a smaller build context - basically mimicking what the
make-distribution.sh script does - so that when building the docker
images, only the necessary bits are in the current directory. For PySpark
and R this is optimized further, since those images are built on top of
the previously built main Spark image.

In my current local clone, the directory size is about 2G, but with this
change the "context" sent to docker is about 250M for the main image, 1M
for the pyspark image, and 8M for the R image. That speeds up the image
builds considerably.

I also snuck in a fix to the k8s integration test dependencies in the sbt
build, so that the examples are properly built (without having to do it
manually).

Closes #23019 from vanzin/SPARK-26025.

Authored-by: Marcelo Vanzin
Signed-off-by: Marcelo Vanzin
---
 bin/docker-image-tool.sh                      | 122 ++++++++++++------
 project/SparkBuild.scala                      |   3 +-
 .../src/main/dockerfiles/spark/Dockerfile     |  14 +-
 3 files changed, 91 insertions(+), 48 deletions(-)
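As a quick sanity check on the sizes quoted above, du(1) against a dev
checkout shows roughly how much data "docker build" streams to the
daemon. The target/tmp/docker paths below are the per-image context
directories this patch creates, so they only exist after the updated
script has run:

    # Old context: docker uploads the entire dev checkout.
    cd "$SPARK_HOME" && du -sh .
    # New contexts staged by the script (main, PySpark, and R images).
    du -sh target/tmp/docker/base
    du -sh target/tmp/docker/pyspark
    du -sh target/tmp/docker/sparkr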
-f "$BASE_CTX/jars/$(basename $i)" ]; then + cp $i "$BASE_CTX/examples/jars" + fi + done + + for other in bin sbin data; do + cp -r "$other" "$BASE_CTX/$other" + done + + local PYSPARK_CTX="$CTX_DIR/pyspark" + mkdir -p "$PYSPARK_CTX/kubernetes" + cp -r "resource-managers/kubernetes/docker/src/main/dockerfiles" \ + "$PYSPARK_CTX/kubernetes/dockerfiles" + mkdir "$PYSPARK_CTX/python" + cp -r "python/lib" "$PYSPARK_CTX/python/lib" + + local R_CTX="$CTX_DIR/sparkr" + mkdir -p "$R_CTX/kubernetes" + cp -r "resource-managers/kubernetes/docker/src/main/dockerfiles" \ + "$R_CTX/kubernetes/dockerfiles" + cp -r "R" "$R_CTX/R" +)} + +function img_ctx_dir { + if is_dev_build; then + echo "$CTX_DIR/$1" + else + echo "$SPARK_HOME" + fi +} + function build { local BUILD_ARGS - local IMG_PATH - local JARS - - if [ ! -f "$SPARK_HOME/RELEASE" ]; then - # Set image build arguments accordingly if this is a source repo and not a distribution archive. - # - # Note that this will copy all of the example jars directory into the image, and that will - # contain a lot of duplicated jars with the main Spark directory. In a proper distribution, - # the examples directory is cleaned up before generating the distribution tarball, so this - # issue does not occur. - IMG_PATH=resource-managers/kubernetes/docker/src/main/dockerfiles - JARS=assembly/target/scala-$SPARK_SCALA_VERSION/jars - BUILD_ARGS=( - ${BUILD_PARAMS} - --build-arg - img_path=$IMG_PATH - --build-arg - spark_jars=$JARS - --build-arg - example_jars=examples/target/scala-$SPARK_SCALA_VERSION/jars - --build-arg - k8s_tests=resource-managers/kubernetes/integration-tests/tests - ) - else - # Not passed as arguments to docker, but used to validate the Spark directory. - IMG_PATH="kubernetes/dockerfiles" - JARS=jars - BUILD_ARGS=(${BUILD_PARAMS}) + local SPARK_ROOT="$SPARK_HOME" + + if is_dev_build; then + create_dev_build_context || error "Failed to create docker build context." + SPARK_ROOT="$CTX_DIR/base" fi # Verify that the Docker image content directory is present - if [ ! -d "$IMG_PATH" ]; then + if [ ! -d "$SPARK_ROOT/kubernetes/dockerfiles" ]; then error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark." fi # Verify that Spark has actually been built/is a runnable distribution # i.e. the Spark JARs that the Docker files will place into the image are present - local TOTAL_JARS=$(ls $JARS/spark-* | wc -l) + local TOTAL_JARS=$(ls $SPARK_ROOT/jars/spark-* | wc -l) TOTAL_JARS=$(( $TOTAL_JARS )) if [ "${TOTAL_JARS}" -eq 0 ]; then error "Cannot find Spark JARs. This script assumes that Apache Spark has first been built locally or this is a runnable distribution." fi + local BUILD_ARGS=(${BUILD_PARAMS}) local BINDING_BUILD_ARGS=( ${BUILD_PARAMS} --build-arg base_img=$(image_ref spark) ) - local BASEDOCKERFILE=${BASEDOCKERFILE:-"$IMG_PATH/spark/Dockerfile"} + local BASEDOCKERFILE=${BASEDOCKERFILE:-"kubernetes/dockerfiles/spark/Dockerfile"} local PYDOCKERFILE=${PYDOCKERFILE:-false} local RDOCKERFILE=${RDOCKERFILE:-false} - docker build $NOCACHEARG "${BUILD_ARGS[@]}" \ + (cd $(img_ctx_dir base) && docker build $NOCACHEARG "${BUILD_ARGS[@]}" \ -t $(image_ref spark) \ - -f "$BASEDOCKERFILE" . + -f "$BASEDOCKERFILE" .) if [ $? -ne 0 ]; then error "Failed to build Spark JVM Docker image, please refer to Docker build output for details." 
fi if [ "${PYDOCKERFILE}" != "false" ]; then - docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \ + (cd $(img_ctx_dir pyspark) && docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \ -t $(image_ref spark-py) \ - -f "$PYDOCKERFILE" . + -f "$PYDOCKERFILE" .) if [ $? -ne 0 ]; then error "Failed to build PySpark Docker image, please refer to Docker build output for details." fi fi if [ "${RDOCKERFILE}" != "false" ]; then - docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \ + (cd $(img_ctx_dir sparkr) && docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \ -t $(image_ref spark-r) \ - -f "$RDOCKERFILE" . + -f "$RDOCKERFILE" .) if [ $? -ne 0 ]; then error "Failed to build SparkR Docker image, please refer to Docker build output for details." fi diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 08e22fab65165..bb834bc483f1f 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -516,7 +516,8 @@ object KubernetesIntegrationTests { s"-Dspark.kubernetes.test.unpackSparkDir=$sparkHome" ), // Force packaging before building images, so that the latest code is tested. - dockerBuild := dockerBuild.dependsOn(packageBin in Compile in assembly).value + dockerBuild := dockerBuild.dependsOn(packageBin in Compile in assembly) + .dependsOn(packageBin in Compile in examples).value ) } diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile index 5f469c30a96fa..89b20e1446229 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile @@ -17,11 +17,6 @@ FROM openjdk:8-alpine -ARG spark_jars=jars -ARG example_jars=examples/jars -ARG img_path=kubernetes/dockerfiles -ARG k8s_tests=kubernetes/tests - # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. # If this docker file is being used in the context of building your images from a Spark @@ -41,13 +36,12 @@ RUN set -ex && \ echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ chgrp root /etc/passwd && chmod ug+rw /etc/passwd -COPY ${spark_jars} /opt/spark/jars +COPY jars /opt/spark/jars COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin -COPY ${img_path}/spark/entrypoint.sh /opt/ -COPY ${example_jars} /opt/spark/examples/jars -COPY examples/src /opt/spark/examples/src -COPY ${k8s_tests} /opt/spark/tests +COPY kubernetes/dockerfiles/spark/entrypoint.sh /opt/ +COPY examples /opt/spark/examples +COPY kubernetes/tests /opt/spark/tests COPY data /opt/spark/data ENV SPARK_HOME /opt/spark