From d1907a813360472b1d8e3d827bf991d43a699a98 Mon Sep 17 00:00:00 2001
From: Miguel Covarrubias
Date: Tue, 8 Nov 2022 14:31:22 -0500
Subject: [PATCH] "build-base" Docker image for faster variantstore image
 builds [VS-712] (#8085)

---
 .dockstore.yml                                |  1 +
 .../GvsCalculatePrecisionAndSensitivity.wdl   |  2 +-
 scripts/variantstore/wdl/GvsCallsetCost.wdl   |  2 +-
 scripts/variantstore/wdl/GvsCreateVAT.wdl     |  2 +-
 .../wdl/GvsCreateVATAnnotations.wdl           |  4 +-
 .../wdl/GvsExtractAvroFilesForHail.wdl        |  2 +-
 .../variantstore/wdl/GvsPopulateAltAllele.wdl |  2 +-
 .../wdl/GvsPrepareRangesCallset.wdl           |  2 +-
 scripts/variantstore/wdl/GvsUtils.wdl         |  2 +-
 scripts/variantstore/wdl/extract/Dockerfile   | 92 ++-----------------
 .../wdl/extract/build_base.Dockerfile         | 77 ++++++++++++++++
 .../wdl/extract/build_build_base_docker.sh    | 26 ++++++
 .../variantstore/wdl/extract/build_docker.sh  |  2 +-
 13 files changed, 120 insertions(+), 96 deletions(-)
 create mode 100644 scripts/variantstore/wdl/extract/build_base.Dockerfile
 create mode 100755 scripts/variantstore/wdl/extract/build_build_base_docker.sh

diff --git a/.dockstore.yml b/.dockstore.yml
index e1e0bd46069..87060478101 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -208,6 +208,7 @@ workflows:
     branches:
       - master
       - ah_var_store
+      - vs_707_azure_setup
   - name: GvsIngestTieout
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
index a1cecb223d1..7b43ec399f8 100644
--- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
+++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
@@ -264,7 +264,7 @@ task Add_AS_MAX_VQSLOD_ToVcf {
     File input_vcf
     String output_basename
 
-    String docker = "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+    String docker = "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
     Int cpu = 1
     Int memory_mb = 3500
     Int disk_size_gb = ceil(2*size(input_vcf, "GiB")) + 50
diff --git a/scripts/variantstore/wdl/GvsCallsetCost.wdl b/scripts/variantstore/wdl/GvsCallsetCost.wdl
index 619a723016c..ae866d5fdf4 100644
--- a/scripts/variantstore/wdl/GvsCallsetCost.wdl
+++ b/scripts/variantstore/wdl/GvsCallsetCost.wdl
@@ -62,7 +62,7 @@ task WorkflowComputeCosts {
   >>>
 
   runtime {
-    docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+    docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
   }
 
   output {
diff --git a/scripts/variantstore/wdl/GvsCreateVAT.wdl b/scripts/variantstore/wdl/GvsCreateVAT.wdl
index a8482ba465e..22a4f265930 100644
--- a/scripts/variantstore/wdl/GvsCreateVAT.wdl
+++ b/scripts/variantstore/wdl/GvsCreateVAT.wdl
@@ -124,7 +124,7 @@ task MakeSubpopulationFilesAndReadSchemaFiles {
     # ------------------------------------------------
     # Runtime settings:
     runtime {
-        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
         memory: "1 GB"
         preemptible: 3
         cpu: "1"
diff --git a/scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl b/scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
index c7338ee1b28..427970dcf7a 100644
--- a/scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
+++ b/scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
@@ -169,7 +169,7 @@ task ExtractAnAcAfFromVCF {
     # ------------------------------------------------
     # Runtime settings:
     runtime {
-        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
         maxRetries: 3
         memory: "16 GB"
         preemptible: 3
@@ -291,7 +291,7 @@ task PrepAnnotationJson {
     # ------------------------------------------------
     # Runtime settings:
     runtime {
-        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
         memory: "8 GB"
         preemptible: 5
         cpu: "1"
diff --git a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
index fa911b83744..7105dc07f9b 100644
--- a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
+++ b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
@@ -286,7 +286,7 @@ task GenerateHailScripts {
         File hail_create_vat_inputs_script = 'hail_create_vat_inputs.py'
     }
     runtime {
-        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
         disks: "local-disk 500 HDD"
     }
 }
diff --git a/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl b/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
index dc1590bc4ea..3929a892990 100644
--- a/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
+++ b/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
@@ -243,7 +243,7 @@ task PopulateAltAlleleTable {
     done
   >>>
   runtime {
-    docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+    docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
     memory: "3 GB"
     disks: "local-disk 10 HDD"
     cpu: 1
diff --git a/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl b/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
index 3b47efdd231..f990d272c1b 100644
--- a/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
+++ b/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
@@ -110,7 +110,7 @@ task PrepareRangesCallsetTask {
   }
 
   runtime {
-    docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+    docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
     memory: "3 GB"
     disks: "local-disk 100 HDD"
     bootDiskSizeGb: 15
diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl
index b1ddcf63666..e1b9317cbe4 100644
--- a/scripts/variantstore/wdl/GvsUtils.wdl
+++ b/scripts/variantstore/wdl/GvsUtils.wdl
@@ -347,7 +347,7 @@ task ScaleXYBedValues {
     }
 
     runtime {
-        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+        docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
         maxRetries: 3
         memory: "7 GB"
         preemptible: 3
"us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine" + docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine" maxRetries: 3 memory: "16 GB" preemptible: 3 @@ -291,7 +291,7 @@ task PrepAnnotationJson { # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine" + docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine" memory: "8 GB" preemptible: 5 cpu: "1" diff --git a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl index fa911b83744..7105dc07f9b 100644 --- a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl +++ b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl @@ -286,7 +286,7 @@ task GenerateHailScripts { File hail_create_vat_inputs_script = 'hail_create_vat_inputs.py' } runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine" + docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine" disks: "local-disk 500 HDD" } } diff --git a/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl b/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl index dc1590bc4ea..3929a892990 100644 --- a/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl +++ b/scripts/variantstore/wdl/GvsPopulateAltAllele.wdl @@ -243,7 +243,7 @@ task PopulateAltAlleleTable { done >>> runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine" + docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine" memory: "3 GB" disks: "local-disk 10 HDD" cpu: 1 diff --git a/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl b/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl index 3b47efdd231..f990d272c1b 100644 --- a/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl +++ b/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl @@ -110,7 +110,7 @@ task PrepareRangesCallsetTask { } runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine" + docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine" memory: "3 GB" disks: "local-disk 100 HDD" bootDiskSizeGb: 15 diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index b1ddcf63666..e1b9317cbe4 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -347,7 +347,7 @@ task ScaleXYBedValues { } runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine" + docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine" maxRetries: 3 memory: "7 GB" preemptible: 3 diff --git a/scripts/variantstore/wdl/extract/Dockerfile b/scripts/variantstore/wdl/extract/Dockerfile index b8fbd7f957d..7e18ec66ea6 100644 --- a/scripts/variantstore/wdl/extract/Dockerfile +++ b/scripts/variantstore/wdl/extract/Dockerfile @@ -4,98 +4,18 @@ # Google Cloud Storage. Sharing the base image with Cromwell's GCS localization should result in reuse of a cached copy # of this base (and by far largest) image layer when running GVS pipelines in Terra / Cromwell. # -# Because this is an Alpine-based image it is more bare-bones than its Debian-based peers. Key components missing here -# are the Apache Arrow library (a requirement for pyarrow which in turn is a requirement for the google-cloud-bigquery -# Python module) and bcftools. -FROM gcr.io/google.com/cloudsdktool/cloud-sdk:406.0.0-alpine as build - -RUN apk update && apk upgrade -RUN python3 -m ensurepip --upgrade - -# Add all required build tools. 
diff --git a/scripts/variantstore/wdl/extract/build_base.Dockerfile b/scripts/variantstore/wdl/extract/build_base.Dockerfile
new file mode 100644
index 00000000000..62ccc5864bf
--- /dev/null
+++ b/scripts/variantstore/wdl/extract/build_base.Dockerfile
@@ -0,0 +1,77 @@
+# This Dockerfile creates a "build-base" image with the tools and libraries required to build the tools and libraries
+# used in the Genomic Variant Store pipeline. The Alpine version of the Google Cloud SDK is used as the base image: it
+# is not only the most compact of the Google Cloud SDK Docker images, but is also the image currently used by Cromwell
+# for (de)localization of files in Google Cloud Storage. Sharing the base image with Cromwell's GCS localization should
+# result in reuse of a cached copy of this base (and by far largest) image layer when running GVS pipelines in Terra /
+# Cromwell.
+#
+# Because this is an Alpine-based image it is more bare-bones than its Debian-based peers. Key components missing here
+# are the Apache Arrow library (a requirement for pyarrow, which in turn is a requirement for the google-cloud-bigquery
+# Python module) and bcftools. Compiling all these tools makes this a fairly expensive image to create (an hour or so
+# under ideal circumstances, potentially much longer on low-memory and/or non-x86 build hosts). Since this image isn't
+# expected to change often, it's broken out into a separate "build-base" image that can effectively be globally cached
+# and referenced from the main Dockerfile.
+FROM gcr.io/google.com/cloudsdktool/cloud-sdk:408.0.1-alpine
+
+RUN apk update && apk upgrade
+RUN python3 -m ensurepip --upgrade
+
+# Add all required build tools. These will not be added to the main stage as they are only required to build PyArrow
+# and bcftools but not to use them.
+RUN apk add autoconf bash cmake g++ gcc make ninja python3-dev git openssl-dev zlib-dev xz-dev bzip2-dev curl-dev
+
+# Unfortunately neither pyarrow nor google-cloud-bigquery will fetch or build Apache Arrow when `pip install`ed from
+# this base image. Therefore we do the Apache Arrow build ourselves. In order to keep the final image size small this
+# Dockerfile is set up to do a multi-stage build following the usual pattern of "build" stage / "main" stage.
+# https://docs.docker.com/build/building/multi-stage/#use-multi-stage-builds
+#
+# The build stage installs the required development tools, downloads the Apache Arrow source bundle and builds all
+# required components including the Apache Arrow C++ libraries, the pyarrow Python module, and all pyarrow dependencies
+# including the numpy Python module. The main stage will then use the same base image and copy over the artifacts
+# produced by the build stage without having to install development tools or clean up after a build.
+
+ARG ARROW_VERSION=10.0.0
+RUN cd / && \
+    curl -O https://dlcdn.apache.org/arrow/arrow-$ARROW_VERSION/apache-arrow-$ARROW_VERSION.tar.gz && \
+    tar xfz apache-arrow-$ARROW_VERSION.tar.gz
+
+# Pyarrow build instructions from https://arrow.apache.org/docs/developers/python.html#python-development
+# Modified slightly for the requirements of this installation:
+# - Download a static source tarball rather than cloning the git repo.
+# - Use `ninja` to build the C++ libraries as the `make` system doesn't seem to work as of Arrow 10.0.0.
+# - Install PyArrow and its dependencies specifying the --user flag so all artifacts go to the /root/.local directory
+#   which can easily be copied to the main stage below.
+ARG ARROW_SRC_DIR=/apache-arrow-$ARROW_VERSION
+RUN pip3 install --user -r $ARROW_SRC_DIR/python/requirements-build.txt
+
+RUN mkdir /dist
+RUN mkdir $ARROW_SRC_DIR/cpp/build && \
+    cd $ARROW_SRC_DIR/cpp/build && \
+    cmake .. --preset ninja-release-python && \
+    cmake --build . && \
+    cmake --install .
+
+ARG PYARROW_WITH_PARQUET=1
+ARG PYARROW_WITH_DATASET=1
+ARG PYARROW_PARALLEL=4
+RUN cd $ARROW_SRC_DIR/python && \
+    python3 setup.py build_ext --inplace && \
+    pip3 install wheel && \
+    python3 setup.py build_ext --build-type=release \
+            --bundle-arrow-cpp bdist_wheel && \
+    pip3 install --user /apache-arrow-$ARROW_VERSION/python/dist/pyarrow-$ARROW_VERSION-*.whl
+
+# Straightforward bcftools build following these instructions:
+# https://github.com/samtools/bcftools/blob/develop/INSTALL
+ARG BCFTOOLS_VERSION=1.16
+RUN mkdir /bcftools bcftools-build && \
+    cd bcftools-build && \
+    git clone --recurse-submodules https://github.com/samtools/htslib.git && \
+    git clone https://github.com/samtools/bcftools.git && \
+    cd bcftools && \
+    git checkout tags/$BCFTOOLS_VERSION -b $BCFTOOLS_VERSION && \
+    autoheader && \
+    autoconf && \
+    ./configure --prefix /bcftools && \
+    make && \
+    make install
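Since this image takes an hour or more to build, a quick smoke test before pushing can save a wasted cycle. A minimal check along these lines, assuming the tag used elsewhere in this patch; the bcftools path follows from the `--prefix /bcftools` configure flag above:

    IMAGE="us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine-build-base"
    # Verify the pyarrow wheel installed into /root/.local imports cleanly.
    docker run --rm "$IMAGE" python3 -c 'import pyarrow; print(pyarrow.__version__)'
    # Verify the bcftools binary built under --prefix /bcftools runs.
    docker run --rm "$IMAGE" /bcftools/bin/bcftools --version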
diff --git a/scripts/variantstore/wdl/extract/build_build_base_docker.sh b/scripts/variantstore/wdl/extract/build_build_base_docker.sh
new file mode 100755
index 00000000000..e25accf801b
--- /dev/null
+++ b/scripts/variantstore/wdl/extract/build_build_base_docker.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+if [ $# -lt 1 ]; then
+    echo "USAGE: ./build_build_base_docker.sh [DOCKER_TAG_STRING] [OPTIONAL:LATEST]"
+    echo " e.g.: ./build_build_base_docker.sh 2022-11-04-alpine-build-base"
+    exit 1
+fi
+
+if [[ ! "$1" == *-build-base ]]; then
+    echo "Specified tag '$1' does not end with '-build-base'."
+    echo "build_build_base_docker.sh is intended for building build-base images only."
+    exit 1
+fi
+
+set -o errexit -o nounset -o pipefail -o xtrace
+
+BASE_REPO="broad-dsde-methods/variantstore"
+REPO_WITH_TAG="${BASE_REPO}:${1}"
+GCR_TAG="us.gcr.io/${REPO_WITH_TAG}"
+
+docker build --file build_base.Dockerfile . -t "${REPO_WITH_TAG}"
+
+docker tag "${REPO_WITH_TAG}" "${GCR_TAG}"
+docker push "${GCR_TAG}"
+
+echo "docker image pushed to \"${GCR_TAG}\""
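The tag passed to this script is the same string the main Dockerfile's FROM line must reference, so the two are updated together. An illustrative invocation using the tag from this patch:

    ./build_build_base_docker.sh 2022-11-08-alpine-build-base
    # ...then point the main Dockerfile's build stage at the new image:
    #   FROM us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine-build-base as build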
-t "${REPO_WITH_TAG}" + +docker tag "${REPO_WITH_TAG}" "${GCR_TAG}" +docker push "${GCR_TAG}" + +echo "docker image pushed to \"${GCR_TAG}\"" diff --git a/scripts/variantstore/wdl/extract/build_docker.sh b/scripts/variantstore/wdl/extract/build_docker.sh index 765df17c117..1e0983f17b8 100755 --- a/scripts/variantstore/wdl/extract/build_docker.sh +++ b/scripts/variantstore/wdl/extract/build_docker.sh @@ -1,6 +1,6 @@ if [ $# -lt 1 ]; then echo "USAGE: ./build_docker.sh [DOCKER_TAG_STRING] [OPTIONAL:LATEST]" - echo " e.g.: ./build_docker.sh mybranch_20210403" + echo " e.g.: ./build_docker.sh 2022-11-04-alpine" exit 1 fi