"build-base" Docker image for faster variantstore image builds [VS-71…
Browse files Browse the repository at this point in the history
…2] (#8085)
  • Loading branch information
mcovarr authored Nov 8, 2022
1 parent 7dd6ede commit d1907a8
Showing 13 changed files with 120 additions and 96 deletions.
1 change: 1 addition & 0 deletions .dockstore.yml
@@ -208,6 +208,7 @@ workflows:
branches:
- master
- ah_var_store
+ - vs_707_azure_setup
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
@@ -264,7 +264,7 @@ task Add_AS_MAX_VQSLOD_ToVcf {
File input_vcf
String output_basename

- String docker = "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
+ String docker = "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
Int cpu = 1
Int memory_mb = 3500
Int disk_size_gb = ceil(2*size(input_vcf, "GiB")) + 50
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCallsetCost.wdl
@@ -62,7 +62,7 @@ task WorkflowComputeCosts {
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
}

output {
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCreateVAT.wdl
@@ -124,7 +124,7 @@ task MakeSubpopulationFilesAndReadSchemaFiles {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
memory: "1 GB"
preemptible: 3
cpu: "1"
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
@@ -169,7 +169,7 @@ task ExtractAnAcAfFromVCF {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
maxRetries: 3
memory: "16 GB"
preemptible: 3
@@ -291,7 +291,7 @@ task PrepAnnotationJson {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
memory: "8 GB"
preemptible: 5
cpu: "1"
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
@@ -286,7 +286,7 @@ task GenerateHailScripts {
File hail_create_vat_inputs_script = 'hail_create_vat_inputs.py'
}
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
disks: "local-disk 500 HDD"
}
}
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
@@ -243,7 +243,7 @@ task PopulateAltAlleleTable {
done
>>>
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
memory: "3 GB"
disks: "local-disk 10 HDD"
cpu: 1
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
@@ -110,7 +110,7 @@ task PrepareRangesCallsetTask {
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
memory: "3 GB"
disks: "local-disk 100 HDD"
bootDiskSizeGb: 15
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
@@ -347,7 +347,7 @@ task ScaleXYBedValues {
}
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine"
maxRetries: 3
memory: "7 GB"
preemptible: 3
92 changes: 6 additions & 86 deletions scripts/variantstore/wdl/extract/Dockerfile
@@ -4,98 +4,18 @@
# Google Cloud Storage. Sharing the base image with Cromwell's GCS localization should result in reuse of a cached copy
# of this base (and by far largest) image layer when running GVS pipelines in Terra / Cromwell.
#
- # Because this is an Alpine-based image it is more bare-bones than its Debian-based peers. Key components missing here
- # are the Apache Arrow library (a requirement for pyarrow which in turn is a requirement for the google-cloud-bigquery
- # Python module) and bcftools.
- FROM gcr.io/google.com/cloudsdktool/cloud-sdk:406.0.0-alpine as build
-
- RUN apk update && apk upgrade
- RUN python3 -m ensurepip --upgrade
-
- # Add all required build tools. These will not be added to the main stage as they are only required to build PyArrow
- # and bcftools but not to use them.
- RUN apk add autoconf bash cmake g++ gcc make ninja python3-dev git openssl-dev zlib-dev xz-dev bzip2-dev curl-dev
-
- # Unfortunately neither pyarrow nor google-cloud-bigquery will fetch or build Apache Arrow when `pip install`ed from
- # this base image. Therefore we do the Apache Arrow build ourselves. In order to keep the final image size small this
- # Dockerfile is set up to do a multi-stage build following the usual pattern of "build" stage / "main" stage.
- # https://docs.docker.com/build/building/multi-stage/#use-multi-stage-builds
- #
- # The build stage installs the required development tools, downloads the Apache Arrow source bundle and builds all
- # required components including Apache Arrow C++ libraries, pyarrow Python module, and all pyarrow dependencies
- # including the numpy Python module. The main stage will then use the same base image and copy over the artifacts
- # produced by the build stage without having to install development tools or clean up after a build.
-
- # Build Apache Arrow version 8.0.0 as version 9.0.0 does not compile under Alpine:
- # https://issues.apache.org/jira/browse/ARROW-17329
- ARG ARROW_VERSION=8.0.0
- RUN cd / && \
-     curl -O https://dlcdn.apache.org/arrow/arrow-$ARROW_VERSION/apache-arrow-$ARROW_VERSION.tar.gz && \
-     tar xfz apache-arrow-$ARROW_VERSION.tar.gz
-
- # Pyarrow build instructions from https://arrow.apache.org/docs/developers/python.html#python-development
- # Modified slightly for the requirements of this installation:
- # - Download a static source tarball rather than cloning the git repo.
- # - Use build type Release rather than Debug.
- # - Do not build tests.
- # - Install PyArrow and its dependencies specifying the --user flag so all artifacts go to the /root/.local directory
- #   which can easily be copied to the main stage below.
- RUN pip3 install --user -r /apache-arrow-$ARROW_VERSION/python/requirements-build.txt
-
- RUN mkdir /dist
- ARG ARROW_HOME=/dist
- ARG LD_LIBRARY_PATH=/dist/lib:$LD_LIBRARY_PATH
- RUN mkdir /apache-arrow-$ARROW_VERSION/cpp/build && \
-     cd /apache-arrow-$ARROW_VERSION/cpp/build && \
-     cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
-         -DCMAKE_INSTALL_LIBDIR=lib \
-         -DCMAKE_BUILD_TYPE=Release \
-         -DARROW_DATASET=ON \
-         -DARROW_WITH_BZ2=ON \
-         -DARROW_WITH_ZLIB=ON \
-         -DARROW_WITH_ZSTD=ON \
-         -DARROW_WITH_LZ4=ON \
-         -DARROW_WITH_SNAPPY=ON \
-         -DARROW_WITH_BROTLI=ON \
-         -DARROW_PARQUET=ON \
-         -DPARQUET_REQUIRE_ENCRYPTION=ON \
-         -DARROW_PYTHON=ON \
-         -DARROW_BUILD_TESTS=OFF \
-         .. && \
-     make -j4 && \
-     make install
-
- ARG PYARROW_WITH_PARQUET=1
- ARG PYARROW_WITH_DATASET=1
- ARG PYARROW_PARALLEL=4
- RUN cd /apache-arrow-$ARROW_VERSION/python && \
-     python3 setup.py build_ext --inplace && \
-     pip3 install wheel && \
-     python3 setup.py build_ext --build-type=release \
-         --bundle-arrow-cpp bdist_wheel && \
-     pip3 install --user /apache-arrow-$ARROW_VERSION/python/dist/pyarrow-$ARROW_VERSION-*.whl
+ # This is a multi-stage build that uses a custom "build-base" image for the build stage. The build-base image is
+ # expensive to create and isn't expected to change often, while the steps in this Dockerfile are much less expensive
+ # and more likely to change. Using a build-base image essentially allows the expensive layers to be globally cached,
+ # which should make building the final image much faster in most cases.
+ FROM us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine-build-base as build

# Install all of our variantstore Python requirements.
COPY requirements.txt requirements.txt
RUN pip3 install --user -r requirements.txt
-
- # Straightforward bcftools build following these instructions:
- # https://github.com/samtools/bcftools/blob/develop/INSTALL
- ARG BCFTOOLS_VERSION=1.16
- RUN mkdir /bcftools bcftools-build && \
-     cd bcftools-build && \
-     git clone --recurse-submodules https://github.com/samtools/htslib.git && \
-     git clone https://github.com/samtools/bcftools.git && \
-     cd bcftools && \
-     git checkout tags/$BCFTOOLS_VERSION -b $BCFTOOLS_VERSION && \
-     autoheader && \
-     autoconf && \
-     ./configure --prefix /bcftools && \
-     make && \
-     make install

# The main layer does not install development tools, instead copies artifacts from the build layer above.
- FROM gcr.io/google.com/cloudsdktool/cloud-sdk:406.0.0-alpine as main
+ FROM gcr.io/google.com/cloudsdktool/cloud-sdk:408.0.1-alpine as main

RUN apk update && apk upgrade
RUN python3 -m ensurepip --upgrade
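Only the top of the main Dockerfile appears in this hunk; the unchanged remainder copies the build stage's artifacts into the main stage. As a minimal sketch of the resulting pattern (the artifact paths follow from pip3 install --user and ./configure --prefix /bcftools in the build-base image, but the COPY and PATH lines are illustrative assumptions, not the verbatim file contents):

# Sketch of the overall pattern, not the verbatim Dockerfile.
FROM us.gcr.io/broad-dsde-methods/variantstore:2022-11-08-alpine-build-base as build
COPY requirements.txt requirements.txt
RUN pip3 install --user -r requirements.txt

FROM gcr.io/google.com/cloudsdktool/cloud-sdk:408.0.1-alpine as main
RUN apk update && apk upgrade
RUN python3 -m ensurepip --upgrade

# pip3 install --user artifacts land in /root/.local; bcftools was configured
# with --prefix /bcftools in the build-base image.
COPY --from=build /root/.local /root/.local
COPY --from=build /bcftools /bcftools
# Assumed PATH handling; the real file may differ.
ENV PATH="/root/.local/bin:/bcftools/bin:${PATH}"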
77 changes: 77 additions & 0 deletions scripts/variantstore/wdl/extract/build_base.Dockerfile
@@ -0,0 +1,77 @@
# This Dockerfile creates a "build-base" image with tools and libraries required to build the tools and libraries used
# in the Genomic Variant Store pipeline. The Alpine version of the Google Cloud SDK is used as the base image which is
# not only the most compact of the Google Cloud SDK Docker images, but is also the image currently used by Cromwell for
# (de)localization of files in Google Cloud Storage. Sharing the base image with Cromwell's GCS localization should
# result in reuse of a cached copy of this base (and by far largest) image layer when running GVS pipelines in Terra /
# Cromwell.
#
# Because this is an Alpine-based image it is more bare-bones than its Debian-based peers. Key components missing here
# are the Apache Arrow library (a requirement for pyarrow which in turn is a requirement for the google-cloud-bigquery
# Python module) and bcftools. Compiling all these tools makes this a fairly expensive image to create (an hour or so
# under ideal circumstances, potentially much longer on low memory and/or non-x86 build hosts). Since this image isn't
# expected to change often it's broken out into a separate "build-base" image that can effectively be globally cached
# and referenced from the main Dockerfile.
FROM gcr.io/google.com/cloudsdktool/cloud-sdk:408.0.1-alpine

RUN apk update && apk upgrade
RUN python3 -m ensurepip --upgrade

# Add all required build tools. These will not be added to the main stage as they are only required to build PyArrow
# and bcftools but not to use them.
RUN apk add autoconf bash cmake g++ gcc make ninja python3-dev git openssl-dev zlib-dev xz-dev bzip2-dev curl-dev

# Unfortunately neither pyarrow nor google-cloud-bigquery will fetch or build Apache Arrow when `pip install`ed from
# this base image. Therefore we do the Apache Arrow build ourselves. In order to keep the final image size small,
# the GVS image build follows the usual multi-stage pattern of "build" stage / "main" stage, with this build-base
# image serving as the build stage.
# https://docs.docker.com/build/building/multi-stage/#use-multi-stage-builds
#
# The build stage installs the required development tools, downloads the Apache Arrow source bundle and builds all
# required components including Apache Arrow C++ libraries, pyarrow Python module, and all pyarrow dependencies
# including the numpy Python module. The main stage will then use the same base image and copy over the artifacts
# produced by the build stage without having to install development tools or clean up after a build.

ARG ARROW_VERSION=10.0.0
RUN cd / && \
curl -O https://dlcdn.apache.org/arrow/arrow-$ARROW_VERSION/apache-arrow-$ARROW_VERSION.tar.gz && \
tar xfz apache-arrow-$ARROW_VERSION.tar.gz

# Pyarrow build instructions from https://arrow.apache.org/docs/developers/python.html#python-development
# Modified slightly for the requirements of this installation:
# - Download a static source tarball rather than cloning the git repo.
# - Use `ninja` to build the C++ libraries as the `make` system doesn't seem to work as of Arrow 10.0.0.
# - Install PyArrow and its dependencies specifying the --user flag so all artifacts go to the /root/.local directory
# which can easily be copied to the main stage below.
ARG ARROW_SRC_DIR=/apache-arrow-$ARROW_VERSION
RUN pip3 install --user -r $ARROW_SRC_DIR/python/requirements-build.txt

RUN mkdir /dist
RUN mkdir $ARROW_SRC_DIR/cpp/build && \
cd $ARROW_SRC_DIR/cpp/build && \
cmake .. --preset ninja-release-python && \
cmake --build . && \
cmake --install .

ARG PYARROW_WITH_PARQUET=1
ARG PYARROW_WITH_DATASET=1
ARG PYARROW_PARALLEL=4
RUN cd $ARROW_SRC_DIR/python && \
python3 setup.py build_ext --inplace && \
pip3 install wheel && \
python3 setup.py build_ext --build-type=release \
--bundle-arrow-cpp bdist_wheel && \
pip3 install --user /apache-arrow-$ARROW_VERSION/python/dist/pyarrow-$ARROW_VERSION-*.whl

# Straightforward bcftools build following these instructions:
# https://github.com/samtools/bcftools/blob/develop/INSTALL
ARG BCFTOOLS_VERSION=1.16
RUN mkdir /bcftools bcftools-build && \
cd bcftools-build && \
git clone --recurse-submodules https://github.com/samtools/htslib.git && \
git clone https://github.com/samtools/bcftools.git && \
cd bcftools && \
git checkout tags/$BCFTOOLS_VERSION -b $BCFTOOLS_VERSION && \
autoheader && \
autoconf && \
./configure --prefix /bcftools && \
make && \
make install
26 changes: 26 additions & 0 deletions scripts/variantstore/wdl/extract/build_build_base_docker.sh
@@ -0,0 +1,26 @@
#!/usr/bin/env bash

if [ $# -lt 1 ]; then
echo "USAGE: ./build_build_base_docker.sh [DOCKER_TAG_STRING] [OPTIONAL:LATEST]"
echo " e.g.: ./build_build_base_docker.sh 2022-11-04-alpine-build-base"
exit 1
fi

if [[ ! "$1" == *-build-base ]]; then
echo "Specified tag '$1' does end with '-build-base'."
echo "build_build_base_docker.sh is intended for building build base images only."
exit 1
fi

set -o errexit -o nounset -o pipefail -o xtrace

BASE_REPO="broad-dsde-methods/variantstore"
REPO_WITH_TAG="${BASE_REPO}:${1}"
GCR_TAG="us.gcr.io/${REPO_WITH_TAG}"

docker build --file build_base.Dockerfile . -t "${REPO_WITH_TAG}"

docker tag "${REPO_WITH_TAG}" "${GCR_TAG}"
docker push "${GCR_TAG}"

echo "docker image pushed to \"${GCR_TAG}\""
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/extract/build_docker.sh
@@ -1,6 +1,6 @@
if [ $# -lt 1 ]; then
echo "USAGE: ./build_docker.sh [DOCKER_TAG_STRING] [OPTIONAL:LATEST]"
echo " e.g.: ./build_docker.sh mybranch_20210403"
echo " e.g.: ./build_docker.sh 2022-11-04-alpine"
exit 1
fi

