Alpine based Variant Store Docker image [VS-648] #8044

Merged 10 commits on Oct 6, 2022
2 changes: 1 addition & 1 deletion .dockstore.yml
@@ -208,7 +208,7 @@ workflows:
branches:
- master
- ah_var_store
-   - vs_616_split_hail
+   - vs_648_alpine
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
@@ -264,7 +264,7 @@ task Add_AS_MAX_VQSLOD_ToVcf {
File input_vcf
String output_basename

- String docker = "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
+ String docker = "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
Int cpu = 1
Int memory_mb = 3500
Int disk_size_gb = ceil(2*size(input_vcf, "GiB")) + 50
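Every updated task below makes the same one-line change, pointing the runtime at the new Alpine-based image tag. As a quick sanity check (a hypothetical command, assuming gcloud credentials with read access to the broad-dsde-methods registry), the existence of the new tag can be confirmed with:

gcloud container images list-tags us.gcr.io/broad-dsde-methods/variantstore \
  --filter='tags:2022-10-01-alpine'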
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCallsetCost.wdl
@@ -62,7 +62,7 @@ task WorkflowComputeCosts {
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
}

output {
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCreateVAT.wdl
@@ -124,7 +124,7 @@ task MakeSubpopulationFilesAndReadSchemaFiles {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
memory: "1 GB"
preemptible: 3
cpu: "1"
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
@@ -169,7 +169,7 @@ task ExtractAnAcAfFromVCF {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
maxRetries: 3
memory: "16 GB"
preemptible: 3
@@ -291,7 +291,7 @@ task PrepAnnotationJson
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
memory: "8 GB"
preemptible: 5
cpu: "1"
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
@@ -277,7 +277,7 @@ task GenerateHailScripts {
File hail_create_vat_inputs_script = 'hail_create_vat_inputs.py'
}
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
disks: "local-disk 500 HDD"
}
}
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -404,7 +404,7 @@ task CurateInputLists {
--output_files True
>>>
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
memory: "3 GB"
disks: "local-disk 100 HDD"
bootDiskSizeGb: 15
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
@@ -243,7 +243,7 @@ task PopulateAltAlleleTable {
done
>>>
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
memory: "3 GB"
disks: "local-disk 10 HDD"
cpu: 1
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
@@ -105,7 +105,7 @@ task PrepareRangesCallsetTask {
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
memory: "3 GB"
disks: "local-disk 100 HDD"
bootDiskSizeGb: 15
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
@@ -345,7 +345,7 @@ task ScaleXYBedValues {
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-09-28-slim"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine"
maxRetries: 3
memory: "7 GB"
preemptible: 3
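Because the image string is duplicated across many WDLs, a reviewer might check that no stale references to the old slim tag remain (a hypothetical spot check, run from the repo root):

git grep -n 'variantstore:2022-09-28-slim' -- scripts/variantstore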
99 changes: 92 additions & 7 deletions scripts/variantstore/wdl/extract/Dockerfile
@@ -1,13 +1,98 @@
- FROM gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-slim as build
+ # The image produced by this Dockerfile contains tools and libraries required to support the Genomic Variant Store
+ # pipeline. The Alpine version of the Google Cloud SDK is used as the base image; it is not only the most compact of
+ # the Google Cloud SDK Docker images, but also the image currently used by Cromwell for (de)localization of files in
+ # Google Cloud Storage. Sharing the base image with Cromwell's GCS localization should result in reuse of a cached
+ # copy of this base (and by far largest) image layer when running GVS pipelines in Terra / Cromwell.
+ #
+ # Because this is an Alpine-based image, it is more bare-bones than its Debian-based peers. A key component missing
+ # here is the Apache Arrow library, which is a requirement for pyarrow, which in turn is a requirement for the
+ # google-cloud-bigquery Python module.
+ #
+ # Unfortunately neither pyarrow nor google-cloud-bigquery will fetch or build Apache Arrow when `pip install`ed from
+ # this base image, so we do the Apache Arrow build ourselves. To keep the final image size small, this Dockerfile
+ # does a multi-stage build following the usual pattern of "build" stage / "main" stage:
+ # https://docs.docker.com/build/building/multi-stage/#use-multi-stage-builds
+ #
+ # The build stage installs the required development tools, downloads the Apache Arrow source bundle, and builds all
+ # required components, including the Apache Arrow C++ libraries, the pyarrow Python module, and all pyarrow
+ # dependencies including the numpy Python module. The main stage then uses the same base image and copies over the
+ # artifacts produced by the build stage without having to install development tools or clean up after a build.
+ FROM gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine as build

- # Copy the application's requirements.txt and run pip to install
- COPY requirements.txt /app/requirements.txt
- RUN apt-get update && apt-get -y install cmake bcftools jq
- RUN curl -O https://bootstrap.pypa.io/get-pip.py
- RUN python3 get-pip.py
- RUN pip3 install -r /app/requirements.txt
+ RUN apk update && apk upgrade
+ RUN python3 -m ensurepip --upgrade

+ # Add all required build tools. These will not be added to the main stage, as they are only required to build
+ # PyArrow but not to use it.
+ RUN apk add autoconf bash cmake g++ gcc make ninja python3-dev git openssl-dev

+ # Build Apache Arrow version 8.0.0, as version 9.0.0 does not compile under Alpine:
+ # https://issues.apache.org/jira/browse/ARROW-17329
+ ARG ARROW_VERSION=8.0.0
+ RUN cd / && \
+     curl -O https://dlcdn.apache.org/arrow/arrow-$ARROW_VERSION/apache-arrow-$ARROW_VERSION.tar.gz && \
+     tar xfz apache-arrow-$ARROW_VERSION.tar.gz

+ # Pyarrow build instructions from https://arrow.apache.org/docs/developers/python.html#python-development
+ # Modified slightly for the requirements of this installation:
+ # - Download a static source tarball rather than cloning the git repo.
+ # - Use build type Release rather than Debug.
+ # - Do not build tests.
+ # - Install PyArrow and its dependencies with the --user flag so all artifacts go to the /root/.local directory,
+ #   which can easily be copied to the main stage below.
+ RUN pip3 install --user -r /apache-arrow-$ARROW_VERSION/python/requirements-build.txt

+ RUN mkdir /dist
+ ARG ARROW_HOME=/dist
+ ARG LD_LIBRARY_PATH=/dist/lib:$LD_LIBRARY_PATH
+ RUN mkdir /apache-arrow-$ARROW_VERSION/cpp/build && \
+     cd /apache-arrow-$ARROW_VERSION/cpp/build && \
+     cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+           -DCMAKE_INSTALL_LIBDIR=lib \
+           -DCMAKE_BUILD_TYPE=Release \
+           -DARROW_DATASET=ON \
+           -DARROW_WITH_BZ2=ON \
+           -DARROW_WITH_ZLIB=ON \
+           -DARROW_WITH_ZSTD=ON \
+           -DARROW_WITH_LZ4=ON \
+           -DARROW_WITH_SNAPPY=ON \
+           -DARROW_WITH_BROTLI=ON \
+           -DARROW_PARQUET=ON \
+           -DPARQUET_REQUIRE_ENCRYPTION=ON \
+           -DARROW_PYTHON=ON \
+           -DARROW_BUILD_TESTS=OFF \
+           .. && \
+     make -j4 && \
+     make install

+ ARG PYARROW_WITH_PARQUET=1
+ ARG PYARROW_WITH_DATASET=1
+ ARG PYARROW_PARALLEL=4
+ RUN cd /apache-arrow-$ARROW_VERSION/python && \
+     python3 setup.py build_ext --inplace && \
+     pip3 install wheel && \
+     python3 setup.py build_ext --build-type=release \
+         --bundle-arrow-cpp bdist_wheel && \
+     pip3 install --user /apache-arrow-$ARROW_VERSION/python/dist/pyarrow-$ARROW_VERSION-*.whl

+ # Install all of our variantstore Python requirements.
+ COPY requirements.txt requirements.txt
+ RUN pip3 install --user -r requirements.txt

+ # The main stage does not install development tools; instead it copies artifacts from the build stage above.
+ FROM gcr.io/google.com/cloudsdktool/cloud-sdk:404.0.0-alpine as main

+ RUN apk update && apk upgrade
+ RUN python3 -m ensurepip --upgrade

+ # Add any generally useful Alpine packages here.
+ RUN apk add --no-cache jq

+ # The build stage generated Python artifacts to /root/.local via `pip install --user`, so grab all of those.
+ COPY --from=build /root/.local /root/.local

# Copy the application source code.
RUN mkdir /app
COPY *.py /app
COPY *.sql /app
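For context (not part of this diff), a sketch of how this image might be built and pushed under the tag the WDLs above reference, assuming the build is run from this Dockerfile's directory:

cd scripts/variantstore/wdl/extract
docker build -t us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine .
docker push us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine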

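And a hypothetical smoke test that the pyarrow artifacts copied from the build stage into /root/.local import cleanly in the final Alpine image (this should print 8.0.0, matching ARROW_VERSION above):

docker run --rm us.gcr.io/broad-dsde-methods/variantstore:2022-10-01-alpine \
  python3 -c 'import pyarrow; print(pyarrow.__version__)'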