From 46b665961e2dec5140a9ae31b091eccaf81c6d8a Mon Sep 17 00:00:00 2001 From: Fan Ting Wei Date: Tue, 10 Oct 2023 13:57:17 +0800 Subject: [PATCH] feat(build): add spark 3.4.1 (#59) * add combinations for 3.4.1 * quotes * spaces * apply vars * fix issues with python versions and libreadline * define path for poetry * drop python 3.7 runtime * edits --- .github/workflows/ci.yml | 160 ++++++++++++++++++++------------------- Dockerfile | 12 ++- README.md | 9 ++- templates/ci.yml.tmpl | 7 +- templates/vars.yml | 16 +++- 5 files changed, 114 insertions(+), 90 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d0b0de1..afbf64c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,246 +15,246 @@ jobs: strategy: matrix: version: - - spark: "3.1.3" - hadoop: "2.7.4" - scala: "2.12" - java: "8" - python: "3.7" - spark: "3.1.3" hadoop: "2.7.4" scala: "2.12" java: "8" python: "3.8" + image: "-buster" - spark: "3.1.3" hadoop: "2.7.4" scala: "2.12" java: "8" python: "3.9" - - spark: "3.1.3" - hadoop: "3.2.0" - scala: "2.12" - java: "8" - python: "3.7" + image: "-buster" - spark: "3.1.3" hadoop: "3.2.0" scala: "2.12" java: "8" python: "3.8" + image: "-buster" - spark: "3.1.3" hadoop: "3.2.0" scala: "2.12" java: "8" python: "3.9" - - spark: "3.1.3" - hadoop: "2.7.4" - scala: "2.12" - java: "11" - python: "3.7" + image: "-buster" - spark: "3.1.3" hadoop: "2.7.4" scala: "2.12" java: "11" python: "3.8" + image: "-buster" - spark: "3.1.3" hadoop: "2.7.4" scala: "2.12" java: "11" python: "3.9" - - spark: "3.1.3" - hadoop: "3.2.0" - scala: "2.12" - java: "11" - python: "3.7" + image: "-buster" - spark: "3.1.3" hadoop: "3.2.0" scala: "2.12" java: "11" python: "3.8" + image: "-buster" - spark: "3.1.3" hadoop: "3.2.0" scala: "2.12" java: "11" python: "3.9" - - spark: "3.2.2" - hadoop: "3.3.1" - scala: "2.12" - java: "8" - python: "3.7" + image: "-buster" - spark: "3.2.2" hadoop: "3.3.1" scala: "2.12" java: "8" python: "3.8" + image: 
"-buster" - spark: "3.2.2" hadoop: "3.3.1" scala: "2.12" java: "8" python: "3.9" - - spark: "3.2.2" - hadoop: "3.3.1" - scala: "2.13" - java: "8" - python: "3.7" + image: "-buster" - spark: "3.2.2" hadoop: "3.3.1" scala: "2.13" java: "8" python: "3.8" + image: "-buster" - spark: "3.2.2" hadoop: "3.3.1" scala: "2.13" java: "8" python: "3.9" - - spark: "3.2.2" - hadoop: "3.3.1" - scala: "2.12" - java: "11" - python: "3.7" + image: "-buster" - spark: "3.2.2" hadoop: "3.3.1" scala: "2.12" java: "11" python: "3.8" + image: "-buster" - spark: "3.2.2" hadoop: "3.3.1" scala: "2.12" java: "11" python: "3.9" - - spark: "3.2.2" - hadoop: "3.3.1" - scala: "2.13" - java: "11" - python: "3.7" + image: "-buster" - spark: "3.2.2" hadoop: "3.3.1" scala: "2.13" java: "11" python: "3.8" + image: "-buster" - spark: "3.2.2" hadoop: "3.3.1" scala: "2.13" java: "11" python: "3.9" - - spark: "3.3.0" - hadoop: "3.3.2" - scala: "2.12" - java: "8" - python: "3.7" + image: "-buster" - spark: "3.3.0" hadoop: "3.3.2" scala: "2.12" java: "8" python: "3.8" + image: "-buster" - spark: "3.3.0" hadoop: "3.3.2" scala: "2.12" java: "8" python: "3.9" - - spark: "3.3.0" - hadoop: "3.3.2" - scala: "2.13" - java: "8" - python: "3.7" + image: "-buster" - spark: "3.3.0" hadoop: "3.3.2" scala: "2.13" java: "8" python: "3.8" + image: "-buster" - spark: "3.3.0" hadoop: "3.3.2" scala: "2.13" java: "8" python: "3.9" - - spark: "3.3.0" - hadoop: "3.3.2" - scala: "2.12" - java: "11" - python: "3.7" + image: "-buster" - spark: "3.3.0" hadoop: "3.3.2" scala: "2.12" java: "11" python: "3.8" + image: "-buster" - spark: "3.3.0" hadoop: "3.3.2" scala: "2.12" java: "11" python: "3.9" - - spark: "3.3.0" - hadoop: "3.3.2" - scala: "2.13" - java: "11" - python: "3.7" + image: "-buster" - spark: "3.3.0" hadoop: "3.3.2" scala: "2.13" java: "11" python: "3.8" + image: "-buster" - spark: "3.3.0" hadoop: "3.3.2" scala: "2.13" java: "11" python: "3.9" - - spark: "3.3.1" - hadoop: "3.3.2" - scala: "2.12" - java: "8" - python: 
"3.7" + image: "-buster" - spark: "3.3.1" hadoop: "3.3.2" scala: "2.12" java: "8" python: "3.8" + image: "-buster" - spark: "3.3.1" hadoop: "3.3.2" scala: "2.12" java: "8" python: "3.9" - - spark: "3.3.1" - hadoop: "3.3.2" - scala: "2.13" - java: "8" - python: "3.7" + image: "-buster" - spark: "3.3.1" hadoop: "3.3.2" scala: "2.13" java: "8" python: "3.8" + image: "-buster" - spark: "3.3.1" hadoop: "3.3.2" scala: "2.13" java: "8" python: "3.9" - - spark: "3.3.1" - hadoop: "3.3.2" - scala: "2.12" - java: "11" - python: "3.7" + image: "-buster" - spark: "3.3.1" hadoop: "3.3.2" scala: "2.12" java: "11" python: "3.8" + image: "-buster" - spark: "3.3.1" hadoop: "3.3.2" scala: "2.12" java: "11" python: "3.9" + image: "-buster" - spark: "3.3.1" hadoop: "3.3.2" scala: "2.13" java: "11" - python: "3.7" + python: "3.8" + image: "-buster" - spark: "3.3.1" hadoop: "3.3.2" scala: "2.13" java: "11" + python: "3.9" + image: "-buster" + - spark: "3.4.1" + hadoop: "3.3.4" + scala: "2.12" + java: "8" + python: "3.8" + image: "" + - spark: "3.4.1" + hadoop: "3.3.4" + scala: "2.12" + java: "8" + python: "3.9" + image: "" + - spark: "3.4.1" + hadoop: "3.3.4" + scala: "2.13" + java: "8" + python: "3.8" + image: "" + - spark: "3.4.1" + hadoop: "3.3.4" + scala: "2.13" + java: "8" + python: "3.9" + image: "" + - spark: "3.4.1" + hadoop: "3.3.4" + scala: "2.12" + java: "11" python: "3.8" - - spark: "3.3.1" - hadoop: "3.3.2" + image: "" + - spark: "3.4.1" + hadoop: "3.3.4" + scala: "2.12" + java: "11" + python: "3.9" + image: "" + - spark: "3.4.1" + hadoop: "3.3.4" + scala: "2.13" + java: "11" + python: "3.8" + image: "" + - spark: "3.4.1" + hadoop: "3.3.4" scala: "2.13" java: "11" python: "3.9" + image: "" runs-on: ubuntu-20.04 env: IMAGE_NAME: spark-k8s-addons @@ -265,6 +265,7 @@ jobs: SCALA_VERSION: "${{ matrix.version.scala }}" JAVA_VERSION: "${{ matrix.version.java }}" PYTHON_VERSION: "${{ matrix.version.python }}" + IMAGE_VERSION: "${{ matrix.version.image }}" steps: - name: Set up Java 
uses: actions/setup-java@v2 @@ -291,7 +292,8 @@ jobs: --build-arg JAVA_VERSION="${JAVA_VERSION}" \ --build-arg HADOOP_VERSION="${HADOOP_VERSION}" \ --build-arg SCALA_VERSION="${SCALA_VERSION}" \ - --build-arg PYTHON_VERSION="${PYTHON_VERSION}" + --build-arg PYTHON_VERSION="${PYTHON_VERSION}" \ + --build-arg IMAGE_VERSION="${IMAGE_VERSION}" - name: Push Docker image run: bash push-images.sh env: diff --git a/Dockerfile b/Dockerfile index 2b418ef..26dfc06 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,10 +5,10 @@ ARG HADOOP_VERSION ARG SCALA_VERSION ARG JAVA_VERSION ARG PYTHON_VERSION -ARG DEBIAN_DIST=buster +ARG IMAGE_VERSION # For copying over of Python set-up -FROM python:${PYTHON_VERSION}-${DEBIAN_DIST} as python_base +FROM python:${PYTHON_VERSION}${IMAGE_VERSION} as python_base # While it might make sense to start from `dsaidgovsg/spark-k8s-py` instead, # it is easier to just COPY over from the above image just the python directory @@ -26,6 +26,7 @@ ENV PYTHONPATH="${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py ARG HADOOP_VERSION ARG PYTHON_VERSION +ARG IMAGE_VERSION USER root SHELL ["/bin/bash", "-c"] @@ -39,7 +40,12 @@ RUN set -euo pipefail && \ spark-shell --version; \ pyspark --version; \ # Required extra deps - apt-get update && apt-get install --no-install-recommends -y libexpat1 libreadline7 tk; \ + if [ "${IMAGE_VERSION}" = "-buster" ]; then \ + export LIBREADLINE_VERSION=7 ; \ + else \ + export LIBREADLINE_VERSION=8 ; \ + fi ; \ + apt-get update && apt-get install --no-install-recommends -y libexpat1 libreadline"${LIBREADLINE_VERSION}" tk; \ rm -rf /var/lib/apt/lists/*; \ ldconfig; \ # Test every command to return non-error status code for help diff --git a/README.md b/README.md index 8acdedf..f9b4536 100644 --- a/README.md +++ b/README.md @@ -8,18 +8,18 @@ K8s Docker images. The Spark K8s Docker images are built using [this repository](https://github.com/dsaidgovsg/spark-k8s). 
-Note that the images here are Debian based because of how the official script -generates the Spark-Kubernetes images. +Note that the images for Spark versions below 3.4.0 are Debian-based because of how the official script generates the Spark-Kubernetes images. For Spark 3.4.0 and above, the official script generates Ubuntu-based images instead. ## How to build ```bash BASE_VERSION=v3 -SPARK_VERSION=3.3.0 +SPARK_VERSION=3.4.1 JAVA_VERSION=11 -HADOOP_VERSION=3.3.2 +HADOOP_VERSION=3.3.4 SCALA_VERSION=2.13 PYTHON_VERSION=3.9 +IMAGE_VERSION="" docker pull dsaidgovsg/spark-k8s-py:${BASE_VERSION}_${SPARK_VERSION}_hadoop-${HADOOP_VERSION}_scala-${SCALA_VERSION}_java-${JAVA_VERSION} @@ -31,6 +31,7 @@ docker build -t "${IMAGE_NAME}" \ --build-arg HADOOP_VERSION="${HADOOP_VERSION}" \ --build-arg SCALA_VERSION="${SCALA_VERSION}" \ --build-arg PYTHON_VERSION="${PYTHON_VERSION}" \ + --build-arg IMAGE_VERSION="${IMAGE_VERSION}" \ . ``` diff --git a/templates/ci.yml.tmpl b/templates/ci.yml.tmpl index 631ef70..3d35128 100644 --- a/templates/ci.yml.tmpl +++ b/templates/ci.yml.tmpl @@ -21,11 +21,14 @@ jobs: {%- for hadoop in v.hadoop %} {%- for scala in v.scala %} {%- for python in v.python %} +{%- for image in v.image %} - spark: "{{ spark }}" hadoop: "{{ hadoop }}" scala: "{{ scala }}" java: "{{ java }}" python: "{{ python }}" + image: "{{ image }}" +{%- endfor %} {%- endfor %} {%- endfor %} {%- endfor %} @@ -43,6 +46,7 @@ jobs: SCALA_VERSION: "${{ matrix.version.scala }}" JAVA_VERSION: "${{ matrix.version.java }}" PYTHON_VERSION: "${{ matrix.version.python }}" + IMAGE_VERSION: "${{ matrix.version.image }}" {%- endraw %} steps: - name: Set up Java @@ -72,7 +76,8 @@ jobs: --build-arg JAVA_VERSION="${JAVA_VERSION}" \ --build-arg HADOOP_VERSION="${HADOOP_VERSION}" \ --build-arg SCALA_VERSION="${SCALA_VERSION}" \ - --build-arg PYTHON_VERSION="${PYTHON_VERSION}" + --build-arg PYTHON_VERSION="${PYTHON_VERSION}" \ + --build-arg IMAGE_VERSION="${IMAGE_VERSION}" -
name: Push Docker image run: bash push-images.sh env: diff --git a/templates/vars.yml b/templates/vars.yml index 7df9959..67cc961 100644 --- a/templates/vars.yml +++ b/templates/vars.yml @@ -6,16 +6,26 @@ versions: hadoop: ["2.7.4", "3.2.0"] scala: ["2.12"] java: ["8", "11"] - python: ["3.7", "3.8", "3.9"] + python: ["3.8", "3.9"] + image: ["-buster"] - spark: ["3.2.2"] hadoop: ["3.3.1"] scala: ["2.12", "2.13"] java: ["8", "11"] - python: ["3.7", "3.8", "3.9"] + python: ["3.8", "3.9"] + image: ["-buster"] - spark: ["3.3.0", "3.3.1"] hadoop: ["3.3.2"] scala: ["2.12", "2.13"] java: ["8", "11"] - python: ["3.7", "3.8", "3.9"] + python: ["3.8", "3.9"] + image: ["-buster"] + +- spark: ['3.4.1'] + hadoop: ['3.3.4'] + scala: ['2.12', '2.13'] + java: ['8', '11'] + python: ["3.8", "3.9"] + image: [""]