From b5d8196b3f99d9f7cd81b112764a2a0184b1d944 Mon Sep 17 00:00:00 2001 From: akhurana001 <34587798+akhurana001@users.noreply.github.com> Date: Mon, 27 Jul 2020 16:56:33 -0700 Subject: [PATCH] Support Spark 3.0 (#142) * Support Spark 3.0 * Default to Spark 2.4 --- README.md | 5 +-- flytekit/__init__.py | 2 +- flytekit/plugins/__init__.py | 6 ++++ flytekit/tools/lazy_loader.py | 5 ++- scripts/flytekit_install_spark3.sh | 54 ++++++++++++++++++++++++++++++ setup.py | 1 + 6 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 scripts/flytekit_install_spark3.sh diff --git a/README.md b/README.md index 9e4be08a88..eb71b181d8 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,8 @@ pip install flytekit If `@spark_task` is to be used, one should install the `spark` plugin. ```bash -pip install flytekit[spark] +pip install flytekit[spark]  # for Spark 2.4.x +pip install flytekit[spark3]  # for Spark 3.x ``` #### Schema @@ -74,7 +75,7 @@ To install all or multiple available plugins, one can specify them individually: pip install flytekit[sidecar,spark,schema] ``` -Or install them with the `all` directive. +Or install them with the `all` directive. `all` defaults to Spark 2.4.x currently. 
```bash pip install flytekit[all] diff --git a/flytekit/__init__.py b/flytekit/__init__.py index 23b5d674a8..93ed670e95 100644 --- a/flytekit/__init__.py +++ b/flytekit/__init__.py @@ -2,4 +2,4 @@ import flytekit.plugins -__version__ = "0.10.9" +__version__ = "0.10.10" diff --git a/flytekit/plugins/__init__.py b/flytekit/plugins/__init__.py index 3235244d79..a155108136 100644 --- a/flytekit/plugins/__init__.py +++ b/flytekit/plugins/__init__.py @@ -25,6 +25,12 @@ [pyspark] ) +_lazy_loader.LazyLoadPlugin( + "spark3", + ["pyspark>=3.0.0"], + [pyspark] +) + _lazy_loader.LazyLoadPlugin( "sidecar", ["k8s-proto>=0.0.3,<1.0.0"], diff --git a/flytekit/tools/lazy_loader.py b/flytekit/tools/lazy_loader.py index 425ff6f2e3..cbf9367f79 100644 --- a/flytekit/tools/lazy_loader.py +++ b/flytekit/tools/lazy_loader.py @@ -26,7 +26,10 @@ def get_extras_require(cls): d = cls.LAZY_LOADING_PLUGINS.copy() all_plugins = [] for k in d: - all_plugins.extend(d[k]) + # Default to Spark 2.4.x: keep the "spark3" requirements out of the "all" extra. + if k !="spark3": + all_plugins.extend(d[k]) + d['all'] = all_plugins return d diff --git a/scripts/flytekit_install_spark3.sh b/scripts/flytekit_install_spark3.sh new file mode 100644 index 0000000000..190f1989eb --- /dev/null +++ b/scripts/flytekit_install_spark3.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Fetches and installs Spark and its dependencies. 
To be invoked by the Dockerfile + +# echo commands to the terminal output +set -ex + +# Install JDK +apt-get update -y && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:openjdk-r/ppa && \ + apt-get update -y && \ + apt-get install -y --force-yes ca-certificates-java && \ + apt-get install -y --force-yes openjdk-8-jdk && \ + apt-get install -y wget && \ + update-java-alternatives -s java-1.8.0-openjdk-amd64 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +mkdir -p /opt/spark +mkdir -p /opt/spark/work-dir +touch /opt/spark/RELEASE + +# Fetch Spark Distribution +wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz -O spark-dist.tgz +echo '98f6b92e5c476d7abb93cc179c2616aa5dc897da25753bd197e20ef54a28d945 spark-dist.tgz' | sha256sum --check +mkdir -p spark-dist +tar -xvf spark-dist.tgz -C spark-dist --strip-components 1 + +#Copy over required files +cp -rf spark-dist/jars /opt/spark/jars +cp -rf spark-dist/examples /opt/spark/examples +cp -rf spark-dist/python /opt/spark/python +cp -rf spark-dist/bin /opt/spark/bin +cp -rf spark-dist/sbin /opt/spark/sbin +cp -rf spark-dist/data /opt/spark/data +# Entrypoint for Driver/Executor pods +cp spark-dist/kubernetes/dockerfiles/spark/entrypoint.sh /opt/entrypoint.sh +chmod +x /opt/entrypoint.sh + +rm -rf spark-dist.tgz +rm -rf spark-dist + +# Fetch Hadoop Distribution with AWS Support +wget http://apache.mirrors.tds.net/hadoop/common/hadoop-2.7.7/hadoop-2.7.7.tar.gz -O hadoop-dist.tgz +echo 'd129d08a2c9dafec32855a376cbd2ab90c6a42790898cabbac6be4d29f9c2026 hadoop-dist.tgz' | sha256sum --check +mkdir -p hadoop-dist +tar -xvf hadoop-dist.tgz -C hadoop-dist --strip-components 1 + +cp -rf hadoop-dist/share/hadoop/tools/lib/hadoop-aws-2.7.7.jar /opt/spark/jars +cp -rf hadoop-dist/share/hadoop/tools/lib/aws-java-sdk-1.7.4.jar /opt/spark/jars + +rm -rf hadoop-dist.tgz +rm -rf hadoop-dist \ No newline at end of file diff --git a/setup.py b/setup.py index 
6358c3386d..1577394b06 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ extras_require=extras_require, scripts=[ 'scripts/flytekit_install_spark.sh', + 'scripts/flytekit_install_spark3.sh', 'scripts/flytekit_build_image.sh', 'scripts/flytekit_venv' ],