Skip to content

Commit

Permalink
Support Spark 3.0 (#142)
Browse files Browse the repository at this point in the history
* Support Spark 3.0

* Default to Spark 2.4
  • Loading branch information
akhurana001 authored Jul 27, 2020
1 parent bf2ff3d commit b5d8196
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 4 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ pip install flytekit
If `@spark_task` is to be used, one should install the `spark` plugin.

```bash
pip install flytekit[spark]
pip install flytekit[spark]   # for Spark 2.4.x
pip install flytekit[spark3]  # for Spark 3.x
```

#### Schema
Expand Down Expand Up @@ -74,7 +75,7 @@ To install all or multiple available plugins, one can specify them individually:
pip install flytekit[sidecar,spark,schema]
```

Or install them with the `all` directive.
Or install them with the `all` directive. Note that `all` currently installs the Spark 2.4.x plugin (`spark`), not `spark3`.

```bash
pip install flytekit[all]
Expand Down
2 changes: 1 addition & 1 deletion flytekit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

import flytekit.plugins

__version__ = "0.10.9"
__version__ = "0.10.10"
6 changes: 6 additions & 0 deletions flytekit/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@
[pyspark]
)

# Register the "spark3" extra: same pyspark-backed plugin, pinned to pyspark 3.x.
_lazy_loader.LazyLoadPlugin("spark3", ["pyspark>=3.0.0"], [pyspark])

_lazy_loader.LazyLoadPlugin(
"sidecar",
["k8s-proto>=0.0.3,<1.0.0"],
Expand Down
5 changes: 4 additions & 1 deletion flytekit/tools/lazy_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ def get_extras_require(cls):
d = cls.LAZY_LOADING_PLUGINS.copy()
all_plugins = []
for k in d:
all_plugins.extend(d[k])
# Default to Spark 2.4.x .
if k !="spark3":
all_plugins.extend(d[k])

d['all'] = all_plugins
return d

Expand Down
54 changes: 54 additions & 0 deletions scripts/flytekit_install_spark3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash

# Fetches and installs Spark 3.0.0 and its dependencies (OpenJDK 8 and the
# Hadoop 2.7.7 AWS jars) under /opt/spark. To be invoked by the Dockerfile.

# Echo commands to the terminal output and abort on the first failing command.
set -ex

# Install JDK 8 from the OpenJDK PPA, plus wget for the downloads below.
# `-y` on add-apt-repository keeps the step non-interactive during image builds.
apt-get update -y && \
    apt-get install -y software-properties-common && \
    add-apt-repository -y ppa:openjdk-r/ppa && \
    apt-get update -y && \
    apt-get install -y --force-yes ca-certificates-java && \
    apt-get install -y --force-yes openjdk-8-jdk && \
    apt-get install -y wget && \
    update-java-alternatives -s java-1.8.0-openjdk-amd64 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Lay out the Spark installation directory.
mkdir -p /opt/spark
mkdir -p /opt/spark/work-dir
touch /opt/spark/RELEASE

# Fetch Spark Distribution and verify it against its pinned SHA-256 digest.
# With `set -e`, a checksum mismatch aborts the script before anything is unpacked.
wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz -O spark-dist.tgz
echo '98f6b92e5c476d7abb93cc179c2616aa5dc897da25753bd197e20ef54a28d945  spark-dist.tgz' | sha256sum --check
mkdir -p spark-dist
tar -xvf spark-dist.tgz -C spark-dist --strip-components 1

# Copy over required files
cp -rf spark-dist/jars /opt/spark/jars
cp -rf spark-dist/examples /opt/spark/examples
cp -rf spark-dist/python /opt/spark/python
cp -rf spark-dist/bin /opt/spark/bin
cp -rf spark-dist/sbin /opt/spark/sbin
cp -rf spark-dist/data /opt/spark/data
# Entrypoint for Driver/Executor pods
cp spark-dist/kubernetes/dockerfiles/spark/entrypoint.sh /opt/entrypoint.sh
chmod +x /opt/entrypoint.sh

# Remove the downloaded archive and the unpacked tree.
rm -rf spark-dist.tgz
rm -rf spark-dist

# Fetch Hadoop Distribution with AWS Support.
# Use the permanent Apache archive over HTTPS (as for Spark above): rotating
# mirrors only carry current releases, so an old release such as 2.7.7
# eventually disappears from them and breaks the build.
wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.7/hadoop-2.7.7.tar.gz -O hadoop-dist.tgz
echo 'd129d08a2c9dafec32855a376cbd2ab90c6a42790898cabbac6be4d29f9c2026  hadoop-dist.tgz' | sha256sum --check
mkdir -p hadoop-dist
tar -xvf hadoop-dist.tgz -C hadoop-dist --strip-components 1

# Only the hadoop-aws jar and the AWS SDK jar are copied into Spark's jars directory.
cp -rf hadoop-dist/share/hadoop/tools/lib/hadoop-aws-2.7.7.jar /opt/spark/jars
cp -rf hadoop-dist/share/hadoop/tools/lib/aws-java-sdk-1.7.4.jar /opt/spark/jars

# Remove the downloaded archive and the unpacked tree.
rm -rf hadoop-dist.tgz
rm -rf hadoop-dist
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
extras_require=extras_require,
scripts=[
'scripts/flytekit_install_spark.sh',
'scripts/flytekit_install_spark3.sh',
'scripts/flytekit_build_image.sh',
'scripts/flytekit_venv'
],
Expand Down

0 comments on commit b5d8196

Please sign in to comment.