Put shared part together for Databricks test scripts [databricks] (#8828)

* Put shared part together for Databricks test scripts

To fix issue: #8779

Consolidate the shared parts of the Databricks scripts test.sh and run_it.sh so that the scripts are easier to maintain: a change to the shared logic no longer requires updating both test.sh and run_it.sh (a minimal usage sketch follows below).

Signed-off-by: Tim Liu <[email protected]>

* Put shared part together for cudf-udf test file

Signed-off-by: Tim Liu <[email protected]>

---------

Signed-off-by: Tim Liu <[email protected]>
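
A minimal sketch of the resulting pattern (illustration only; the concrete logic lives in jenkins/databricks/common_vars.sh shown below): each Databricks test script now sources the shared file instead of repeating the environment setup.

#!/bin/bash
# Sketch only, assuming the current directory is the spark-rapids source tree
# (or LOCAL_JAR_PATH). The real scripts are test.sh and run_it.sh in the diff below.
set -ex
# Shared setup: SPARK_HOME, PYSPARK_PYTHON, PYTHONPATH, PYSP_TEST_* / SPARK_CONF handling
source jenkins/databricks/common_vars.sh
# Script-specific work follows, e.g. running the integration tests
bash integration_tests/run_pyspark_from_build.sh --runtime_env='databricks' --test_type=$TEST_TYPE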
NvTimLiu authored Aug 2, 2023
1 parent 81630d1 commit 36b63c5
Showing 4 changed files with 83 additions and 115 deletions.
59 changes: 59 additions & 0 deletions jenkins/databricks/common_vars.sh
@@ -0,0 +1,59 @@
#!/bin/bash
#
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set up SPARK_HOME if needed
if [[ -z "$SPARK_HOME" ]]; then
    # Configure the Spark environment on Databricks
    export SPARK_HOME=$DB_HOME/spark
fi

# Set PYSPARK_PYTHON to keep the version of driver/workers python consistent.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get the Python version (major.minor), e.g., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of the Python site-packages directory; packages were installed there by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES=${PYTHON_SITE_PACKAGES:-"$HOME/.local/lib/${PYTHON_VERSION}/site-packages"}

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site-packages first.
# Note that Koalas is deprecated for DB10.4+; it is recommended to use the Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
export PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"
if [[ "$TEST" == "cache_test" || "$TEST" == "cache_test.py" ]]; then
    export PYSP_TEST_spark_sql_cache_serializer="$PCBS_CONF"
fi

export TEST_TYPE=${TEST_TYPE:-"nightly"}

if [[ -n "$LOCAL_JAR_PATH" ]]; then
    export LOCAL_JAR_PATH=$LOCAL_JAR_PATH
fi

## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
    CONF_LIST=${SPARK_CONF//','/' '}
    for CONF in ${CONF_LIST}; do
        KEY=${CONF%%=*}
        VALUE=${CONF#*=}
        ## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
        export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
    done

    ## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
    export SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi
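
For illustration only (the values are hypothetical, not part of the commit): with a two-entry SPARK_CONF, the block above is equivalent to the following exports.

# SPARK_CONF='spark.foo=1,spark.bar=2' yields:
export PYSP_TEST_spark_foo=1
export PYSP_TEST_spark_bar=2
export SPARK_CONF='--conf spark.foo=1 --conf spark.bar=2'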
62 changes: 13 additions & 49 deletions jenkins/databricks/cudf_udf_test.sh
@@ -31,11 +31,6 @@
# instructions accordingly.
set -ex

# Map of software versions for each dependency.

LOCAL_JAR_PATH=${LOCAL_JAR_PATH:-''}
SPARK_CONF=${SPARK_CONF:-''}

# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
@@ -44,63 +39,32 @@ if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
fi
export PATH=${CONDA_HOME}/envs/cudf-udf/bin:$PATH
export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
# Set the path of python site-packages.
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"

# Install required packages
sudo apt -y install zip unzip
SOURCE_PATH="/home/ubuntu/spark-rapids"
[[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH

export SPARK_HOME=/databricks/spark
# Change to not point at Databricks confs so we don't conflict with their settings.
export SPARK_CONF_DIR=$PWD
# 'init_cudf_udf.sh' has already been executed to install the required python packages
# Init common variables like SPARK_HOME, spark configs
source jenkins/databricks/common_vars.sh

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Set the path of python site-packages.
PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koala is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
sudo chmod 777 /databricks/data/logs/
sudo chmod 777 /databricks/data/logs/*
echo { \"port\":\"15002\" } > ~/.databricks-connect
sudo ln -sf /databricks/jars/ $SPARK_HOME/jars
sudo chmod -R 777 /databricks/data/logs/

CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \
--conf spark.rapids.memory.gpu.minAllocFraction=0 \
--conf spark.rapids.memory.gpu.allocFraction=0.1 \
--conf spark.rapids.python.memory.gpu.allocFraction=0.1 \
--conf spark.rapids.python.concurrentPythonWorkers=2"

## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
CONF_LIST=${SPARK_CONF//','/' '}
for CONF in ${CONF_LIST}; do
KEY=${CONF%%=*}
VALUE=${CONF#*=}
## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
done

## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi

TEST_TYPE="nightly"
PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"

# Enable event log for qualification & profiling tools testing
export PYSP_TEST_spark_eventLog_enabled=true
mkdir -p /tmp/spark-events

if [ -d "$LOCAL_JAR_PATH" ]; then
## Run cudf-udf tests.
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
else
## Run cudf-udf tests.
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=0 \
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
fi
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $PWD/rapids-4-spark_*.jar | grep -v 'tests.jar'`"

SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
bash integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
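
A hypothetical way to launch the trimmed-down script (assumptions: the 'cudf-udf' conda environment already exists; the SPARK_CONF value is only an example):

# Illustration only; SPARK_CONF is optional and its value here is made up.
SPARK_CONF='spark.rapids.sql.enabled=true' bash jenkins/databricks/cudf_udf_test.sh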
39 changes: 6 additions & 33 deletions jenkins/databricks/run_it.sh
@@ -20,46 +20,19 @@
# TEST_TAGS=xxx
# More details please refer to './integration_tests/run_pyspark_from_build.sh'.
# Note, 'setup.sh' should be executed first to setup proper environment.
#
# This file runs pytests with Jenkins parallel jobs.

set -xe

SPARK_VER=${SPARK_VER:-$(< /databricks/spark/VERSION)}
export SPARK_SHIM_VER=${SPARK_SHIM_VER:-spark${SPARK_VER//.}db}

# Setup SPARK_HOME if need
if [[ -z "$SPARK_HOME" ]]; then
# Configure spark environment on Databricks
export SPARK_HOME=$DB_HOME/spark
fi

SCALA_BINARY_VER=${SCALA_BINARY_VER:-'2.12'}

# Set PYSPARK_PYTHON to keep the version of driver/workers python consistent.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of python site-packages, packages were installed here by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koala is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
# 'setup.sh' has already been executed before running this script
db_script_path="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
# Init common variables like SPARK_HOME, spark configs
source $db_script_path/common_vars.sh

# Disable parallel test as multiple tests would be executed by leveraging external parallelism, e.g. Jenkins parallelism
export TEST_PARALLEL=${TEST_PARALLEL:-0}

if [[ "$TEST" == "cache_test" || "$TEST" == "cache_test.py" ]]; then
export PYSP_TEST_spark_sql_cache_serializer='com.nvidia.spark.ParquetCachedBatchSerializer'
fi

TEST_TYPE=${TEST_TYPE:-"nightly"}

if [[ -n "$LOCAL_JAR_PATH" ]]; then
export LOCAL_JAR_PATH=$LOCAL_JAR_PATH
fi

set +e
# Run integration testing
./integration_tests/run_pyspark_from_build.sh --runtime_env='databricks' --test_type=$TEST_TYPE
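
A hypothetical invocation (the TEST value is an example; TEST and TEST_TAGS are the knobs referenced in the header comment):

# Illustration only: run a single test module through the shared setup.
TEST=cache_test TEST_PARALLEL=0 bash jenkins/databricks/run_it.sh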
38 changes: 5 additions & 33 deletions jenkins/databricks/test.sh
@@ -33,50 +33,24 @@
# 1. Check if any more dependencies need to be added to the apt/pip install commands.
# 2. If you had to go beyond the above steps to support the new runtime, then update the
# instructions accordingly.
#
# This file runs pytests in parallel with python-xdist (e.g., TEST_PARALLEL=4).

set -ex

SOURCE_PATH="/home/ubuntu/spark-rapids"
[[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH

# Install python packages for integration tests
source jenkins/databricks/setup.sh
# Init common variables like SPARK_HOME, spark configs
source jenkins/databricks/common_vars.sh

SPARK_CONF=${SPARK_CONF:-''}
BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-$(< /databricks/spark/VERSION)}
SHUFFLE_SPARK_SHIM=${SHUFFLE_SPARK_SHIM:-spark${BASE_SPARK_VERSION//./}db}
SHUFFLE_SPARK_SHIM=${SHUFFLE_SPARK_SHIM//\-SNAPSHOT/}
[[ -z $SPARK_SHIM_VER ]] && export SPARK_SHIM_VER=spark${BASE_SPARK_VERSION//.}db

# Set PYSPARK_PYTHON to keep the version of driver/workers python consistent.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of python site-packages, packages were installed here by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"

export SPARK_HOME=/databricks/spark
# change to not point at databricks confs so we don't conflict with their settings
export SPARK_CONF_DIR=$PWD
# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koala is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE

## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
CONF_LIST=${SPARK_CONF//','/' '}
for CONF in ${CONF_LIST}; do
KEY=${CONF%%=*}
VALUE=${CONF#*=}
## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
done

## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi

IS_SPARK_321_OR_LATER=0
[[ "$(printf '%s\n' "3.2.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.2.1" ]] && IS_SPARK_321_OR_LATER=1

@@ -86,8 +60,6 @@ IS_SPARK_321_OR_LATER=0
# - DELTA_LAKE_ONLY: delta_lake tests only
# - MULTITHREADED_SHUFFLE: shuffle tests only
TEST_MODE=${TEST_MODE:-'DEFAULT'}
TEST_TYPE="nightly"
PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"

# Classloader config is here to work around classloader issues with
# --packages in distributed setups, should be fixed by
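
A hypothetical invocation using one of the TEST_MODE values listed above (illustration only):

# Illustration only: run just the delta_lake tests on the Databricks node.
TEST_MODE=DELTA_LAKE_ONLY bash jenkins/databricks/test.sh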
