Put shared part together for Databricks test scripts [databricks] (#8828)

* Put shared part together for Databricks test scripts

To fix issue: #8779

Consolidate the shared parts of the Databricks scripts test.sh and run_it.sh so that the scripts are easier to maintain: a change to the shared logic no longer requires updating both test.sh and run_it.sh (a minimal usage sketch follows below).

Signed-off-by: Tim Liu <[email protected]>

* Put shared part together for cudf-udf test file

Signed-off-by: Tim Liu <[email protected]>

---------

Signed-off-by: Tim Liu <[email protected]>
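
A minimal sketch of the resulting pattern (illustration only; the concrete logic lives in jenkins/databricks/common_vars.sh shown below): each Databricks test script now sources the shared file instead of repeating the environment setup.

#!/bin/bash
# Sketch only, assuming the current directory is the spark-rapids source tree
# (or LOCAL_JAR_PATH). The real scripts are test.sh and run_it.sh in the diff below.
set -ex
# Shared setup: SPARK_HOME, PYSPARK_PYTHON, PYTHONPATH, PYSP_TEST_* / SPARK_CONF handling
source jenkins/databricks/common_vars.sh
# Script-specific work follows, e.g. running the integration tests
bash integration_tests/run_pyspark_from_build.sh --runtime_env='databricks' --test_type=$TEST_TYPE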
NvTimLiu authored Aug 2, 2023
1 parent 81630d1 commit 36b63c5
Showing 4 changed files with 83 additions and 115 deletions.
59 changes: 59 additions & 0 deletions jenkins/databricks/common_vars.sh
@@ -0,0 +1,59 @@
#!/bin/bash
#
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set up SPARK_HOME if needed
if [[ -z "$SPARK_HOME" ]]; then
    # Configure the Spark environment on Databricks
    export SPARK_HOME=$DB_HOME/spark
fi

# Set PYSPARK_PYTHON to keep the version of driver/workers python consistent.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get the Python version (major.minor), e.g., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of the Python site-packages directory; packages were installed there by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES=${PYTHON_SITE_PACKAGES:-"$HOME/.local/lib/${PYTHON_VERSION}/site-packages"}

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site-packages first.
# Note that Koalas is deprecated for DB10.4+; it is recommended to use the Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
export PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"
if [[ "$TEST" == "cache_test" || "$TEST" == "cache_test.py" ]]; then
    export PYSP_TEST_spark_sql_cache_serializer="$PCBS_CONF"
fi

export TEST_TYPE=${TEST_TYPE:-"nightly"}

if [[ -n "$LOCAL_JAR_PATH" ]]; then
    export LOCAL_JAR_PATH=$LOCAL_JAR_PATH
fi

## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
    CONF_LIST=${SPARK_CONF//','/' '}
    for CONF in ${CONF_LIST}; do
        KEY=${CONF%%=*}
        VALUE=${CONF#*=}
        ## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
        export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
    done

    ## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
    export SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi
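
For illustration only (the values are hypothetical, not part of the commit): with a two-entry SPARK_CONF, the block above is equivalent to the following exports.

# SPARK_CONF='spark.foo=1,spark.bar=2' yields:
export PYSP_TEST_spark_foo=1
export PYSP_TEST_spark_bar=2
export SPARK_CONF='--conf spark.foo=1 --conf spark.bar=2'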
62 changes: 13 additions & 49 deletions jenkins/databricks/cudf_udf_test.sh
@@ -31,11 +31,6 @@
# instructions accordingly.
set -ex

# Map of software versions for each dependency.

LOCAL_JAR_PATH=${LOCAL_JAR_PATH:-''}
SPARK_CONF=${SPARK_CONF:-''}

# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
@@ -44,63 +39,32 @@ if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
fi
export PATH=${CONDA_HOME}/envs/cudf-udf/bin:$PATH
export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
# Set the path of python site-packages.
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"

# Install required packages
sudo apt -y install zip unzip
SOURCE_PATH="/home/ubuntu/spark-rapids"
[[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH

export SPARK_HOME=/databricks/spark
# Change to not point at Databricks confs so we don't conflict with their settings.
export SPARK_CONF_DIR=$PWD
# 'init_cudf_udf.sh' has already been executed to install the required python packages
# Init common variables like SPARK_HOME, spark configs
source jenkins/databricks/common_vars.sh

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Set the path of python site-packages.
PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koala is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
sudo chmod 777 /databricks/data/logs/
sudo chmod 777 /databricks/data/logs/*
echo { \"port\":\"15002\" } > ~/.databricks-connect
sudo ln -sf /databricks/jars/ $SPARK_HOME/jars
sudo chmod -R 777 /databricks/data/logs/

CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \
--conf spark.rapids.memory.gpu.minAllocFraction=0 \
--conf spark.rapids.memory.gpu.allocFraction=0.1 \
--conf spark.rapids.python.memory.gpu.allocFraction=0.1 \
--conf spark.rapids.python.concurrentPythonWorkers=2"

## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
CONF_LIST=${SPARK_CONF//','/' '}
for CONF in ${CONF_LIST}; do
KEY=${CONF%%=*}
VALUE=${CONF#*=}
## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
done

## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi

TEST_TYPE="nightly"
PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"

# Enable event log for qualification & profiling tools testing
export PYSP_TEST_spark_eventLog_enabled=true
mkdir -p /tmp/spark-events

if [ -d "$LOCAL_JAR_PATH" ]; then
## Run cudf-udf tests.
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
else
## Run cudf-udf tests.
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=0 \
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
fi
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $PWD/rapids-4-spark_*.jar | grep -v 'tests.jar'`"

SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
bash integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
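
A hypothetical way to launch the trimmed-down script (assumptions: the 'cudf-udf' conda environment already exists; the SPARK_CONF value is only an example):

# Illustration only; SPARK_CONF is optional and its value here is made up.
SPARK_CONF='spark.rapids.sql.enabled=true' bash jenkins/databricks/cudf_udf_test.sh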
39 changes: 6 additions & 33 deletions jenkins/databricks/run_it.sh
@@ -20,46 +20,19 @@
# TEST_TAGS=xxx
# More details please refer to './integration_tests/run_pyspark_from_build.sh'.
# Note, 'setup.sh' should be executed first to setup proper environment.
#
# This file runs pytests with Jenkins parallel jobs.

set -xe

SPARK_VER=${SPARK_VER:-$(< /databricks/spark/VERSION)}
export SPARK_SHIM_VER=${SPARK_SHIM_VER:-spark${SPARK_VER//.}db}

# Setup SPARK_HOME if need
if [[ -z "$SPARK_HOME" ]]; then
# Configure spark environment on Databricks
export SPARK_HOME=$DB_HOME/spark
fi

SCALA_BINARY_VER=${SCALA_BINARY_VER:-'2.12'}

# Set PYSPARK_PYTHON to keep the version of driver/workers python consistent.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of python site-packages, packages were installed here by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koala is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
# 'setup.sh' has already been executed before running this script
db_script_path="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
# Init common variables like SPARK_HOME, spark configs
source $db_script_path/common_vars.sh

# Disable parallel test as multiple tests would be executed by leveraging external parallelism, e.g. Jenkins parallelism
export TEST_PARALLEL=${TEST_PARALLEL:-0}

if [[ "$TEST" == "cache_test" || "$TEST" == "cache_test.py" ]]; then
export PYSP_TEST_spark_sql_cache_serializer='com.nvidia.spark.ParquetCachedBatchSerializer'
fi

TEST_TYPE=${TEST_TYPE:-"nightly"}

if [[ -n "$LOCAL_JAR_PATH" ]]; then
export LOCAL_JAR_PATH=$LOCAL_JAR_PATH
fi

set +e
# Run integration testing
./integration_tests/run_pyspark_from_build.sh --runtime_env='databricks' --test_type=$TEST_TYPE
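
A hypothetical invocation (the TEST value is an example; TEST and TEST_TAGS are the knobs referenced in the header comment):

# Illustration only: run a single test module through the shared setup.
TEST=cache_test TEST_PARALLEL=0 bash jenkins/databricks/run_it.sh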
38 changes: 5 additions & 33 deletions jenkins/databricks/test.sh
@@ -33,50 +33,24 @@
# 1. Check if any more dependencies need to be added to the apt/pip install commands.
# 2. If you had to go beyond the above steps to support the new runtime, then update the
# instructions accordingly.
#
# This file runs pytests in parallel with python-xdist (e.g., TEST_PARALLEL=4).

set -ex

SOURCE_PATH="/home/ubuntu/spark-rapids"
[[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH

# Install python packages for integration tests
source jenkins/databricks/setup.sh
# Init common variables like SPARK_HOME, spark configs
source jenkins/databricks/common_vars.sh

SPARK_CONF=${SPARK_CONF:-''}
BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-$(< /databricks/spark/VERSION)}
SHUFFLE_SPARK_SHIM=${SHUFFLE_SPARK_SHIM:-spark${BASE_SPARK_VERSION//./}db}
SHUFFLE_SPARK_SHIM=${SHUFFLE_SPARK_SHIM//\-SNAPSHOT/}
[[ -z $SPARK_SHIM_VER ]] && export SPARK_SHIM_VER=spark${BASE_SPARK_VERSION//.}db

# Set PYSPARK_PYTHON to keep the version of driver/workers python consistent.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of python site-packages, packages were installed here by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"

export SPARK_HOME=/databricks/spark
# change to not point at databricks confs so we don't conflict with their settings
export SPARK_CONF_DIR=$PWD
# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koala is deprecated for DB10.4+ and it is recommended to use Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE

## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
CONF_LIST=${SPARK_CONF//','/' '}
for CONF in ${CONF_LIST}; do
KEY=${CONF%%=*}
VALUE=${CONF#*=}
## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
done

## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi

IS_SPARK_321_OR_LATER=0
[[ "$(printf '%s\n' "3.2.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.2.1" ]] && IS_SPARK_321_OR_LATER=1

@@ -86,8 +60,6 @@ IS_SPARK_321_OR_LATER=0
# - DELTA_LAKE_ONLY: delta_lake tests only
# - MULTITHREADED_SHUFFLE: shuffle tests only
TEST_MODE=${TEST_MODE:-'DEFAULT'}
TEST_TYPE="nightly"
PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"

# Classloader config is here to work around classloader issues with
# --packages in distributed setups, should be fixed by
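
A hypothetical invocation using one of the TEST_MODE values listed above (illustration only):

# Illustration only: run just the delta_lake tests on the Databricks node.
TEST_MODE=DELTA_LAKE_ONLY bash jenkins/databricks/test.sh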
