Put shared part together for Databricks test scripts
To fix issue: #8779

Consolidate the shared part of the Databricks scripts test.sh and run_it.sh
into a common file, so that updating the scripts is easier: a change to the
shared part no longer requires modifying both test.sh and run_it.sh.

Signed-off-by: Tim Liu <[email protected]>
NvTimLiu committed Jul 27, 2023
1 parent 5421a85 commit ca2a910
Showing 3 changed files with 70 additions and 66 deletions.
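As a minimal sketch of the resulting pattern (assembled from the diff below; the exact contents of each script are in the hunks that follow), both test.sh and run_it.sh now simply source the shared file and then launch the tests:

#!/bin/bash
# Minimal sketch, not the full scripts: common_vars.sh centralizes SPARK_HOME,
# PYTHONPATH and the SPARK_CONF -> PYSP_TEST_* expansion, so each entry script
# only needs to source it once before running the integration tests.
db_script_path="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $db_script_path/common_vars.sh
./integration_tests/run_pyspark_from_build.sh --runtime_env='databricks' --test_type=$TEST_TYPE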
59 changes: 59 additions & 0 deletions jenkins/databricks/common_vars.sh
@@ -0,0 +1,59 @@
#!/bin/bash
#
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Set up SPARK_HOME if needed
if [[ -z "$SPARK_HOME" ]]; then
# Configure spark environment on Databricks
export SPARK_HOME=$DB_HOME/spark
fi

# Set PYSPARK_PYTHON to keep the driver and worker Python versions consistent.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get the Python version (major.minor), e.g., python3.8 for DB10.4 and python3.9 for DB11.3.
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of the Python site-packages, where packages were installed by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koalas is deprecated for DB10.4+; it is recommended to use the Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
export PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"
if [[ "$TEST" == "cache_test" || "$TEST" == "cache_test.py" ]]; then
export PYSP_TEST_spark_sql_cache_serializer="$PCBS_CONF"
fi

export TEST_TYPE=${TEST_TYPE:-"nightly"}

if [[ -n "$LOCAL_JAR_PATH" ]]; then
export LOCAL_JAR_PATH=$LOCAL_JAR_PATH
fi

## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
CONF_LIST=${SPARK_CONF//','/' '}
for CONF in ${CONF_LIST}; do
KEY=${CONF%%=*}
VALUE=${CONF#*=}
## run_pyspark_from_build.sh requires the Spark configs to be passed as 'export PYSP_TEST_spark_foo=1'
export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
done

## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
export SPARK_CONF="--conf ${SPARK_CONF//','/' --conf '}"
fi
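For illustration only, here is a hypothetical way to exercise the SPARK_CONF expansion above on a Databricks node (DB_HOME and python are assumed to be set up already; the config values are made up):

# Hypothetical example of the SPARK_CONF expansion performed by common_vars.sh.
export SPARK_CONF='spark.foo=1,spark.bar=2'
source jenkins/databricks/common_vars.sh
echo "$PYSP_TEST_spark_foo" "$PYSP_TEST_spark_bar"   # expected: 1 2
echo "$SPARK_CONF"                                   # expected: --conf spark.foo=1 --conf spark.bar=2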
39 changes: 6 additions & 33 deletions jenkins/databricks/run_it.sh
@@ -20,46 +20,19 @@
# TEST_TAGS=xxx
# For more details, please refer to './integration_tests/run_pyspark_from_build.sh'.
# Note, 'setup.sh' should be executed first to set up the proper environment.
#
# This file runs pytests with Jenkins parallel jobs.

set -xe

SPARK_VER=${SPARK_VER:-$(< /databricks/spark/VERSION)}
export SPARK_SHIM_VER=${SPARK_SHIM_VER:-spark${SPARK_VER//.}db}

# Set up SPARK_HOME if needed
if [[ -z "$SPARK_HOME" ]]; then
# Configure spark environment on Databricks
export SPARK_HOME=$DB_HOME/spark
fi

SCALA_BINARY_VER=${SCALA_BINARY_VER:-'2.12'}

# Set PYSPARK_PYTHON to keep the version of driver/workers python consistent.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of python site-packages, packages were installed here by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koalas is deprecated for DB10.4+; it is recommended to use the Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
# 'setup.sh' has already been executed before running this script
db_script_path="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
# Initialize common variables like SPARK_HOME and the Spark configs
source $db_script_path/common_vars.sh

# Disable parallel tests, as multiple tests are already run in parallel externally, e.g. via Jenkins parallel jobs
export TEST_PARALLEL=${TEST_PARALLEL:-0}

if [[ "$TEST" == "cache_test" || "$TEST" == "cache_test.py" ]]; then
export PYSP_TEST_spark_sql_cache_serializer='com.nvidia.spark.ParquetCachedBatchSerializer'
fi

TEST_TYPE=${TEST_TYPE:-"nightly"}

if [[ -n "$LOCAL_JAR_PATH" ]]; then
export LOCAL_JAR_PATH=$LOCAL_JAR_PATH
fi

set +e
# Run integration testing
./integration_tests/run_pyspark_from_build.sh --runtime_env='databricks' --test_type=$TEST_TYPE
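As a usage sketch (hypothetical values; TEST and TEST_PARALLEL are the variables referenced in the comments above and consumed by run_pyspark_from_build.sh), run_it.sh might be driven from a Jenkins worker like this:

# Hypothetical invocation; 'setup.sh' is assumed to have been run beforehand.
cd /home/ubuntu/spark-rapids
TEST=cache_test.py TEST_PARALLEL=0 bash jenkins/databricks/run_it.sh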
38 changes: 5 additions & 33 deletions jenkins/databricks/test.sh
@@ -33,50 +33,24 @@
# 1. Check if any more dependencies need to be added to the apt/pip install commands.
# 2. If you had to go beyond the above steps to support the new runtime, then update the
# instructions accordingly.
#
# This file runs pytests in parallel with python-xdist (e.g., TEST_PARALLEL=4).

set -ex

SOURCE_PATH="/home/ubuntu/spark-rapids"
[[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH

# Install python packages for integration tests
source jenkins/databricks/setup.sh
# Initialize common variables like SPARK_HOME and the Spark configs
source jenkins/databricks/common_vars.sh

SPARK_CONF=${SPARK_CONF:-''}
BASE_SPARK_VERSION=${BASE_SPARK_VERSION:-$(< /databricks/spark/VERSION)}
SHUFFLE_SPARK_SHIM=${SHUFFLE_SPARK_SHIM:-spark${BASE_SPARK_VERSION//./}db}
SHUFFLE_SPARK_SHIM=${SHUFFLE_SPARK_SHIM//\-SNAPSHOT/}
[[ -z $SPARK_SHIM_VER ]] && export SPARK_SHIM_VER=spark${BASE_SPARK_VERSION//.}db

# Set PYSPARK_PYTHON to keep the version of driver/workers python consistent.
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of python site-packages, packages were installed here by 'jenkins/databricks/setup.sh'.
PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"

export SPARK_HOME=/databricks/spark
# Change SPARK_CONF_DIR to not point at the Databricks confs, so we don't conflict with their settings
export SPARK_CONF_DIR=$PWD
# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koalas is deprecated for DB10.4+; it is recommended to use the Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE

## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
if [ -n "$SPARK_CONF" ]; then
CONF_LIST=${SPARK_CONF//','/' '}
for CONF in ${CONF_LIST}; do
KEY=${CONF%%=*}
VALUE=${CONF#*=}
## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
done

## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi

IS_SPARK_321_OR_LATER=0
[[ "$(printf '%s\n' "3.2.1" "$BASE_SPARK_VERSION" | sort -V | head -n1)" = "3.2.1" ]] && IS_SPARK_321_OR_LATER=1

@@ -86,8 +60,6 @@ IS_SPARK_321_OR_LATER=0
# - DELTA_LAKE_ONLY: delta_lake tests only
# - MULTITHREADED_SHUFFLE: shuffle tests only
TEST_MODE=${TEST_MODE:-'DEFAULT'}
TEST_TYPE="nightly"
PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"

# Classloader config is here to work around classloader issues with
# --packages in distributed setups, should be fixed by
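Given the TEST_MODE options listed above and the python-xdist note at the top of this file, a hypothetical invocation of test.sh (values chosen only for illustration) could look like:

# Hypothetical invocation of test.sh on a Databricks node.
TEST_MODE=DELTA_LAKE_ONLY TEST_PARALLEL=4 bash jenkins/databricks/test.sh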
