Put shared part together for cudf-udf test file
Signed-off-by: Tim Liu <[email protected]>
NvTimLiu committed Jul 28, 2023
1 parent 155c6a1 commit 9c450b9
Showing 2 changed files with 14 additions and 50 deletions.
2 changes: 1 addition & 1 deletion jenkins/databricks/common_vars.sh
@@ -26,7 +26,7 @@ export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of python site-packages; packages were installed there by 'jenkins/databricks/setup.sh'.
-PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
+PYTHON_SITE_PACKAGES=${PYTHON_SITE_PACKAGES:-"$HOME/.local/lib/${PYTHON_VERSION}/site-packages"}

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
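The functional change above swaps a hard-coded assignment for bash's ${VAR:-default} expansion, so a caller can pre-set PYTHON_SITE_PACKAGES in its environment before sourcing this file, and the ~/.local default only applies when the variable is unset or empty. A minimal sketch of the pattern (paths are illustrative):

    unset PYTHON_SITE_PACKAGES
    PYTHON_SITE_PACKAGES=${PYTHON_SITE_PACKAGES:-"$HOME/.local/lib/python3.9/site-packages"}
    echo "$PYTHON_SITE_PACKAGES"   # falls back to the ~/.local default

    PYTHON_SITE_PACKAGES=/databricks/conda/envs/cudf-udf/lib/python3.9/site-packages
    PYTHON_SITE_PACKAGES=${PYTHON_SITE_PACKAGES:-"$HOME/.local/lib/python3.9/site-packages"}
    echo "$PYTHON_SITE_PACKAGES"   # the caller's value wins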
62 changes: 13 additions & 49 deletions jenkins/databricks/cudf_udf_test.sh
@@ -31,11 +31,6 @@
# instructions accordingly.
set -ex

-# Map of software versions for each dependency.

-LOCAL_JAR_PATH=${LOCAL_JAR_PATH:-''}
-SPARK_CONF=${SPARK_CONF:-''}

# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
@@ -44,63 +39,32 @@ if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
fi
export PATH=${CONDA_HOME}/envs/cudf-udf/bin:$PATH
export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
-# Set the path of python site-packages.
-# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
-PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
-PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"

# Install required packages
sudo apt -y install zip unzip
SOURCE_PATH="/home/ubuntu/spark-rapids"
[[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH
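One nuance of the && / || chain above: it is not a strict if/else. The || cd $SOURCE_PATH fallback also fires if the directory test passes but the first cd itself fails (for example, on a permission error). A tiny illustration of the short-circuit behavior:

    mkdir -p /tmp/demo-dir
    [[ -d /tmp/demo-dir ]] && cd /tmp/demo-dir || echo "fallback"         # enters the dir
    [[ -d /tmp/no-such-dir ]] && cd /tmp/no-such-dir || echo "fallback"   # prints: fallback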

-export SPARK_HOME=/databricks/spark
-# Change to not point at Databricks confs so we don't conflict with their settings.
-export SPARK_CONF_DIR=$PWD
+# 'init_cudf_udf.sh' has already been executed to install the required python packages
+# Init common variables such as SPARK_HOME and the Spark configs
+source jenkins/databricks/common_vars.sh
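Sourcing runs common_vars.sh in the current shell rather than in a child process, so everything it exports (SPARK_HOME, the py4j path, and, judging by the deletions below, the SPARK_CONF handling) stays visible to the rest of this script. A minimal illustration of the difference (file and variable names are hypothetical):

    echo 'export GREETING=hello' > /tmp/vars.sh
    bash /tmp/vars.sh           # runs in a child process
    echo "${GREETING:-unset}"   # -> unset
    source /tmp/vars.sh         # runs in the current shell
    echo "${GREETING:-unset}"   # -> hello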

-# Get the correct py4j file.
-PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
+# Set the path of python site-packages.
+PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koalas is deprecated for DB10.4+; it is recommended to use the Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
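Since Python resolves imports in PYTHONPATH order, listing the cudf-udf site-packages first lets its pandas take precedence over the copies bundled with the Databricks runtime. A quick sanity check (the exact path printed depends on the cluster image):

    # Expect a path under ${CONDA_HOME}/envs/cudf-udf/..., not under /databricks/:
    ${PYSPARK_PYTHON} -c 'import pandas; print(pandas.__file__)'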
-sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
-sudo chmod 777 /databricks/data/logs/
-sudo chmod 777 /databricks/data/logs/*
-echo { \"port\":\"15002\" } > ~/.databricks-connect
+sudo ln -sf /databricks/jars/ $SPARK_HOME/jars
+sudo chmod -R 777 /databricks/data/logs/

CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \
    --conf spark.rapids.memory.gpu.minAllocFraction=0 \
    --conf spark.rapids.memory.gpu.allocFraction=0.1 \
    --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \
    --conf spark.rapids.python.concurrentPythonWorkers=2"
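A rough reading of these settings (their exact semantics are defined by the spark-rapids configuration docs; the 40 GB figure below is only an assumed example): allocFraction=0.1 caps the JVM-side RAPIDS memory pool at about 10% of GPU memory, python.memory.gpu.allocFraction=0.1 reserves a similar pool for the Python workers, minAllocFraction=0 drops the minimum-pool-size floor so such small pools are accepted, and at most two Python workers run concurrently per GPU.

    # Illustrative arithmetic only, assuming a hypothetical 40 GB GPU:
    GPU_MEM_GB=40
    echo "JVM RAPIDS pool:    $(echo "$GPU_MEM_GB * 0.1" | bc) GB"                       # 4.0 GB
    echo "Python worker pool: $(echo "$GPU_MEM_GB * 0.1" | bc) GB, shared by 2 workers"  # 4.0 GB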

-## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
-if [ -n "$SPARK_CONF" ]; then
-    CONF_LIST=${SPARK_CONF//','/' '}
-    for CONF in ${CONF_LIST}; do
-        KEY=${CONF%%=*}
-        VALUE=${CONF#*=}
-        ## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
-        export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
-    done

-    ## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
-    SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
-fi
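Given a hypothetical SPARK_CONF="spark.foo=1,spark.bar=2", the loop above exports PYSP_TEST_spark_foo=1 and PYSP_TEST_spark_bar=2 for run_pyspark_from_build.sh, and the final substitution rebuilds the --conf form for spark-submit. Note that the final line uses single-slash substitution, which replaces only the first comma; with three or more entries the stated intent needs the replace-all form:

    SPARK_CONF="spark.foo=1,spark.bar=2,spark.baz=3"
    SPARK_CONF="--conf ${SPARK_CONF//','/' --conf '}"   # '//' replaces every comma
    echo "$SPARK_CONF"   # --conf spark.foo=1 --conf spark.bar=2 --conf spark.baz=3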

TEST_TYPE="nightly"
-PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"

-# Enable event log for qualification & profiling tools testing
-export PYSP_TEST_spark_eventLog_enabled=true
-mkdir -p /tmp/spark-events

-if [ -d "$LOCAL_JAR_PATH" ]; then
-    ## Run cudf-udf tests.
-    CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
-    LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
-        bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
-else
-    ## Run cudf-udf tests.
-    CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
-    SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=0 \
-        bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
-fi
+CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $PWD/rapids-4-spark_*.jar | grep -v 'tests.jar'`"

+SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
+    bash integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
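The backtick expression fills spark.executorEnv.PYTHONPATH with the plugin jar, while grep -v filters out the tests jar. With hypothetical jar names (the version string is made up), the selection behaves like this:

    $ ls $PWD/rapids-4-spark_*.jar
    /home/ubuntu/spark-rapids/rapids-4-spark_2.12-23.08.0-SNAPSHOT.jar
    /home/ubuntu/spark-rapids/rapids-4-spark_2.12-23.08.0-SNAPSHOT-tests.jar
    $ ls $PWD/rapids-4-spark_*.jar | grep -v 'tests.jar'
    /home/ubuntu/spark-rapids/rapids-4-spark_2.12-23.08.0-SNAPSHOT.jar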
