Put shared part together for cudf-udf test file
Signed-off-by: Tim Liu <[email protected]>
NvTimLiu committed Jul 28, 2023
1 parent 155c6a1 commit 9c450b9
Showing 2 changed files with 14 additions and 50 deletions.
2 changes: 1 addition & 1 deletion jenkins/databricks/common_vars.sh
@@ -26,7 +26,7 @@ export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Set the path of python site-packages; packages were installed there by 'jenkins/databricks/setup.sh'.
-PYTHON_SITE_PACKAGES="$HOME/.local/lib/${PYTHON_VERSION}/site-packages"
+PYTHON_SITE_PACKAGES=${PYTHON_SITE_PACKAGES:-"$HOME/.local/lib/${PYTHON_VERSION}/site-packages"}

# Get the correct py4j file.
PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
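The functional change above swaps a hard-coded assignment for bash's ${VAR:-default} expansion, so a caller can pre-set PYTHON_SITE_PACKAGES in its environment before sourcing this file, and the ~/.local default only applies when the variable is unset or empty. A minimal sketch of the pattern (paths are illustrative):

    unset PYTHON_SITE_PACKAGES
    PYTHON_SITE_PACKAGES=${PYTHON_SITE_PACKAGES:-"$HOME/.local/lib/python3.9/site-packages"}
    echo "$PYTHON_SITE_PACKAGES"   # falls back to the ~/.local default

    PYTHON_SITE_PACKAGES=/databricks/conda/envs/cudf-udf/lib/python3.9/site-packages
    PYTHON_SITE_PACKAGES=${PYTHON_SITE_PACKAGES:-"$HOME/.local/lib/python3.9/site-packages"}
    echo "$PYTHON_SITE_PACKAGES"   # the caller's value wins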
62 changes: 13 additions & 49 deletions jenkins/databricks/cudf_udf_test.sh
@@ -31,11 +31,6 @@
# instructions accordingly.
set -ex

-# Map of software versions for each dependency.

-LOCAL_JAR_PATH=${LOCAL_JAR_PATH:-''}
-SPARK_CONF=${SPARK_CONF:-''}

# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
@@ -44,63 +39,32 @@ if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
fi
export PATH=${CONDA_HOME}/envs/cudf-udf/bin:$PATH
export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
-# Set the path of python site-packages.
-# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
-PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
-PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"

# Install required packages
sudo apt -y install zip unzip
SOURCE_PATH="/home/ubuntu/spark-rapids"
[[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH
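One nuance of the && / || chain above: it is not a strict if/else. The || cd $SOURCE_PATH fallback also fires if the directory test passes but the first cd itself fails (for example, on a permission error). A tiny illustration of the short-circuit behavior:

    mkdir -p /tmp/demo-dir
    [[ -d /tmp/demo-dir ]] && cd /tmp/demo-dir || echo "fallback"         # enters the dir
    [[ -d /tmp/no-such-dir ]] && cd /tmp/no-such-dir || echo "fallback"   # prints: fallback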

-export SPARK_HOME=/databricks/spark
-# Change to not point at Databricks confs so we don't conflict with their settings.
-export SPARK_CONF_DIR=$PWD
+# 'init_cudf_udf.sh' has already been executed to install the required python packages
+# Init common variables such as SPARK_HOME and the Spark configs
+source jenkins/databricks/common_vars.sh
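Sourcing runs common_vars.sh in the current shell rather than in a child process, so everything it exports (SPARK_HOME, the py4j path, and, judging by the deletions below, the SPARK_CONF handling) stays visible to the rest of this script. A minimal illustration of the difference (file and variable names are hypothetical):

    echo 'export GREETING=hello' > /tmp/vars.sh
    bash /tmp/vars.sh           # runs in a child process
    echo "${GREETING:-unset}"   # -> unset
    source /tmp/vars.sh         # runs in the current shell
    echo "${GREETING:-unset}"   # -> hello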

-# Get the correct py4j file.
-PY4J_FILE=$(find $SPARK_HOME/python/lib -type f -iname "py4j*.zip")
+# Set the path of python site-packages.
+PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"
# Databricks Koalas can conflict with the actual Pandas version, so put site packages first.
# Note that Koalas is deprecated for DB10.4+; it is recommended to use the Pandas API on Spark instead.
export PYTHONPATH=$PYTHON_SITE_PACKAGES:$SPARK_HOME/python:$SPARK_HOME/python/pyspark/:$PY4J_FILE
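Since Python resolves imports in PYTHONPATH order, listing the cudf-udf site-packages first lets its pandas take precedence over the copies bundled with the Databricks runtime. A quick sanity check (the exact path printed depends on the cluster image):

    # Expect a path under ${CONDA_HOME}/envs/cudf-udf/..., not under /databricks/:
    ${PYSPARK_PYTHON} -c 'import pandas; print(pandas.__file__)'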
-sudo ln -s /databricks/jars/ $SPARK_HOME/jars || true
-sudo chmod 777 /databricks/data/logs/
-sudo chmod 777 /databricks/data/logs/*
-echo { \"port\":\"15002\" } > ~/.databricks-connect
+sudo ln -sf /databricks/jars/ $SPARK_HOME/jars
+sudo chmod -R 777 /databricks/data/logs/

CUDF_UDF_TEST_ARGS="--conf spark.python.daemon.module=rapids.daemon_databricks \
    --conf spark.rapids.memory.gpu.minAllocFraction=0 \
    --conf spark.rapids.memory.gpu.allocFraction=0.1 \
    --conf spark.rapids.python.memory.gpu.allocFraction=0.1 \
    --conf spark.rapids.python.concurrentPythonWorkers=2"
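A rough reading of these settings (their exact semantics are defined by the spark-rapids configuration docs; the 40 GB figure below is only an assumed example): allocFraction=0.1 caps the JVM-side RAPIDS memory pool at about 10% of GPU memory, python.memory.gpu.allocFraction=0.1 reserves a similar pool for the Python workers, minAllocFraction=0 drops the minimum-pool-size floor so such small pools are accepted, and at most two Python workers run concurrently per GPU.

    # Illustrative arithmetic only, assuming a hypothetical 40 GB GPU:
    GPU_MEM_GB=40
    echo "JVM RAPIDS pool:    $(echo "$GPU_MEM_GB * 0.1" | bc) GB"                       # 4.0 GB
    echo "Python worker pool: $(echo "$GPU_MEM_GB * 0.1" | bc) GB, shared by 2 workers"  # 4.0 GB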

-## 'spark.foo=1,spark.bar=2,...' to 'export PYSP_TEST_spark_foo=1 export PYSP_TEST_spark_bar=2'
-if [ -n "$SPARK_CONF" ]; then
-    CONF_LIST=${SPARK_CONF//','/' '}
-    for CONF in ${CONF_LIST}; do
-        KEY=${CONF%%=*}
-        VALUE=${CONF#*=}
-        ## run_pyspark_from_build.sh requires 'export PYSP_TEST_spark_foo=1' as the spark configs
-        export PYSP_TEST_${KEY//'.'/'_'}=$VALUE
-    done

-    ## 'spark.foo=1,spark.bar=2,...' to '--conf spark.foo=1 --conf spark.bar=2 --conf ...'
-    SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
-fi
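Given a hypothetical SPARK_CONF="spark.foo=1,spark.bar=2", the loop above exports PYSP_TEST_spark_foo=1 and PYSP_TEST_spark_bar=2 for run_pyspark_from_build.sh, and the final substitution rebuilds the --conf form for spark-submit. Note that the final line uses single-slash substitution, which replaces only the first comma; with three or more entries the stated intent needs the replace-all form:

    SPARK_CONF="spark.foo=1,spark.bar=2,spark.baz=3"
    SPARK_CONF="--conf ${SPARK_CONF//','/' --conf '}"   # '//' replaces every comma
    echo "$SPARK_CONF"   # --conf spark.foo=1 --conf spark.bar=2 --conf spark.baz=3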

TEST_TYPE="nightly"
-PCBS_CONF="com.nvidia.spark.ParquetCachedBatchSerializer"

-# Enable event log for qualification & profiling tools testing
-export PYSP_TEST_spark_eventLog_enabled=true
-mkdir -p /tmp/spark-events

-if [ -d "$LOCAL_JAR_PATH" ]; then
-    ## Run cudf-udf tests.
-    CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
-    LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
-        bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
-else
-    ## Run cudf-udf tests.
-    CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
-    SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=0 \
-        bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
-fi
+CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $PWD/rapids-4-spark_*.jar | grep -v 'tests.jar'`"

+SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
+    bash integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE
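The backtick expression fills spark.executorEnv.PYTHONPATH with the plugin jar, while grep -v filters out the tests jar. With hypothetical jar names (the version string is made up), the selection behaves like this:

    $ ls $PWD/rapids-4-spark_*.jar
    /home/ubuntu/spark-rapids/rapids-4-spark_2.12-23.08.0-SNAPSHOT.jar
    /home/ubuntu/spark-rapids/rapids-4-spark_2.12-23.08.0-SNAPSHOT-tests.jar
    $ ls $PWD/rapids-4-spark_*.jar | grep -v 'tests.jar'
    /home/ubuntu/spark-rapids/rapids-4-spark_2.12-23.08.0-SNAPSHOT.jar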
