#!/bin/bash
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -ex
SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
cd "$SCRIPTPATH"
if [[ $( echo ${SKIP_TESTS} | tr '[:upper:]' '[:lower:]' ) == "true" ]];
then
echo "PYTHON INTEGRATION TESTS SKIPPED..."
elif [[ -z "$SPARK_HOME" ]];
then
>&2 echo "SPARK_HOME IS NOT SET; CANNOT RUN PYTHON INTEGRATION TESTS..."
else
echo "WILL RUN TESTS WITH SPARK_HOME: ${SPARK_HOME}"
# Spark 3.1.1 includes https://github.com/apache/spark/pull/31540
# which helps with the spurious task failures we have observed in our tests. If you are
# running a Spark version before 3.1.1, this script sets spark.task.maxFailures to 4 to
# allow for a more lenient configuration; otherwise it sets it to 1, because spurious task
# failures are not expected on Spark 3.1.1+.
VERSION_STRING=`$SPARK_HOME/bin/pyspark --version 2>&1|grep -v Scala|awk '/version\ [0-9.]+/{print $NF}'`
VERSION_STRING="${VERSION_STRING/-SNAPSHOT/}"
[[ -z $VERSION_STRING ]] && { echo "Unable to detect the Spark version at $SPARK_HOME"; exit 1; }
[[ -z $SPARK_SHIM_VER ]] && { SPARK_SHIM_VER="spark${VERSION_STRING//./}"; }
echo "Detected Spark version $VERSION_STRING (shim version: $SPARK_SHIM_VER)"
INTEGRATION_TEST_VERSION=$SPARK_SHIM_VER
if [[ ! -z $INTEGRATION_TEST_VERSION_OVERRIDE ]]; then
# Override auto detected shim version in case of non-standard version string, e.g. `spark3113172702000-53`
INTEGRATION_TEST_VERSION=$INTEGRATION_TEST_VERSION_OVERRIDE
fi
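# Example usage (non-standard version string taken from the comment above):
# INTEGRATION_TEST_VERSION_OVERRIDE=spark3113172702000-53 ./run_pyspark_from_build.sh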
# Support alternate local jars instead of building from the source code
if [ -d "$LOCAL_JAR_PATH" ]; then
AVRO_JARS=$(echo "$LOCAL_JAR_PATH"/spark-avro*.jar)
PLUGIN_JARS=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark_*.jar)
if [ -f $(echo $LOCAL_JAR_PATH/parquet-hadoop*.jar) ]; then
export INCLUDE_PARQUET_HADOOP_TEST_JAR=true
PARQUET_HADOOP_TESTS=$(echo $LOCAL_JAR_PATH/parquet-hadoop*.jar)
# remove the log4j.properties file so it doesn't conflict with ours, ignore errors
# if it isn't present or already removed
zip -d $PARQUET_HADOOP_TESTS log4j.properties || true
else
export INCLUDE_PARQUET_HADOOP_TEST_JAR=false
PARQUET_HADOOP_TESTS=
fi
# Pick up the integration-test-spark3xx.jar; it should not include the integration-test-spark3xxtest.jar
TEST_JARS=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark-integration-tests*-$INTEGRATION_TEST_VERSION.jar)
else
AVRO_JARS=$(echo "$SCRIPTPATH"/target/dependency/spark-avro*.jar)
PARQUET_HADOOP_TESTS=$(echo "$SCRIPTPATH"/target/dependency/parquet-hadoop*.jar)
# remove the log4j.properties file so it doesn't conflict with ours, ignore errors
# if it isn't present or already removed
zip -d $PARQUET_HADOOP_TESTS log4j.properties || true
MIN_PARQUET_JAR="$SCRIPTPATH/target/dependency/parquet-hadoop-1.12.0-tests.jar"
# Make sure we have Parquet version >= 1.12 in the dependency
LOWEST_PARQUET_JAR=$(echo -e "$MIN_PARQUET_JAR\n$PARQUET_HADOOP_TESTS" | sort -V | head -1)
export INCLUDE_PARQUET_HADOOP_TEST_JAR=$([[ "$LOWEST_PARQUET_JAR" == "$MIN_PARQUET_JAR" ]] && echo true || echo false)
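# sort -V does a natural version sort, so if the 1.12.0 name sorts first the jar we found is
# >= 1.12.0 (e.g. parquet-hadoop-1.12.2-tests.jar would sort after parquet-hadoop-1.12.0-tests.jar).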
PLUGIN_JARS=$(echo "$SCRIPTPATH"/../dist/target/rapids-4-spark_*.jar)
# Pick up the integration-test-spark3xx.jar; it should not include the integration-test-spark3xxtest.jar
TEST_JARS=$(echo "$SCRIPTPATH"/target/rapids-4-spark-integration-tests*-$INTEGRATION_TEST_VERSION.jar)
fi
# `./run_pyspark_from_build.sh` runs all the tests excluding the avro tests in 'avro_test.py'.
#
# `INCLUDE_SPARK_AVRO_JAR=true ./run_pyspark_from_build.sh` runs all the tests, including the tests
# in 'avro_test.py'.
if [[ $( echo ${INCLUDE_SPARK_AVRO_JAR} | tr '[:upper:]' '[:lower:]' ) == "true" ]];
then
export INCLUDE_SPARK_AVRO_JAR=true
else
export INCLUDE_SPARK_AVRO_JAR=false
AVRO_JARS=""
fi
# Combine the jars: the dist jar, the integration-test jar, and the optional avro and parquet-hadoop test jars
ALL_JARS="$PLUGIN_JARS $TEST_JARS $AVRO_JARS $PARQUET_HADOOP_TESTS"
echo "AND PLUGIN JARS: $ALL_JARS"
if [[ "${TEST}" != "" ]];
then
TEST_ARGS="-k $TEST"
fi
if [[ "${TEST_TAGS}" != "" ]];
then
TEST_TAGS="-m $TEST_TAGS"
fi
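# Example (hypothetical pytest keyword and marker expressions):
# TEST=cast_test TEST_TAGS='not slow_test' ./run_pyspark_from_build.sh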
if [[ "${TEST_PARALLEL}" == "" ]];
then
# For each Spark application the integration tests want at least:
# - 1.5 GiB of GPU memory for the tests plus 750 MiB for loading CUDF + CUDA.
# From profiling we saw tests allocate under 200 MiB of GPU memory, so
# 1.5 GiB gives us plenty of room to grow.
# - 5 GiB of host memory. In testing with a limited number of tasks (4) we saw
# host memory usage stay below 3 GiB, so 5 felt like a good number
# for future growth.
# - 1 CPU core.
# For now just assume that we are going to use the GPU on the system with the most
# free memory. We use free memory to try to avoid issues if the GPU is also doing
# graphics work, which happens with many workstation GPUs. On top of the per-application
# numbers we reserve 2 GiB of GPU memory for general CUDA/CUDF overhead, which can be
# consumed by JIT compilation or launching large kernels.
# If you need to increase the amount of GPU memory you need to change it here and
# below where the processes are launched.
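# As a worked example (hypothetical numbers): with 16384 MiB of free GPU memory,
# GPU_MEM_PARALLEL = int((16384 - 2*1024) / (1.5*1024 + 750)) = int(14336 / 2286) = 6;
# with at least 6 CPU cores and enough host memory, TMP_PARALLEL = 6 and, after
# reserving one slot for the xdist controller, TEST_PARALLEL = 5.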
GPU_MEM_PARALLEL=`nvidia-smi --query-gpu=memory.free --format=csv,noheader | awk '{if (MAX < $1){ MAX = $1}} END {print int((MAX - 2 * 1024) / ((1.5 * 1024) + 750))}'`
CPU_CORES=`nproc`
HOST_MEM_PARALLEL=`cat /proc/meminfo | grep MemAvailable | awk '{print int($2 / (5 * 1024))}'`
TMP_PARALLEL=$(( $GPU_MEM_PARALLEL > $CPU_CORES ? $CPU_CORES : $GPU_MEM_PARALLEL ))
TMP_PARALLEL=$(( $TMP_PARALLEL > $HOST_MEM_PARALLEL ? $HOST_MEM_PARALLEL : $TMP_PARALLEL ))
if [[ $TMP_PARALLEL -gt 1 ]];
then
# We subtract 1 from the parallel number because xdist launches a process to
# control and monitor the other processes. It takes up one available parallel
# slot, even if it is not truly using all of the resources we give it.
TEST_PARALLEL=$(( $TMP_PARALLEL - 1 ))
else
TEST_PARALLEL=1
fi
echo "AUTO DETECTED PARALLELISM OF $TEST_PARALLEL"
fi
if python -c 'import findspark';
then
echo "FOUND findspark"
else
TEST_PARALLEL=0
echo "findspark not installed cannot run tests in parallel"
fi
if python -c 'import xdist.plugin';
then
echo "FOUND xdist"
else
TEST_PARALLEL=0
echo "xdist not installed cannot run tests in parallel"
fi
TEST_TYPE_PARAM=""
if [[ "${TEST_TYPE}" != "" ]];
then
TEST_TYPE_PARAM="--test_type $TEST_TYPE"
fi
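# Example (hypothetical value): TEST_TYPE=nightly ./run_pyspark_from_build.sh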
if [[ ${TEST_PARALLEL} -lt 2 ]];
then
# With xdist a parallelism of 0 and 1 behave the same, but
# 0 is more efficient
TEST_PARALLEL_OPTS=()
else
TEST_PARALLEL_OPTS=("-n" "$TEST_PARALLEL")
fi
RUN_DIR=${RUN_DIR-"$SCRIPTPATH"/target/run_dir}
mkdir -p "$RUN_DIR"
cd "$RUN_DIR"
## Under a cloud environment, override the '--rootdir' param to point to the working directory of each executor
LOCAL_ROOTDIR=${LOCAL_ROOTDIR:-"$SCRIPTPATH"}
## Under a cloud environment, override the '--std_input_path' param to point to the distributed file path
INPUT_PATH=${INPUT_PATH:-"$SCRIPTPATH"}
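## Example (hypothetical cloud paths):
## LOCAL_ROOTDIR=/home/ubuntu/integration_tests INPUT_PATH=hdfs:///data/integration_tests ./run_pyspark_from_build.sh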
RUN_TESTS_COMMAND=("$SCRIPTPATH"/runtests.py
--rootdir
"$LOCAL_ROOTDIR"
"$LOCAL_ROOTDIR"/src/main/python)
TEST_COMMON_OPTS=(-v
-rfExXs
"$TEST_TAGS"
--std_input_path="$INPUT_PATH"/src/test/resources
--color=yes
$TEST_TYPE_PARAM
"$TEST_ARGS"
$RUN_TEST_PARAMS
--junitxml=TEST-pytest-`date +%s%N`.xml
"$@")
NUM_LOCAL_EXECS=${NUM_LOCAL_EXECS:-0}
MB_PER_EXEC=${MB_PER_EXEC:-1024}
CORES_PER_EXEC=${CORES_PER_EXEC:-1}
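# Example (hypothetical sizing): NUM_LOCAL_EXECS=2 CORES_PER_EXEC=2 MB_PER_EXEC=2048 would
# run against a "local-cluster[2,2,2048]" master (see below).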
SPARK_TASK_MAXFAILURES=1
[[ "$VERSION_STRING" < "3.1.1" ]] && SPARK_TASK_MAXFAILURES=4
export PYSP_TEST_spark_driver_extraClassPath="${ALL_JARS// /:}"
export PYSP_TEST_spark_executor_extraClassPath="${ALL_JARS// /:}"
export PYSP_TEST_spark_driver_extraJavaOptions="-ea -Duser.timezone=UTC $COVERAGE_SUBMIT_FLAGS"
export PYSP_TEST_spark_executor_extraJavaOptions='-ea -Duser.timezone=UTC'
export PYSP_TEST_spark_ui_showConsoleProgress='false'
export PYSP_TEST_spark_sql_session_timeZone='UTC'
export PYSP_TEST_spark_sql_shuffle_partitions='4'
# prevent the cluster shape from changing
export PYSP_TEST_spark_dynamicAllocation_enabled='false'
export PYSP_TEST_spark_rapids_memory_host_spillStorageSize='100m'
# Not the default 2G, but this should be large enough for a single batch holding all the data
# (we found at most 200 MiB being allocated by a single test, and we typically have 4 tasks).
export PYSP_TEST_spark_rapids_sql_batchSizeBytes='100m'
# Extract Databricks version from deployed configs. This is set automatically on Databricks
# notebooks but not when running Spark manually.
DB_DEPLOY_CONF=/databricks/common/conf/deploy.conf
if [[ -f $DB_DEPLOY_CONF ]]; then
DB_VER=$(grep spark.databricks.clusterUsageTags.sparkVersion $DB_DEPLOY_CONF | sed -e 's/.*"\(.*\)".*/\1/')
if [[ -z $DB_VER ]]; then
echo >&2 "Unable to determine Databricks version"
exit 1
fi
export PYSP_TEST_spark_databricks_clusterUsageTags_sparkVersion=$DB_VER
fi
# Set spark.task.maxFailures for most schedulers.
#
# Local (non-cluster) mode is the exception and does not work with `spark.task.maxFailures`.
# It requires two arguments to the master specification "local[N, K]" where
# N is the number of threads, and K is the maxFailures (otherwise this is hardcoded to 1,
# see https://issues.apache.org/jira/browse/SPARK-2083).
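# For example, a master of "local[4,1]" runs 4 worker threads with maxFailures set to 1.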
export PYSP_TEST_spark_task_maxFailures="$SPARK_TASK_MAXFAILURES"
if ((NUM_LOCAL_EXECS > 0)); then
export PYSP_TEST_spark_master="local-cluster[$NUM_LOCAL_EXECS,$CORES_PER_EXEC,$MB_PER_EXEC]"
else
# If a master is not specified, use "local[N, $SPARK_TASK_MAXFAILURES]" where N is the core count capped at 4
if [ -z "${PYSP_TEST_spark_master}" ] && [[ "$SPARK_SUBMIT_FLAGS" != *"--master"* ]]; then
CPU_CORES=`nproc`
# We limit the number of tasks in local mode to 4 because it helps to reduce the
# total memory usage, especially host memory usage: when copying data to the GPU,
# buffers as large as batchSizeBytes can be allocated, and the fewer of them we have the better.
LOCAL_PARALLEL=$(( $CPU_CORES > 4 ? 4 : $CPU_CORES ))
export PYSP_TEST_spark_master="local[$LOCAL_PARALLEL,$SPARK_TASK_MAXFAILURES]"
fi
fi
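# e.g. with 16 CPU cores and no master specified, the master above becomes "local[4,1]" on Spark 3.1.1+.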
# If you want to change the amount of GPU memory allocated you have to change it here
# and where TEST_PARALLEL is calculated
export PYSP_TEST_spark_rapids_memory_gpu_allocSize='1536m'
if ((${#TEST_PARALLEL_OPTS[@]} > 0));
then
exec python "${RUN_TESTS_COMMAND[@]}" "${TEST_PARALLEL_OPTS[@]}" "${TEST_COMMON_OPTS[@]}"
else
# We set the GPU memory size to be a constant value even if only running with a parallelism of 1
# because it helps us have consistent test runs.
exec "$SPARK_HOME"/bin/spark-submit --jars "${ALL_JARS// /,}" \
--driver-java-options "$PYSP_TEST_spark_driver_extraJavaOptions" \
$SPARK_SUBMIT_FLAGS \
--conf 'spark.rapids.memory.gpu.allocSize='"$PYSP_TEST_spark_rapids_memory_gpu_allocSize" \
"${RUN_TESTS_COMMAND[@]}" "${TEST_COMMON_OPTS[@]}"
fi
fi