Skip to content

Commit

Permalink
[SPARK-23300][TESTS][BRANCH-2.3] Prints out if Pandas and PyArrow are…
Browse files Browse the repository at this point in the history
… installed or not in PySpark SQL tests

This PR backports #20473 to branch-2.3.

Author: hyukjinkwon <[email protected]>

Closes #20533 from HyukjinKwon/backport-20473.
  • Loading branch information
HyukjinKwon committed Feb 8, 2018
1 parent 05239af commit 2ba07d5
Showing 1 changed file with 55 additions and 1 deletion.
56 changes: 55 additions & 1 deletion python/run-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import Queue
else:
import queue as Queue
from distutils.version import LooseVersion


# Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module
Expand All @@ -39,7 +40,7 @@

from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings)
from sparktestsupport.shellutils import which, subprocess_check_output # noqa
from sparktestsupport.modules import all_modules # noqa
from sparktestsupport.modules import all_modules, pyspark_sql # noqa


# Index every module that declares Python test goals by its name, leaving out the 'root' module.
python_modules = dict(
    (module.name, module)
    for module in all_modules
    if module.python_test_goals and module.name != 'root')
Expand Down Expand Up @@ -151,6 +152,55 @@ def parse_opts():
return opts


def _check_package_version(python_exec, import_name, display_name, minimum_version):
    """Log whether `python_exec` provides a package at `minimum_version` or newer.

    :param python_exec: Python executable that is asked for the package version.
    :param import_name: name used to import the package, e.g. ``pandas``.
    :param display_name: name of the package as written in the log messages, e.g. ``Pandas``.
    :param minimum_version: lowest version string that is accepted.
    """
    try:
        # The version is queried by running the given executable itself. Its stderr is
        # discarded, and the devnull handle is closed as soon as the subprocess returns.
        with open(os.devnull, 'w') as devnull:
            found_version = subprocess_check_output(
                [python_exec, "-c",
                 "import %s; print(%s.__version__)" % (import_name, import_name)],
                universal_newlines=True,
                stderr=devnull).strip()
        # Compared inside the `try` on purpose: a failure to compare the two versions is
        # reported the same way as a failed import instead of aborting the test run.
        is_supported = LooseVersion(found_version) >= LooseVersion(minimum_version)
    except Exception:
        # `Exception` rather than a bare `except`, so that KeyboardInterrupt and SystemExit
        # still propagate and are not misreported as a missing package.
        LOGGER.warning(
            "Will skip %s related features against Python executable "
            "'%s' in '%s' module. %s >= %s is required; however, %s "
            "was not found." % (
                display_name, python_exec, pyspark_sql.name,
                display_name, minimum_version, display_name))
        return

    if is_supported:
        LOGGER.info(
            "Will test %s related features against Python executable "
            "'%s' in '%s' module." % (display_name, python_exec, pyspark_sql.name))
    else:
        LOGGER.warning(
            "Will skip %s related features against Python executable "
            "'%s' in '%s' module. %s >= %s is required; however, %s "
            "%s was found." % (
                display_name, python_exec, pyspark_sql.name,
                display_name, minimum_version, display_name, found_version))


def _check_dependencies(python_exec, modules_to_test):
    """Log whether PyArrow and Pandas are usable with `python_exec`. See SPARK-23300.

    Nothing is checked unless the 'pyspark-sql' module is among `modules_to_test`. This
    function only logs; it returns nothing and does not change which tests are run.

    :param python_exec: Python executable that is asked for the package versions.
    :param modules_to_test: collection of modules selected for this test run.
    """
    if pyspark_sql not in modules_to_test:
        return

    # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
    minimum_pyarrow_version = '0.8.0'
    minimum_pandas_version = '0.19.2'

    _check_package_version(python_exec, "pyarrow", "PyArrow", minimum_pyarrow_version)
    _check_package_version(python_exec, "pandas", "Pandas", minimum_pandas_version)


def main():
opts = parse_opts()
if (opts.verbose):
Expand All @@ -175,6 +225,10 @@ def main():

task_queue = Queue.PriorityQueue()
for python_exec in python_execs:
# Check if the python executable has proper dependencies installed to run tests
# for given modules properly.
_check_dependencies(python_exec, modules_to_test)

python_implementation = subprocess_check_output(
[python_exec, "-c", "import platform; print(platform.python_implementation())"],
universal_newlines=True).strip()
Expand Down

0 comments on commit 2ba07d5

Please sign in to comment.