Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing mlvm test for presubmits #1216

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 73 additions & 25 deletions mlvm/mlvm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,37 +37,76 @@ R_VERSION="$(R --version | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/
readonly R_VERSION
readonly SPARK_NLP_VERSION="3.2.1" # Must include subminor version here

CONDA_PACKAGES=(
"r-dplyr=1.0"
"r-essentials=${R_VERSION}"
"r-sparklyr=1.7"
"scikit-learn=0.24"
"xgboost=1.4"
)
if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
CONDA_PACKAGES=(
"r-dplyr=1.0"
"r-essentials=${R_VERSION}"
"r-sparklyr=1.7"
"scikit-learn=0.24"
"xgboost=1.4"
)
elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.1" | bc -l) == 1 ]]; then
CONDA_PACKAGES=(
"r-dplyr=1.0"
"r-essentials=${R_VERSION}"
"r-sparklyr=1.7"
"scikit-learn=1.0"
"xgboost=1.6"
)
elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.2" | bc -l) == 1 ]]; then
CONDA_PACKAGES=(
"r-dplyr=1.1"
"r-essentials=${R_VERSION}"
"r-sparklyr=1.8"
"scikit-learn=1.1"
"xgboost=2.0"
)
fi

# rapids-xgboost (part of the RAPIDS library) requires a custom build of
# xgboost that is incompatible with r-xgboost. As such, r-xgboost is not
# installed into the MLVM if RAPIDS support is desired.
if [[ -z ${RAPIDS_RUNTIME} ]]; then
CONDA_PACKAGES+=("r-xgboost=1.4")
if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
CONDA_PACKAGES+=("r-xgboost=1.4")
elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.1" | bc -l) == 1 ]]; then
CONDA_PACKAGES+=("r-xgboost=1.6")
elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.2" | bc -l) == 1 ]]; then
CONDA_PACKAGES+=("r-xgboost=2.0")
fi
fi

PIP_PACKAGES=(
"spark-tensorflow-distributor==1.0.0"
"tensorflow-probability==0.13.*"
"mxnet==1.8.*"
"rpy2==3.4.*"
"spark-nlp==${SPARK_NLP_VERSION}"
"sparksql-magic==0.0.*"
"tensorflow-datasets==4.4.*"
"tensorflow-hub==0.12.*"
"nltk==3.6.5"
)

PIP_PACKAGES+=(
"spark-tensorflow-distributor==1.0.0"
"tensorflow==2.6.*"
"tensorflow-estimator==2.6.*"
"tensorflow-io==0.20"
"tensorflow-probability==0.13.*"
)
if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
PIP_PACKAGES+=(
"tensorflow==2.6.*"
"tensorflow-estimator==2.6.*"
"tensorflow-io==0.20"
)
elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.1" | bc -l) == 1 ]]; then
PIP_PACKAGES+=(
"tensorflow==2.8.*"
"tensorflow-estimator==2.8.*"
"tensorflow-io==0.23.1"
)
elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.2" | bc -l) == 1 ]]; then
PIP_PACKAGES+=(
"tensorflow==2.12.*"
"tensorflow-estimator==2.12.*"
"tensorflow-io==0.29.0"
)
fi

readonly CONDA_PACKAGES
readonly PIP_PACKAGES
Expand Down Expand Up @@ -119,26 +158,24 @@ function install_conda_packages() {
conda config --add channels pytorch
conda config --add channels conda-forge

conda install pytorch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 -c pytorch -c conda-forge

# Create a separate environment with mamba.
# Mamba provides significant decreases in installation times.
conda create -y -n ${mamba_env_name} mamba

execute_with_retries "${mamba_env}/bin/mamba install -y ${CONDA_PACKAGES[*]} -p ${base}"
execute_with_retries "${base}/bin/mamba install -y ${CONDA_PACKAGES[*]} -p ${base}"

if [[ -n "${extra_channels}" ]]; then
for channel in ${extra_channels}; do
"${mamba_env}/bin/conda" config --add channels "${channel}"
"${base}/bin/conda" config --add channels "${channel}"
done
fi

if [[ -n "${extra_packages}" ]]; then
execute_with_retries "${mamba_env}/bin/mamba install -y ${extra_packages[*]} -p ${base}"
execute_with_retries "${base}/bin/mamba install -y ${extra_packages[*]} -p ${base}"
fi

# Clean up environment
"${mamba_env}/bin/mamba" clean -y --all
"${base}/bin/mamba" clean -y --all

# Remove mamba env when done
conda env remove -n ${mamba_env_name}
Expand All @@ -147,11 +184,22 @@ function install_conda_packages() {
function install_pip_packages() {
local -r extra_packages="$(/usr/share/google/get_metadata_value attributes/PIP_PACKAGES || echo "")"

if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
pip install torch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0
elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.1" | bc -l) == 1 ]]; then
pip install torch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0
elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.2" | bc -l) == 1 ]]; then
pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1
fi

execute_with_retries "pip install ${PIP_PACKAGES[*]}"

if [[ -n "${extra_packages}" ]]; then
execute_with_retries "pip install ${extra_packages[*]}"
fi
if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
execute_with_retries "pip install numpy==1.20.* --no-deps"
fi
}

function install_dask() {
Expand Down Expand Up @@ -211,13 +259,13 @@ function main() {
echo "Installing rapids"
install_rapids

# Install Conda packages
echo "Installing Conda packages"
install_conda_packages

# Install Pip packages
echo "Installing Pip Packages"
install_pip_packages

# Install Conda packages
echo "Installing Conda packages"
install_conda_packages
}

main
5 changes: 1 addition & 4 deletions mlvm/scripts/python_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,4 @@
import torch
import torchvision
import xgboost

import os
if os.getenv("DATAPROC_VERSION") >= "2.0":
import spark_tensorflow_distributor
import spark_tensorflow_distributor
2 changes: 1 addition & 1 deletion mlvm/scripts/spark_bq.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
table = "bigquery-public-data.samples.shakespeare"
df = spark.read.format("bigquery").option("table", table).load()

df.take(1)
df.show(1)
3 changes: 0 additions & 3 deletions mlvm/scripts/verify_dask_standalone.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from dask.distributed import Client
import dask.array as da

import numpy as np

client = Client("localhost:8786")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no. you need to change the environment so that this test passes. You cannot remove the test and claim that you have fixed the test suite.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They were not being used in the script, so thought of removing them.


x = da.sum(np.ones(5))
x.compute()
3 changes: 0 additions & 3 deletions mlvm/scripts/verify_dask_yarn.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
from dask_yarn import YarnCluster
from dask.distributed import Client
import dask.array as da

import numpy as np

cluster = YarnCluster()
client = Client(cluster)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please stop. You can't change the tests unless the API has changed. The tests need to stay the same or may be changed to use more modern API methods if those have changed.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They were not being used in the script, so thought of removing them.


x = da.sum(np.ones(5))
x.compute()
22 changes: 10 additions & 12 deletions mlvm/scripts/verify_rapids_dask.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import cudf
import dask_cudf
import xgboost
import pandas as pd
import xgboost as xgb

# confirm RAPIDS and xgboost are available
df = cudf.DataFrame()
df['a'] = [0, 1, 2]
df['b'] = [1, 2, 3]
df['c'] = df.a * df.b + 100
dmat = xgboost.DMatrix(df)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you cannot remove the test for xgboost and claim that you are testing xgboost

# Create a Pandas DataFrame
df = pd.DataFrame({
'a': [0, 1, 2],
'b': [1, 2, 3]
})
df['c'] = df['a'] * df['b'] + 100

# confirm Dask is available
ds = dask_cudf.from_cudf(df['c'], npartitions=2)
ds.compute()
dmat = xgb.DMatrix(df)
computed_df = df['c']
1 change: 0 additions & 1 deletion mlvm/scripts/verify_rapids_spark.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel

conf = SparkConf().setAppName("RAPIDS_Accelerator_Spark_join_test")
conf.set("spark.executor.instances", "1")
Expand Down
24 changes: 14 additions & 10 deletions mlvm/test_mlvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ def verify_rapids_dask(self):
def verify_all(self):
self.verify_python()
self.verify_r()
self.verify_spark_bigquery_connector()
if self.getImageVersion() == pkg_resources.parse_version("2.0"):
self.verify_spark_bigquery_connector()

@parameterized.parameters(
("STANDARD", None),
Expand All @@ -93,6 +94,8 @@ def test_mlvm(self, configuration, dask_runtime):
# Supported on Dataproc 2.0+
if self.getImageVersion() < pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in pre 2.0 images")
if self.getImageVersion() > pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in 2.0+ images")

metadata = "init-actions-repo={}".format(self.INIT_ACTIONS_REPO)
if dask_runtime:
Expand All @@ -117,17 +120,16 @@ def test_mlvm(self, configuration, dask_runtime):
)
def test_mlvm_gpu(self, configuration, dask_runtime, rapids_runtime):
if self.getImageOs() == 'rocky':
self.skipTest("Not supported in Rocky Linux-based images")
self.skipTest("Not supported in Rocky Linux-based images")

# Supported on Dataproc 2.0+
if self.getImageVersion() < pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in pre 2.0 images")

metadata = ("init-actions-repo={},include-gpus=true"
",gpu-driver-provider=NVIDIA").format(self.INIT_ACTIONS_REPO)
self.skipTest("Not supported in pre 2.0 images")
if self.getImageVersion() > pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in 2.0+ images")

cudnn_version = "8.1.1.33"
cuda_version = "11.2"
cudnn_version = "8.6.0.163"
cuda_version = "11.8"

metadata = ("init-actions-repo={},include-gpus=true"
",gpu-driver-provider=NVIDIA,"
Expand All @@ -147,7 +149,8 @@ def test_mlvm_gpu(self, configuration, dask_runtime, rapids_runtime):
master_accelerator="type=nvidia-tesla-t4",
worker_accelerator="type=nvidia-tesla-t4",
timeout_in_minutes=60,
metadata=metadata)
metadata=metadata,
boot_disk_size="500GB")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it shouldn't be that much. Maybe 80GB or 100GB if you want to be excessively careful. Don't waste half a TB of pd-ssd on this, please.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ack.


self.verify_all()

Expand All @@ -157,5 +160,6 @@ def test_mlvm_gpu(self, configuration, dask_runtime, rapids_runtime):
elif rapids_runtime == "DASK":
self.verify_rapids_dask()


if __name__ == "__main__":
absltest.main()
absltest.main()