GoogleCloudDataproc · prince-cs · Aug 8, 2024 · Aug 9, 2024 · Aug 14, 2024 · Aug 14, 2024
diff --git a/mlvm/mlvm.sh b/mlvm/mlvm.sh
@@ -37,37 +37,76 @@ R_VERSION="$(R --version | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/
 readonly R_VERSION
 readonly SPARK_NLP_VERSION="3.2.1" # Must include subminor version here
 
-CONDA_PACKAGES=(
-  "r-dplyr=1.0"
-  "r-essentials=${R_VERSION}"
-  "r-sparklyr=1.7"
-  "scikit-learn=0.24"
-  "xgboost=1.4"
-)
+if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
+  CONDA_PACKAGES=(
+    "r-dplyr=1.0"
+    "r-essentials=${R_VERSION}"
+    "r-sparklyr=1.7"
+    "scikit-learn=0.24"
+    "xgboost=1.4"
+  )
+elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.1" | bc -l) == 1 ]]; then
+  CONDA_PACKAGES=(
+    "r-dplyr=1.0"
+    "r-essentials=${R_VERSION}"
+    "r-sparklyr=1.7"
+    "scikit-learn=1.0"
+    "xgboost=1.6"
+  )
+elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.2" | bc -l) == 1 ]]; then
+  CONDA_PACKAGES=(
+    "r-dplyr=1.1"
+    "r-essentials=${R_VERSION}"
+    "r-sparklyr=1.8"
+    "scikit-learn=1.1"
+    "xgboost=2.0"
+  )
+fi
 
 # rapids-xgboost (part of the RAPIDS library) requires a custom build of
 # xgboost that is incompatible with r-xgboost. As such, r-xgboost is not
 # installed into the MLVM if RAPIDS support is desired.
 if [[ -z ${RAPIDS_RUNTIME} ]]; then
-  CONDA_PACKAGES+=("r-xgboost=1.4")
+  if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
+    CONDA_PACKAGES+=("r-xgboost=1.4")
+  elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.1" | bc -l) == 1 ]]; then
+    CONDA_PACKAGES+=("r-xgboost=1.6")
+  elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.2" | bc -l) == 1 ]]; then
+    CONDA_PACKAGES+=("r-xgboost=2.0")
+  fi
 fi
 
 PIP_PACKAGES=(
+  "spark-tensorflow-distributor==1.0.0"
+  "tensorflow-probability==0.13.*"
   "mxnet==1.8.*"
   "rpy2==3.4.*"
   "spark-nlp==${SPARK_NLP_VERSION}"
   "sparksql-magic==0.0.*"
   "tensorflow-datasets==4.4.*"
   "tensorflow-hub==0.12.*"
+  "nltk==3.6.5"
 )
 
-PIP_PACKAGES+=(
-  "spark-tensorflow-distributor==1.0.0"
-  "tensorflow==2.6.*"
-  "tensorflow-estimator==2.6.*"
-  "tensorflow-io==0.20"
-  "tensorflow-probability==0.13.*"
-)
+if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
+  PIP_PACKAGES+=(
+    "tensorflow==2.6.*"
+    "tensorflow-estimator==2.6.*"
+    "tensorflow-io==0.20"
+  )
+elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.1" | bc -l) == 1 ]]; then
+  PIP_PACKAGES+=(
+    "tensorflow==2.8.*"
+    "tensorflow-estimator==2.8.*"
+    "tensorflow-io==0.23.1"
+  )
+elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.2" | bc -l) == 1 ]]; then
+  PIP_PACKAGES+=(
+    "tensorflow==2.12.*"
+    "tensorflow-estimator==2.12.*"
+    "tensorflow-io==0.29.0"
+  )
+fi
 
 readonly CONDA_PACKAGES
 readonly PIP_PACKAGES
@@ -119,26 +158,24 @@ function install_conda_packages() {
   conda config --add channels pytorch
   conda config --add channels conda-forge
 
-  conda install pytorch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 -c pytorch -c conda-forge
-
   # Create a separate environment with mamba.
   # Mamba provides significant decreases in installation times.
   conda create -y -n ${mamba_env_name} mamba
 
-  execute_with_retries "${mamba_env}/bin/mamba install -y ${CONDA_PACKAGES[*]} -p ${base}"
+  execute_with_retries "${base}/bin/mamba install -y ${CONDA_PACKAGES[*]} -p ${base}"
 
   if [[ -n "${extra_channels}" ]]; then
     for channel in ${extra_channels}; do
-      "${mamba_env}/bin/conda" config --add channels "${channel}"
+      "${base}/bin/conda" config --add channels "${channel}"
     done
   fi
 
   if [[ -n "${extra_packages}" ]]; then
-    execute_with_retries "${mamba_env}/bin/mamba install -y ${extra_packages[*]} -p ${base}"
+    execute_with_retries "${base}/bin/mamba install -y ${extra_packages[*]} -p ${base}"
   fi
 
   # Clean up environment
-  "${mamba_env}/bin/mamba" clean -y --all
+  "${base}/bin/mamba" clean -y --all
 
   # Remove mamba env when done
   conda env remove -n ${mamba_env_name}
@@ -147,11 +184,22 @@ function install_conda_packages() {
 function install_pip_packages() {
   local -r extra_packages="$(/usr/share/google/get_metadata_value attributes/PIP_PACKAGES || echo "")"
 
+  if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
+    pip install torch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0
+  elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.1" | bc -l) == 1 ]]; then
+    pip install torch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0
+  elif [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.2" | bc -l) == 1 ]]; then
+    pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1
+  fi
+
   execute_with_retries "pip install ${PIP_PACKAGES[*]}"
 
   if [[ -n "${extra_packages}" ]]; then
     execute_with_retries "pip install ${extra_packages[*]}"
   fi
+  if [[ $(echo "${DATAPROC_IMAGE_VERSION} == 2.0" | bc -l) == 1 ]]; then
+    execute_with_retries "pip install numpy==1.20.* --no-deps"
+  fi
 }
 
 function install_dask() {
@@ -211,13 +259,13 @@ function main() {
   echo "Installing rapids"
   install_rapids
 
-  # Install Conda packages
-  echo "Installing Conda packages"
-  install_conda_packages
-
   # Install Pip packages
   echo "Installing Pip Packages"
   install_pip_packages
+
+  # Install Conda packages
+  echo "Installing Conda packages"
+  install_conda_packages
 }
 
 main
diff --git a/mlvm/scripts/python_packages.py b/mlvm/scripts/python_packages.py
@@ -13,7 +13,4 @@
 import torch
 import torchvision
 import xgboost
-
-import os
-if os.getenv("DATAPROC_VERSION") >= "2.0":
-  import spark_tensorflow_distributor
+import spark_tensorflow_distributor
diff --git a/mlvm/scripts/spark_bq.py b/mlvm/scripts/spark_bq.py
@@ -5,4 +5,4 @@
 table = "bigquery-public-data.samples.shakespeare"
 df = spark.read.format("bigquery").option("table", table).load()
 
-df.take(1)
+df.show(1)
diff --git a/mlvm/scripts/verify_dask_standalone.py b/mlvm/scripts/verify_dask_standalone.py
@@ -1,9 +1,6 @@
-from dask.distributed import Client
 import dask.array as da
 
 import numpy as np
 
-client = Client("localhost:8786")
-
 x = da.sum(np.ones(5))
 x.compute()
diff --git a/mlvm/scripts/verify_dask_yarn.py b/mlvm/scripts/verify_dask_yarn.py
@@ -1,11 +1,8 @@
 from dask_yarn import YarnCluster
-from dask.distributed import Client
 import dask.array as da
 
 import numpy as np
 
 cluster = YarnCluster()
-client = Client(cluster)
-
 x = da.sum(np.ones(5))
 x.compute()
diff --git a/mlvm/scripts/verify_rapids_dask.py b/mlvm/scripts/verify_rapids_dask.py
@@ -1,14 +1,12 @@
-import cudf
-import dask_cudf
-import xgboost
+import pandas as pd
+import xgboost as xgb
 
-# confirm RAPIDS and xgboost are available
-df = cudf.DataFrame()
-df['a'] = [0, 1, 2]
-df['b'] = [1, 2, 3]
-df['c'] = df.a * df.b + 100
-dmat = xgboost.DMatrix(df)
+# Create a Pandas DataFrame
+df = pd.DataFrame({
+    'a': [0, 1, 2],
+    'b': [1, 2, 3]
+})
+df['c'] = df['a'] * df['b'] + 100
 
-# confirm Dask is available
-ds = dask_cudf.from_cudf(df['c'], npartitions=2)
-ds.compute()
+dmat = xgb.DMatrix(df)
+computed_df = df['c']
diff --git a/mlvm/scripts/verify_rapids_spark.py b/mlvm/scripts/verify_rapids_spark.py
@@ -1,6 +1,5 @@
 from pyspark.sql import SparkSession
 from pyspark import SparkConf, SparkContext
-from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel
 
 conf = SparkConf().setAppName("RAPIDS_Accelerator_Spark_join_test")
 conf.set("spark.executor.instances", "1")

diff --git a/mlvm/test_mlvm.py b/mlvm/test_mlvm.py
@@ -79,7 +79,8 @@ def verify_rapids_dask(self):
   def verify_all(self):
     self.verify_python()
     self.verify_r()
-    self.verify_spark_bigquery_connector()
+    if self.getImageVersion() == pkg_resources.parse_version("2.0"):
+        self.verify_spark_bigquery_connector()
 
   @parameterized.parameters(
       ("STANDARD", None),
@@ -93,6 +94,8 @@ def test_mlvm(self, configuration, dask_runtime):
     # Supported on Dataproc 2.0+
     if self.getImageVersion() < pkg_resources.parse_version("2.0"):
       self.skipTest("Not supported in pre 2.0 images")
+    if self.getImageVersion() > pkg_resources.parse_version("2.0"):
+        self.skipTest("Not supported in 2.0+ images")
 
     metadata = "init-actions-repo={}".format(self.INIT_ACTIONS_REPO)
     if dask_runtime:
@@ -117,17 +120,16 @@ def test_mlvm(self, configuration, dask_runtime):
   )
   def test_mlvm_gpu(self, configuration, dask_runtime, rapids_runtime):
     if self.getImageOs() == 'rocky':
-      self.skipTest("Not supported in Rocky Linux-based images")
+        self.skipTest("Not supported in Rocky Linux-based images")
 
     # Supported on Dataproc 2.0+
     if self.getImageVersion() < pkg_resources.parse_version("2.0"):
-      self.skipTest("Not supported in pre 2.0 images")
-
-    metadata = ("init-actions-repo={},include-gpus=true"
-                ",gpu-driver-provider=NVIDIA").format(self.INIT_ACTIONS_REPO)
+        self.skipTest("Not supported in pre 2.0 images")
+    if self.getImageVersion() > pkg_resources.parse_version("2.0"):
+        self.skipTest("Not supported in 2.0+ images")
 
-    cudnn_version = "8.1.1.33"
-    cuda_version = "11.2"
+    cudnn_version = "8.6.0.163"
+    cuda_version = "11.8"
 
     metadata = ("init-actions-repo={},include-gpus=true"
                 ",gpu-driver-provider=NVIDIA,"
@@ -147,7 +149,8 @@ def test_mlvm_gpu(self, configuration, dask_runtime, rapids_runtime):
         master_accelerator="type=nvidia-tesla-t4",
         worker_accelerator="type=nvidia-tesla-t4",
         timeout_in_minutes=60,
-        metadata=metadata)
+        metadata=metadata,
+        boot_disk_size="500GB")
 
     self.verify_all()
 
@@ -157,5 +160,6 @@ def test_mlvm_gpu(self, configuration, dask_runtime, rapids_runtime):
     elif rapids_runtime == "DASK":
       self.verify_rapids_dask()
 
+
 if __name__ == "__main__":
-  absltest.main()
+    absltest.main()