diff --git a/.github/workflows/ci-tests-r.yml b/.github/workflows/ci-tests-r.yml
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/build.sbt b/build.sbt
index 2345fd768c..9d15620b08 100644
--- a/build.sbt
+++ b/build.sbt
@@ -8,7 +8,7 @@ import scala.xml.transform.{RewriteRule, RuleTransformer}
 import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _}
 
 val condaEnvName = "synapseml"
-val sparkVersion = "3.2.3"
+val sparkVersion = "3.3.1"
 name := "synapseml"
 ThisBuild / organization := "com.microsoft.azure"
 ThisBuild / scalaVersion := "2.12.15"
diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/RegressionBase.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/RegressionBase.scala
index 9b6c7f9d08..1ef9ac6e85 100644
--- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/RegressionBase.scala
+++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/RegressionBase.scala
@@ -43,6 +43,7 @@ abstract class RegressionBase {
     * when running on Spark 3.0.* and 3.1.*.
     * Workaround: use reflection to construct the implementation.
     */
+  // TODO: Check whether this workaround is still needed on Spark 3.3.x.
   implicit lazy val sumImpl: sum.Impl[BroadcastedColumns[BDM[Double], BDV[Double]], Transpose[BDV[Double]]] = {
     Try {
       // This works for breeze 1.2
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala
index 8e3fd1e85f..af95437c9c 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala
@@ -101,7 +101,7 @@ object RTestGen {
              |  "spark.sql.shuffle.partitions=10",
              |  "spark.sql.crossJoin.enabled=true")
              |
-             |sc <- spark_connect(master = "local", version = "3.2.4", config = conf)
+             |sc <- spark_connect(master = "local", version = "3.3.1", config = conf)
              |
              |""".stripMargin, StandardOpenOption.CREATE)
 
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksGPUTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksGPUTests.scala
index be308c7af7..d99ac4a672 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksGPUTests.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksGPUTests.scala
@@ -11,11 +11,7 @@ import java.io.File
 import scala.collection.mutable.ListBuffer
 
 class DatabricksGPUTests extends DatabricksTestHelper {
-  val horovodInstallationScript: File = FileUtilities.join(
-    BuildInfo.baseDirectory.getParent, "deep-learning",
-    "src", "main", "python", "horovod_installation.sh").getCanonicalFile
-  uploadFileToDBFS(horovodInstallationScript, "/FileStore/horovod-fix-commit/horovod_installation.sh")
-  val clusterId: String = createClusterInPool(GPUClusterName, AdbGpuRuntime, 2, GpuPoolId, GPUInitScripts)
+  val clusterId: String = createClusterInPool(GPUClusterName, AdbGpuRuntime, 2, GpuPoolId, "[]")
   val jobIdsToCancel: ListBuffer[Int] = databricksTestHelper(
     clusterId, GPULibraries, GPUNotebooks)
 
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala
index c60eac2634..1ac4872632 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala
@@ -29,10 +29,11 @@ object DatabricksUtilities {
 
   // ADB Info
   val Region = "eastus"
-  val PoolName = "synapseml-build-10.4"
-  val GpuPoolName = "synapseml-build-10.4-gpu"
-  val AdbRuntime = "10.4.x-scala2.12"
-  val AdbGpuRuntime = "10.4.x-gpu-ml-scala2.12"
+  val PoolName = "synapseml-build-11.2"
+  val GpuPoolName = "synapseml-build-11.2-gpu"
+  val AdbRuntime = "11.2.x-scala2.12"
+  // https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/11.2
+  val AdbGpuRuntime = "11.2.x-gpu-ml-scala2.12"
   val NumWorkers = 5
   val AutoTerminationMinutes = 15
 
@@ -72,6 +73,8 @@ object DatabricksUtilities {
   // TODO: install synapse.ml.dl wheel package here
   val GPULibraries: String = List(
     Map("maven" -> Map("coordinates" -> PackageMavenCoordinate, "repo" -> PackageRepository)),
+    Map("pypi" -> Map("package" -> "pytorch-lightning==1.5.0")),
+    Map("pypi" -> Map("package" -> "torchvision==0.12.0")),
     Map("pypi" -> Map("package" -> "transformers==4.15.0")),
     Map("pypi" -> Map("package" -> "petastorm==0.12.0"))
   ).toJson.compactPrint
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala
index 317218c08d..478e829d79 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala
@@ -255,7 +255,7 @@
        |    "nodeSizeFamily": "MemoryOptimized",
        |    "provisioningState": "Succeeded",
        |    "sessionLevelPackagesEnabled": "true",
-       |    "sparkVersion": "3.2"
+       |    "sparkVersion": "3.3"
        |  }
        |}
        |""".stripMargin
diff --git a/environment.yml b/environment.yml
index 0bba06b1f3..8879472629 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,8 +11,7 @@ dependencies:
   - r-devtools=2.4.2
   - pip:
       - pyarrow>=0.15.0
-      - numpy>=1.19.3
-      - pyspark==3.2.3
+      - pyspark==3.3.1
       - pandas==1.2.5
       - wheel
       - sphinx==4.2.0
@@ -32,6 +31,7 @@ dependencies:
       - twine
       - jupyter
       - mlflow
+      - numpy==1.23.0
       - torch==1.11.0
       - torchvision==0.12.0
       - horovod==0.25.0
diff --git a/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb b/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb
index b19b31c56c..97dd1c59ea 100644
--- a/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb
+++ b/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb
@@ -141,7 +141,7 @@
     "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
     "lr_train_data = featurizer.transform(train_data)[\"target\", \"features\"]\n",
     "lr_test_data = featurizer.transform(test_data)[\"target\", \"features\"]\n",
-    "display(lr_train_data.limit(10).toPandas())"
+    "display(lr_train_data.limit(10))"
    ]
   },
   {
@@ -156,7 +156,7 @@
     "lr_model = lr.fit(lr_train_data)\n",
     "lr_predictions = lr_model.transform(lr_test_data)\n",
     "\n",
-    "display(lr_predictions.limit(10).toPandas())"
+    "display(lr_predictions.limit(10))"
    ]
   },
   {
@@ -210,7 +210,7 @@
     "\n",
     "vw_train_data = vw_featurizer.transform(train_data)[\"target\", \"features\"]\n",
     "vw_test_data = vw_featurizer.transform(test_data)[\"target\", \"features\"]\n",
-    "display(vw_train_data.limit(10).toPandas())"
+    "display(vw_train_data.limit(10))"
    ]
   },
   {
@@ -236,7 +236,7 @@
     "vw_model = vwr.fit(vw_train_data_2.repartition(1))\n",
     "vw_predictions = vw_model.transform(vw_test_data)\n",
     "\n",
-    "display(vw_predictions.limit(10).toPandas())"
+    "display(vw_predictions.limit(10))"
    ]
   },
   {
diff --git a/pipeline.yaml b/pipeline.yaml
index a6d93ae801..0d373d4124 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -513,7 +513,7 @@ jobs:
       fi
       sbt publishM2
 
-      SPARK_VERSION=3.2.4
-      HADOOP_VERSION=3.2
+      SPARK_VERSION=3.3.1
+      HADOOP_VERSION=3
       wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
       (timeout 20m sbt "project $(PACKAGE)" coverage testR) || (echo "retrying" && timeout 20m sbt "project $(PACKAGE)" coverage testR) || (echo "retrying" && timeout 20m sbt "project $(PACKAGE)" coverage testR)
diff --git a/start b/start
index ebaddb8ef8..3c1805ce7f 100644
--- a/start
+++ b/start
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 export OPENMPI_VERSION="3.1.2"
-export SPARK_VERSION="3.2.3"
+export SPARK_VERSION="3.3.1"
 export HADOOP_VERSION="2.7"
 export SYNAPSEML_VERSION="0.11.1" # Binder compatibility version
 
diff --git a/tools/docker/demo/Dockerfile b/tools/docker/demo/Dockerfile
index 15df1f203a..aa8cc81ce6 100644
--- a/tools/docker/demo/Dockerfile
+++ b/tools/docker/demo/Dockerfile
@@ -3,8 +3,8 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
 ARG SYNAPSEML_VERSION=0.11.1
 ARG DEBIAN_FRONTEND=noninteractive
 
-ENV SPARK_VERSION=3.2.3
-ENV HADOOP_VERSION=2.7
+ENV SPARK_VERSION=3.3.1
+ENV HADOOP_VERSION=3
 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
 ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64
 
diff --git a/tools/docker/minimal/Dockerfile b/tools/docker/minimal/Dockerfile
index 0fe4d04ec6..daa06a0ed3 100644
--- a/tools/docker/minimal/Dockerfile
+++ b/tools/docker/minimal/Dockerfile
@@ -3,8 +3,8 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
 ARG SYNAPSEML_VERSION=0.11.1
 ARG DEBIAN_FRONTEND=noninteractive
 
-ENV SPARK_VERSION=3.2.3
-ENV HADOOP_VERSION=2.7
+ENV SPARK_VERSION=3.3.1
+ENV HADOOP_VERSION=3
 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
 ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64
 
diff --git a/tools/dotnet/dotnetSetup.sh b/tools/dotnet/dotnetSetup.sh
index 1244caf479..297d37310a 100644
--- a/tools/dotnet/dotnetSetup.sh
+++ b/tools/dotnet/dotnetSetup.sh
@@ -20,11 +20,11 @@ echo "##vso[task.setvariable variable=DOTNET_WORKER_DIR]$DOTNET_WORKER_DIR"
 # Install Sleet
 dotnet tool install -g sleet
 
-# Install Apache Spark-3.2
-curl https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz -o spark-3.2.0-bin-hadoop3.2.tgz
+# Install Apache Spark-3.3
+curl https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz -o spark-3.3.1-bin-hadoop3.tgz
 mkdir ~/bin
-tar -xzvf spark-3.2.0-bin-hadoop3.2.tgz -C ~/bin
-export SPARK_HOME=~/bin/spark-3.2.0-bin-hadoop3.2/
+tar -xzvf spark-3.3.1-bin-hadoop3.tgz -C ~/bin
+export SPARK_HOME=~/bin/spark-3.3.1-bin-hadoop3/
 export PATH=$SPARK_HOME/bin:$PATH
 echo "##vso[task.setvariable variable=SPARK_HOME]$SPARK_HOME"
 echo "##vso[task.setvariable variable=PATH]$SPARK_HOME/bin:$PATH"
diff --git a/tools/tests/run_r_tests.R b/tools/tests/run_r_tests.R
index 0d66844fef..365df33955 100644
--- a/tools/tests/run_r_tests.R
+++ b/tools/tests/run_r_tests.R
@@ -3,7 +3,7 @@ if (!require("sparklyr")) {
   library("sparklyr")
 }
 
-spark_install_tar(paste(getwd(), "/../../../../../../spark-3.2.4-bin-hadoop3.2.tgz", sep = ""))
+spark_install_tar(paste(getwd(), "/../../../../../../spark-3.3.1-bin-hadoop3.tgz", sep = ""))
 
 options("testthat.output_file" = "../../../../r-test-results.xml")
 devtools::test(reporter = JunitReporter$new())