[Spark] Upgrade Spark dependency to 3.5.0 in Delta-Spark
The following are the changes needed:
* PySpark 3.5 has dropped support for Python 3.7. This required changes to the Delta test infra to install the appropriate Python version and other packages. The `Dockerfile` used for running tests is also updated with the required Python version and packages, and it now uses the same base image as the PySpark test infra in Apache Spark.
* The `StructType.toAttributes` and `StructType.fromAttributes` methods have moved into the utility class `DataTypeUtils` (see the first sketch after this list).
* The `iceberg` module is disabled, as there is no released version of Iceberg that works with Spark 3.5 yet.
* Remove the URI path hack used in `DMLWithDeletionVectorsHelper` to get around a bug in Spark 3.4.
* Remove unrelated tutorial in `delta/examples/tutorials/saiseu19`
* Test failure fixes
   * `org.apache.spark.sql.delta.DeltaHistoryManagerSuite` - Error message has changed
   * `org.apache.spark.sql.delta.DeltaOptionSuite` - The Parquet file name for the LZ4 codec has changed due to apache/parquet-java#1000 in the `parquet-mr` dependency.
   * `org.apache.spark.sql.delta.deletionvectors.DeletionVectorsSuite` - Parquet now generates a `row-index` whenever the `_metadata` column is selected, but Spark 3.5 has a bug where a row group containing more than 2 billion rows fails. For now, don't return any `row-index` column in `_metadata` by overriding `metadataSchemaFields: Seq[StructField]` in `DeltaParquetFileFormat` (see the second sketch after this list).
   * `org.apache.spark.sql.delta.perf.OptimizeMetadataOnlyDeltaQuerySuite`: A behavior change from apache/spark#40922: Spark now uses a new expression, `ToPrettyString`, instead of casting the aggregate expression to STRING in plans generated for `Dataset.show()`.
   * `org.apache.spark.sql.delta.DeltaCDCStreamDeletionVectorSuite` and `org.apache.spark.sql.delta.DeltaCDCStreamSuite`: A regression in the Spark 3.5 RC, fixed by apache/spark#42774 before the Spark 3.5 release.
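For reference, a minimal sketch of the new call pattern for the `StructType.toAttributes`/`fromAttributes` move, assuming the Spark 3.5 `org.apache.spark.sql.catalyst.types.DataTypeUtils` helpers; the schema below is illustrative only and not taken from the Delta codebase:

```scala
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.types.DataTypeUtils
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

// Illustrative schema, not from the Delta codebase.
val schema = StructType(Seq(
  StructField("id", LongType, nullable = false),
  StructField("name", StringType)))

// Spark 3.4 and earlier: schema.toAttributes
// Spark 3.5: the helper moved off StructType into DataTypeUtils.
val attributes: Seq[AttributeReference] = DataTypeUtils.toAttributes(schema)

// The inverse conversion moved the same way (was StructType.fromAttributes).
val roundTripped: StructType = DataTypeUtils.fromAttributes(attributes)
assert(roundTripped == schema)
```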
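And a rough sketch of the `DeletionVectorsSuite` workaround: a `ParquetFileFormat` subclass that filters the row-index field out of the exposed `_metadata` schema. This is not the actual `DeltaParquetFileFormat` code, and the field name `row_index` is an assumption about how the row-index metadata field is named:

```scala
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.types.StructField

// Hypothetical sketch, not the real DeltaParquetFileFormat: hide the
// Parquet-generated row-index field from _metadata so selecting _metadata
// does not trip the Spark 3.5 bug on row groups with more than 2bn rows.
class RowIndexHidingParquetFileFormat extends ParquetFileFormat {
  override def metadataSchemaFields: Seq[StructField] =
    super.metadataSchemaFields.filterNot(_.name == "row_index") // assumed field name
}
```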

Closes #1986

GitOrigin-RevId: b0e4a81b608a857e45ecba71b070309347616a30
vkorukanti committed Oct 6, 2023
1 parent bcd1867 commit 4f9c8b9
Showing 58 changed files with 303 additions and 1,846 deletions.
14 changes: 9 additions & 5 deletions .github/workflows/spark_test.yaml
@@ -46,17 +46,21 @@ jobs:
export PATH="~/.pyenv/bin:$PATH"
eval "$(pyenv init -)"
eval "$(pyenv virtualenv-init -)"
pyenv install 3.7.4
pyenv global system 3.7.4
pipenv --python 3.7 install
pipenv run pip install pyspark==3.4.0
pyenv install 3.8.18
pyenv global system 3.8.18
pipenv --python 3.8 install
pipenv run pip install pyspark==3.5.0
pipenv run pip install flake8==3.5.0 pypandoc==1.3.3
pipenv run pip install importlib_metadata==3.10.0
pipenv run pip install mypy==0.910
pipenv run pip install mypy==0.982
pipenv run pip install cryptography==37.0.4
pipenv run pip install twine==4.0.1
pipenv run pip install wheel==0.33.4
pipenv run pip install setuptools==41.0.1
pipenv run pip install pydocstyle==3.0.0
pipenv run pip install pandas==1.0.5
pipenv run pip install pyarrow==8.0.0
pipenv run pip install numpy==1.20.3
if: steps.git-diff.outputs.diff
- name: Run Scala/Java and Python tests
run: |
27 changes: 21 additions & 6 deletions Dockerfile
@@ -13,19 +13,34 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM ubuntu:focal-20221019

# Debian buster LTS support lasts until June 30th, 2024. [TODO] Upgrade to a newer version before then.
FROM openjdk:8-jdk-buster
ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

RUN apt-get update
RUN apt-get install -y software-properties-common
RUN apt-get install -y curl
RUN apt-get install -y wget
RUN apt-get install -y openjdk-8-jdk
RUN apt-get install -y python3.8
RUN apt-get install -y python3-pip

# Install pip3
RUN apt-get update && apt-get install -y python3-pip
# Upgrade pip. This is needed to use prebuilt wheels for packages cffi (dep of cryptography) and
# cryptography. Otherwise, building wheels for these packages fails.
RUN pip3 install --upgrade pip

RUN pip3 install pyspark==3.4.0
RUN pip3 install pyspark==3.5.0

RUN pip3 install mypy==0.982

RUN pip3 install pydocstyle==3.0.0

RUN pip3 install pandas==1.0.5

RUN pip3 install pyarrow==8.0.0

RUN pip3 install mypy==0.910
RUN pip3 install numpy==1.20.3

RUN pip3 install importlib_metadata==3.10.0

10 changes: 7 additions & 3 deletions build.sbt
@@ -35,7 +35,7 @@ val default_scala_version = settingKey[String]("Default Scala version")
Global / default_scala_version := scala212

// Dependent library versions
val sparkVersion = "3.4.0"
val sparkVersion = "3.5.0"
val flinkVersion = "1.16.1"
val hadoopVersion = "3.3.1"
val scalaTestVersion = "3.2.15"
@@ -115,6 +115,8 @@ lazy val spark = (project in file("spark"))
"org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",
"org.apache.spark" %% "spark-hive" % sparkVersion % "test" classifier "tests",
),
// For adding staged Spark RC versions, e.g.:
// resolvers += "Apache Spark 3.5.0 (RC1) Staging" at "https://repository.apache.org/content/repositories/orgapachespark-1444/",
Compile / packageBin / mappings := (Compile / packageBin / mappings).value ++
listPythonFiles(baseDirectory.value.getParentFile / "python"),

@@ -322,6 +324,8 @@ val icebergSparkRuntimeArtifactName = {
s"iceberg-spark-runtime-$expMaj.$expMin"
}

/**
* Need an Iceberg release version with support for Spark 3.5
lazy val testDeltaIcebergJar = (project in file("testDeltaIcebergJar"))
// delta-iceberg depends on delta-spark! So, we need to include it during our test.
.dependsOn(spark % "test")
@@ -451,7 +455,7 @@ lazy val icebergShaded = (project in file("icebergShaded"))
assemblyPackageScala / assembleArtifact := false,
// Make the 'compile' invoke the 'assembly' task to generate the uber jar.
)

*/
lazy val hive = (project in file("connectors/hive"))
.dependsOn(standaloneCosmetic)
.settings (
@@ -1081,7 +1085,7 @@ val createTargetClassesDir = taskKey[Unit]("create target classes dir")

// Don't use these groups for any other projects
lazy val sparkGroup = project
.aggregate(spark, contribs, storage, storageS3DynamoDB, iceberg)
.aggregate(spark, contribs, storage, storageS3DynamoDB)
.settings(
// crossScalaVersions must be set to Nil on the aggregating project
crossScalaVersions := Nil,
2 changes: 1 addition & 1 deletion dev/tox.ini
@@ -19,4 +19,4 @@ ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504
max-line-length=100
exclude=cloudpickle.py,heapq3.py,shared.py,python/docs/conf.py,work/*/*.py,python/.eggs/*,dist/*
[pydocstyle]
ignore=D100,D101,D102,D103,D104,D105,D106,D107,D200,D201,D202,D203,D204,D205,D206,D207,D208,D209,D210,D211,D212,D213,D214,D215,D300,D301,D302,D400,D401,D402,D403,D404,D405,D406,D407,D408,D409,D410,D411,D412,D413,D414
ignore=D100,D101,D102,D103,D104,D105,D106,D107,D200,D201,D202,D203,D204,D205,D206,D207,D208,D209,D210,D211,D212,D213,D214,D215,D300,D301,D302,D400,D401,D402,D403,D404,D405,D406,D407,D408,D409,D410,D411,D412,D413,D414,D415,D417
2 changes: 1 addition & 1 deletion docs/generate_api_docs.py
@@ -142,7 +142,7 @@ def run_cmd(cmd, throw_on_error=True, env=None, stream_output=False, **kwargs):
stderr = stderr.decode("UTF-8")

exit_code = child.wait()
if throw_on_error and exit_code is not 0:
if throw_on_error and exit_code != 0:
raise Exception(
"Non-zero exitcode: %s\n\nSTDOUT:\n%s\n\nSTDERR:%s" %
(exit_code, stdout, stderr))