
Commit

Fix schema evolution issue with nested struct (within a map) and column renaming

Resolved the issue described in [Bug #3227](#3227), where adding a field inside a struct nested within a map while renaming a top-level column caused the operation to fail.

The fix handles these schema changes without affecting the integrity of existing data, specifically the combination of a new nested field inside a map's value struct and a renamed top-level column.

fix!: Renamed the added DeltaWriteExample to EvolutionWithMap

fix!: Modified TypeWideningInsertSchemaEvolutionSuite to account for schema evolution now being allowed for maps

Signed-off-by: Sola Richard Olorunfemi <[email protected]>

fix!: Extended addCastsToMaps to handle complex types; added tests to cover the new capability

fix: Resolved Scalastyle error
Sola Richard Olorunfemi authored and Sola Richard Olorunfemi committed Nov 22, 2024
1 parent 68275d1 commit 6fad5cc
Showing 4 changed files with 377 additions and 25 deletions.
98 changes: 98 additions & 0 deletions examples/scala/src/main/scala/example/EvolutionWithMap.scala
@@ -0,0 +1,98 @@
/*
 * Copyright (2021) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package example

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession

object EvolutionWithMap {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("EvolutionWithMap")
      .master("local[*]")
      .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
      .getOrCreate()

    import spark.implicits._

    val tableName = "insert_map_schema_evolution"

    try {
      // Define the initial schema
      val initialSchema = StructType(Seq(
        StructField("key", IntegerType, nullable = false),
        StructField("metrics", MapType(StringType, StructType(Seq(
          StructField("id", IntegerType, nullable = false),
          StructField("value", IntegerType, nullable = false)
        ))))
      ))

      val data = Seq(
        Row(1, Map("event" -> Row(1, 1)))
      )

      val rdd = spark.sparkContext.parallelize(data)

      val initialDf = spark.createDataFrame(rdd, initialSchema)

      initialDf.write
        .option("overwriteSchema", "true")
        .mode("overwrite")
        .format("delta")
        .saveAsTable(tableName)

      // Define the evolved schema with a simultaneous change of a StructField name
      // and an additional field inside the map's value struct
      val evolvedSchema = StructType(Seq(
        StructField("renamed_key", IntegerType, nullable = false),
        StructField("metrics", MapType(StringType, StructType(Seq(
          StructField("id", IntegerType, nullable = false),
          StructField("value", IntegerType, nullable = false),
          StructField("comment", StringType, nullable = true)
        ))))
      ))

      val evolvedData = Seq(
        Row(1, Map("event" -> Row(1, 1, "deprecated")))
      )

      val evolvedRDD = spark.sparkContext.parallelize(evolvedData)

      val modifiedDf = spark.createDataFrame(evolvedRDD, evolvedSchema)

      // This append would fail without schema evolution support for map types
      modifiedDf.write
        .mode("append")
        .option("mergeSchema", "true")
        .format("delta")
        .insertInto(tableName)

      spark.sql(s"SELECT * FROM $tableName").show(false)

    } finally {
      // Cleanup
      spark.sql(s"DROP TABLE IF EXISTS $tableName")

      spark.stop()
    }
  }
}
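As a side note (not part of this commit): instead of passing the per-write mergeSchema option, Delta's pre-existing session-wide auto-merge flag enables the same evolution for all writes. A minimal sketch, reusing spark, modifiedDf, and tableName from the example above:

```scala
// Alternative to .option("mergeSchema", "true"): enable Delta's automatic
// schema merging for the whole session (pre-existing Delta config, not
// something introduced by this commit).
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

modifiedDf.write
  .mode("append")
  .format("delta")
  .insertInto(tableName)
```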
@@ -69,6 +69,7 @@ import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.sql.util.CaseInsensitiveStringMap


/**
 * Analysis rules for Delta. Currently, these rules enable schema enforcement / evolution with
 * INSERT INTO.
@@ -913,8 +914,8 @@ class DeltaAnalysis(session: SparkSession)
  }

  private def addCastToColumn(
-     attr: Attribute,
-     targetAttr: Attribute,
+     attr: NamedExpression,
+     targetAttr: NamedExpression,
      tblName: String,
      allowTypeWidening: Boolean): NamedExpression = {
    val expr = (attr.dataType, targetAttr.dataType) match {
@@ -930,6 +931,8 @@
      // Keep the type from the query, the target schema will be updated to widen the existing
      // type to match it.
      attr
+     case (s: MapType, t: MapType) if s != t =>
+       addCastsToMaps(tblName, attr, s, t, allowTypeWidening)
      case _ =>
        getCastFunction(attr, targetAttr.dataType, targetAttr.name)
    }
@@ -1047,8 +1050,7 @@
  }

  /**
-  * Recursively casts structs in case it contains null types.
-  * TODO: Support other complex types like MapType and ArrayType
+  * Recursively casts struct data types in case the source/target type differs.
   */
  private def addCastsToStructs(
      tableName: String,
@@ -1067,6 +1069,8 @@
          val subField = Alias(GetStructField(parent, i, Option(name)), target(i).name)(
            explicitMetadata = Option(metadata))
          addCastsToStructs(tableName, subField, nested, t, allowTypeWidening)
+       // We could also handle MapType within a struct here, but there is a restriction
+       // on deeply nested operations that may result in a max-iterations error
        case o =>
          val field = parent.qualifiedName + "." + name
          val targetName = parent.qualifiedName + "." + target(i).name
@@ -1124,6 +1128,63 @@
    DeltaViewHelper.stripTempViewForMerge(plan, conf)
  }

  /**
   * Recursively casts map data types in case the key/value type differs.
   */
  private def addCastsToMaps(
      tableName: String,
      parent: NamedExpression,
      sourceMapType: MapType,
      targetMapType: MapType,
      allowTypeWidening: Boolean): Expression = {
    val transformedKeys =
      if (sourceMapType.keyType != targetMapType.keyType) {
        // Create a transformation for the keys
        ArrayTransform(MapKeys(parent), {
          val key = NamedLambdaVariable(
            "key", sourceMapType.keyType, nullable = false)
          val targetKeyAttr = AttributeReference(
            "targetKey", targetMapType.keyType, nullable = false)()
          val castedKey =
            addCastToColumn(
              key,
              targetKeyAttr,
              tableName,
              allowTypeWidening
            )
          LambdaFunction(castedKey, Seq(key))
        })
      } else {
        MapKeys(parent)
      }

    val transformedValues =
      if (sourceMapType.valueType != targetMapType.valueType) {
        // Create a transformation for the values
        ArrayTransform(MapValues(parent), {
          val value = NamedLambdaVariable(
            "value", sourceMapType.valueType, sourceMapType.valueContainsNull)
          val targetValueAttr = AttributeReference(
            "targetValue", targetMapType.valueType, sourceMapType.valueContainsNull)()
          val castedValue =
            addCastToColumn(
              value,
              targetValueAttr,
              tableName,
              allowTypeWidening
            )
          LambdaFunction(castedValue, Seq(value))
        })
      } else {
        MapValues(parent)
      }

    // Create new map from transformed keys and values
    MapFromArrays(transformedKeys, transformedValues)
  }

  /**
   * Verify the input plan for a SINGLE streaming query with the following:
   * 1. Schema location must be under checkpoint location, if not lifted by flag
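At the DataFrame API level, the rewrite that addCastsToMaps performs is roughly equivalent to rebuilding a map from its keys and values while casting them. A self-contained, illustrative-only sketch (it widens a map's value type using the public functions API rather than Catalyst expressions; names like MapCastSketch are hypothetical):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object MapCastSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("MapCastSketch")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // A map column whose value type should widen from int to bigint.
    val df = Seq(Map("a" -> 1, "b" -> 2)).toDF("metrics")

    // Rebuild the map: keep the keys, cast the values. This mirrors what
    // addCastsToMaps does with MapKeys/MapValues/ArrayTransform/MapFromArrays
    // at the Catalyst expression level.
    val widened = df.select(
      map_from_arrays(
        map_keys($"metrics"),
        transform(map_values($"metrics"), _.cast("bigint"))
      ).as("metrics")
    )

    widened.printSchema() // metrics: map<string, bigint>
    spark.stop()
  }
}
```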
