[SPARK-25044][SQL] (take 2) Address translation of LMF closure primitive args to Object in Scala 2.12 #22259

Status: Closed (wants to merge 1 commit)
project/MimaExcludes.scala (5 additions, 0 deletions)

@@ -36,6 +36,11 @@ object MimaExcludes {
 
   // Exclude rules for 2.4.x
   lazy val v24excludes = v23excludes ++ Seq(
+    // [SPARK-25044] Address translation of LMF closure primitive args to Object in Scala 2.12
+    ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.sql.expressions.UserDefinedFunction$"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.expressions.UserDefinedFunction.apply"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.expressions.UserDefinedFunction.copy"),
+
     // [SPARK-24296][CORE] Replicate large blocks as a stream.
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.network.netty.NettyBlockRpcServer.this"),
     // [SPARK-23528] Add numIter to ClusteringSummary
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala

@@ -932,15 +932,6 @@ trait ScalaReflection {
     tpe.dealias.erasure.typeSymbol.asClass.fullName
   }
 
-  /**
-   * Returns classes of input parameters of scala function object.
-   */
-  def getParameterTypes(func: AnyRef): Seq[Class[_]] = {
-    val methods = func.getClass.getMethods.filter(m => m.getName == "apply" && !m.isBridge)
-    assert(methods.length == 1)
-    methods.head.getParameterTypes
-  }
-
   /**
    * Returns the parameter names and types for the primary constructor of this type.
    *
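
Context for this removal (not part of the diff): under Scala 2.12, closures are compiled through LambdaMetafactory (LMF), so the reflective trick above can no longer reliably recover primitive parameter types. A minimal, illustrative sketch of the failure mode, runnable standalone on both Scala versions:

// Illustrative sketch, not from the PR: the reflection-based approach the
// removed getParameterTypes relied on. On Scala 2.11 a closure compiles to an
// anonymous Function class whose non-bridge `apply` keeps primitive parameter
// types; on Scala 2.12 the closure is instantiated via LambdaMetafactory and
// the `apply` found here can report `Object` for primitive arguments.
object LmfErasureDemo {
  def parameterTypes(func: AnyRef): Seq[Class[_]] = {
    val applyMethods = func.getClass.getMethods
      .filter(m => m.getName == "apply" && !m.isBridge)
    applyMethods.head.getParameterTypes.toSeq
  }

  def main(args: Array[String]): Unit = {
    val f = (i: Int, j: Long) => "x"
    // Scala 2.11: List(int, long)
    // Scala 2.12: List(class java.lang.Object, class java.lang.Object)
    println(parameterTypes(f).toList)
  }
}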
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

@@ -2149,28 +2149,34 @@ class Analyzer(
 
       case p => p transformExpressionsUp {
 
-        case udf @ ScalaUDF(func, _, inputs, _, _, _, _) =>
-          val parameterTypes = ScalaReflection.getParameterTypes(func)
-          assert(parameterTypes.length == inputs.length)
-
-          // TODO: skip null handling for not-nullable primitive inputs after we can completely
-          // trust the `nullable` information.
-          // (cls, expr) => cls.isPrimitive && expr.nullable
-          val needsNullCheck = (cls: Class[_], expr: Expression) =>
-            cls.isPrimitive && !expr.isInstanceOf[KnownNotNull]
-          val inputsNullCheck = parameterTypes.zip(inputs)
-            .filter { case (cls, expr) => needsNullCheck(cls, expr) }
-            .map { case (_, expr) => IsNull(expr) }
-            .reduceLeftOption[Expression]((e1, e2) => Or(e1, e2))
-          // Once we add an `If` check above the udf, it is safe to mark those checked inputs
-          // as not nullable (i.e., wrap them with `KnownNotNull`), because the null-returning
-          // branch of `If` will be called if any of these checked inputs is null. Thus we can
-          // prevent this rule from being applied repeatedly.
-          val newInputs = parameterTypes.zip(inputs).map{ case (cls, expr) =>
-            if (needsNullCheck(cls, expr)) KnownNotNull(expr) else expr }
-          inputsNullCheck
-            .map(If(_, Literal.create(null, udf.dataType), udf.copy(children = newInputs)))
-            .getOrElse(udf)
+        case udf@ScalaUDF(func, _, inputs, _, _, _, _, nullableTypes) =>
+          if (nullableTypes.isEmpty) {
+            // If no nullability info is available, do nothing. No fields will be specially
+            // checked for null in the plan. If nullability info is incorrect, the results
+            // of the UDF could be wrong.
+            udf
+          } else {
+            // Otherwise, add special handling of null for fields that can't accept null.
+            // The result of operations like this, when passed null, is generally to return null.
+            assert(nullableTypes.length == inputs.length)
+
+            // TODO: skip null handling for not-nullable primitive inputs after we can completely
+            // trust the `nullable` information.
+            val inputsNullCheck = nullableTypes.zip(inputs)
+              .filter { case (nullable, _) => !nullable }
+              .map { case (_, expr) => IsNull(expr) }
+              .reduceLeftOption[Expression]((e1, e2) => Or(e1, e2))
+            // Once we add an `If` check above the udf, it is safe to mark those checked inputs
+            // as not nullable (i.e., wrap them with `KnownNotNull`), because the null-returning
+            // branch of `If` will be called if any of these checked inputs is null. Thus we can
+            // prevent this rule from being applied repeatedly.
+            val newInputs = nullableTypes.zip(inputs).map { case (nullable, expr) =>
+              if (nullable) expr else KnownNotNull(expr)
+            }
+            inputsNullCheck
+              .map(If(_, Literal.create(null, udf.dataType), udf.copy(children = newInputs)))
+              .getOrElse(udf)
+          }
       }
     }
   }

Review thread on the new `case udf@ScalaUDF(...)` line:

Contributor: nit: can we restore the spaces as in the original?

Contributor: ah missed it. We can clean it up later.

Member Author: I might be missing it - what is the space issue? There's an additional level of indent because of the if statement.

Contributor: it is in udf@ScalaUDF, which should have been udf @ ScalaUDF.

Member Author: Oh right. Yeah, I didn't mean to change that. It's minor enough to leave, I think (or else standardize across the code).
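
To make the rule's effect concrete, here is a small sketch adapted from the AnalysisSuite changes further down (the attribute references are assumed; run e.g. in a REPL with catalyst on the classpath):

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

// Assumed inputs, mirroring the test suite below.
val string = AttributeReference("s", StringType)()
val double = AttributeReference("d", DoubleType)()

val udf2 = ScalaUDF((s: String, d: Double) => "x", StringType, string :: double :: Nil,
  nullableTypes = true :: false :: Nil)

// HandleNullInputsForUDF only guards the non-nullable (primitive) input `d`;
// the analyzed expression is equivalent to:
//   If(IsNull(double), Literal.create(null, StringType),
//      udf2.copy(children = string :: KnownNotNull(double) :: Nil))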
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala

@@ -39,6 +39,7 @@ import org.apache.spark.sql.types.DataType
  * @param nullable True if the UDF can return null value.
  * @param udfDeterministic True if the UDF is deterministic. Deterministic UDF returns same result
  *                         each time it is invoked with a particular input.
+ * @param nullableTypes which of the inputTypes are nullable (i.e. not primitive)
  */
 case class ScalaUDF(
     function: AnyRef,
@@ -47,7 +48,8 @@ case class ScalaUDF(
     inputTypes: Seq[DataType] = Nil,
     udfName: Option[String] = None,
     nullable: Boolean = true,
-    udfDeterministic: Boolean = true)
+    udfDeterministic: Boolean = true,
+    nullableTypes: Seq[Boolean] = Nil)
   extends Expression with ImplicitCastInputTypes with NonSQLExpression with UserDefinedExpression {
 
   // The constructor for SPARK 2.1 and 2.2
@@ -58,7 +60,8 @@ case class ScalaUDF(
       inputTypes: Seq[DataType],
       udfName: Option[String]) = {
     this(
-      function, dataType, children, inputTypes, udfName, nullable = true, udfDeterministic = true)
+      function, dataType, children, inputTypes, udfName, nullable = true,
+      udfDeterministic = true, nullableTypes = Nil)
   }
 
   override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic)

Review thread on the new `nullableTypes` parameter:

Member: Using Nil as the default value is dangerous. We don't even have an assert to ensure it is set; we could easily miss setting it with the right values.

Contributor: We put the assert in the rule HandleNullInputsForUDF. Can we merge inputTypes and nullableTypes here so that we don't need to worry about it any more? cc @srowen

Member Author: The logic here was again that we wanted to avoid changing the binary signature. I know catalyst is effectively private to Spark, but this wasn't marked specifically private; I wondered if it would actually affect callers? If not, we can go back and merge it.

Nil is just an empty list; I don't think it's dangerous, and it is used above in inputTypes. It is not always set, because it's not always possible to infer the schema, let alone nullability.

Contributor: I think the problem is more about the way we handle nullableTypes when it is not specified, as in https://github.com/apache/spark/pull/22259/files#diff-57b3d87be744b7d79a9beacf8e5e5eb2R2157. The test failure in https://github.com/apache/spark/pull/21851/files#diff-e8dddba2915a147970654aa93bee30a7R344 would have been exposed if the nullableTypes had been updated in this PR. So I would say that logically this parameter is required, but right now it is declared optional. In this case, things went wrong when nullableTypes was left unspecified, and this could happen not only in tests but in "source" too. I suggest we move this parameter up, right after inputTypes, so it can get the attention it needs.

Member Author: I see, you are saying that some UDF needed to declare nullable types but didn't? I made the param optional to try to make 'migration' easier and avoid changing the signature much. But the test you point to, doesn't it pass? Are you saying it should not pass?

Member Author: Yeah, I could see an argument that it need not block release. The functionality works as intended, at least. Would you change it again in 2.4.1? If not, then we decide to just keep this behavior. Let's at least get this in if there is a new RC.

Contributor: I think it's mostly about maintainability. We should definitely update the code as @maryannxue said, so that ScalaUDF is easier to use and not that error-prone. I feel we don't need to backport it, as it's basically a code refactor.

Member: @maryannxue Please submit a PR to make the parameter nullableTypes required.

Member Author: Yeah, looks like we should just make these changes after all, and for 2.4, as we need another RC.
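
For callers constructing a ScalaUDF, nullability can be derived statically from the function's TypeTags instead of from runtime reflection over the closure. A hedged sketch (the helper name is ours; the PR wires the equivalent through UserDefinedFunction):

import scala.reflect.runtime.universe.TypeTag
import org.apache.spark.sql.catalyst.ScalaReflection

// Illustrative helper, not the PR's exact code: schemaFor reports
// nullable = false for primitives such as Int and Double, and
// nullable = true for reference types like String.
def nullableOf[T: TypeTag]: Boolean = ScalaReflection.schemaFor[T].nullable

// e.g. for a (String, Double) => String UDF:
// nullableTypes = nullableOf[String] :: nullableOf[Double] :: Nil  // true :: false :: Nil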
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala

@@ -261,23 +261,6 @@ class ScalaReflectionSuite extends SparkFunSuite {
     }
   }
 
-  test("get parameter type from a function object") {
-    val primitiveFunc = (i: Int, j: Long) => "x"
-    val primitiveTypes = getParameterTypes(primitiveFunc)
-    assert(primitiveTypes.forall(_.isPrimitive))
-    assert(primitiveTypes === Seq(classOf[Int], classOf[Long]))
-
-    val boxedFunc = (i: java.lang.Integer, j: java.lang.Long) => "x"
-    val boxedTypes = getParameterTypes(boxedFunc)
-    assert(boxedTypes.forall(!_.isPrimitive))
-    assert(boxedTypes === Seq(classOf[java.lang.Integer], classOf[java.lang.Long]))
-
-    val anyFunc = (i: Any, j: AnyRef) => "x"
-    val anyTypes = getParameterTypes(anyFunc)
-    assert(anyTypes.forall(!_.isPrimitive))
-    assert(anyTypes === Seq(classOf[java.lang.Object], classOf[java.lang.Object]))
-  }
-
   test("SPARK-15062: Get correct serializer for List[_]") {
     val list = List(1, 2, 3)
     val serializer = serializerFor[List[Int]](BoundReference(
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala

@@ -317,13 +317,15 @@ class AnalysisSuite extends AnalysisTest with Matchers {
     checkUDF(udf1, expected1)
 
     // only primitive parameter needs special null handling
-    val udf2 = ScalaUDF((s: String, d: Double) => "x", StringType, string :: double :: Nil)
+    val udf2 = ScalaUDF((s: String, d: Double) => "x", StringType, string :: double :: Nil,
+      nullableTypes = true :: false :: Nil)
     val expected2 =
       If(IsNull(double), nullResult, udf2.copy(children = string :: KnownNotNull(double) :: Nil))
     checkUDF(udf2, expected2)
 
     // special null handling should apply to all primitive parameters
-    val udf3 = ScalaUDF((s: Short, d: Double) => "x", StringType, short :: double :: Nil)
+    val udf3 = ScalaUDF((s: Short, d: Double) => "x", StringType, short :: double :: Nil,
+      nullableTypes = false :: false :: Nil)
     val expected3 = If(
       IsNull(short) || IsNull(double),
       nullResult,
@@ -335,7 +337,8 @@ class AnalysisSuite extends AnalysisTest with Matchers {
     val udf4 = ScalaUDF(
       (s: Short, d: Double) => "x",
       StringType,
-      short :: double.withNullability(false) :: Nil)
+      short :: double.withNullability(false) :: Nil,
+      nullableTypes = false :: false :: Nil)
     val expected4 = If(
       IsNull(short),
       nullResult,
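
Finally, a hedged end-to-end sketch of the user-visible behavior these tests protect (assumes a local SparkSession bound to `spark`; the exact output column name may differ):

import org.apache.spark.sql.functions.udf
import spark.implicits._

// A UDF over a primitive Double: with the null check in place, a null input
// short-circuits to null instead of being unboxed (which would yield 0.0).
val plusOne = udf((d: Double) => d + 1.0)
Seq(Some(1.0), None).toDF("d").select(plusOne($"d")).show()
// Expected (approximately):
// |   2.0|
// |  null|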