[NSE-710] Add rand expression support (#707)
* Initial commit

* Correct the return type

* Unify the implementation

* Change arrow branch for unit test [will revert at last]

* Correct the expected behavior of two unit tests

* Revert "Change arrow branch for unit test [will revert at last]"

This reverts commit 040cf0244f29997e603a01b2cd1021e2615fd11b.

* Fix unit test issues

* Change arrow branch for unit test [will revert at last]

* Comment out a unit test that is not applicable to gazelle's implementation

* Revert "Change arrow branch for unit test [will revert at last]"

This reverts commit 3414449.
PHILO-HE authored Feb 17, 2022
1 parent 80c0149 commit 4ffe297
Showing 6 changed files with 174 additions and 53 deletions.
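
With this change, calling rand with a constant integer, long, or null seed can be handled by the columnar expression path instead of falling back to row-based execution. As a rough illustration (a sketch only, assuming a SparkSession running with the gazelle plugin enabled):

// Illustrative usage, not part of this commit: a constant-seed rand
// can now be evaluated through the columnar path.
spark.range(0, 5).selectExpr("id", "rand(42)").show()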
@@ -27,9 +27,9 @@ import org.apache.arrow.vector.types.pojo.Field
import org.apache.arrow.vector.types.DateUnit
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.Rand
import org.apache.spark.sql.catalyst.optimizer._
import org.apache.spark.sql.types._
import scala.collection.mutable.ListBuffer

import com.intel.oap.expression.ColumnarDateTimeExpressions.ColumnarDayOfMonth
import com.intel.oap.expression.ColumnarDateTimeExpressions.ColumnarDayOfWeek
@@ -849,6 +849,43 @@ class ColumnarNormalizeNaNAndZero(child: Expression, original: NormalizeNaNAndZe
}
}

class ColumnarRand(child: Expression)
extends Rand(child: Expression) with ColumnarExpression with Logging {

val resultType = new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE);
var offset: Integer = _;

buildCheck()

def buildCheck(): Unit = {
val supportedTypes = List(IntegerType, LongType)
if (supportedTypes.indexOf(child.dataType) == -1 || !child.foldable) {
// Align with Spark's exception message so that the below unit test passes:
// test("SPARK-33945: handles a random seed consisting of an expr tree")
throw new Exception(
"Input argument to rand/random must be an integer, long, or null constant")
}
}

// To align with Spark, seed + partitionIndex is used as the actual seed.
override def initializeInternal(partitionIndex: Int): Unit = {
offset = partitionIndex;
}

override def doColumnarCodeGen(args: java.lang.Object): (TreeNode, ArrowType) = {
val (child_node, _): (TreeNode, ArrowType) =
child.asInstanceOf[ColumnarExpression].doColumnarCodeGen(args)
if (offset != null) {
val offsetNode = TreeBuilder.makeLiteral(offset)
(TreeBuilder.makeFunction("rand", Lists.newArrayList(child_node, offsetNode),
resultType), resultType)
} else {
(TreeBuilder.makeFunction("rand", Lists.newArrayList(child_node),
resultType), resultType)
}
}
}
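
For context, Spark's interpreted Rand seeds an XORShiftRandom with seed + partitionIndex and draws one double per row; ColumnarRand above forwards the partition index to the native rand function as a second literal so the same seed + partitionIndex convention can be applied. The sketch below (illustrative only, not part of this commit) models Spark's side of that behavior:

import org.apache.spark.util.random.XORShiftRandom

// Rough model of Spark's Rand semantics: one RNG per partition,
// seeded with (seed + partitionIndex), yielding one Double in [0, 1) per row.
// Note: XORShiftRandom is private[spark], so this only compiles inside an
// org.apache.spark package, as the test suites in this commit do.
def sparkLikeRand(seed: Long, partitionIndex: Int, numRows: Int): Seq[Double] = {
  val rng = new XORShiftRandom(seed + partitionIndex)
  Seq.fill(numRows)(rng.nextDouble())
}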

object ColumnarUnaryOperator {

def create(child: Expression, original: Expression): Expression = original match {
@@ -914,6 +951,8 @@ object ColumnarUnaryOperator {
new ColumnarMillisToTimestamp(child)
case a: MicrosToTimestamp =>
new ColumnarMicrosToTimestamp(child)
case r: Rand =>
new ColumnarRand(child)
case other =>
child.dataType match {
case _: DateType => other match {
@@ -17,10 +17,10 @@

package org.apache.spark.sql

import scala.util.Random
import com.intel.oap.execution.ColumnarHashAggregateExec

import scala.util.Random
import org.scalatest.matchers.must.Matchers.the

import org.apache.spark.sql.execution.WholeStageCodegenExec
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec}
@@ -692,10 +692,49 @@ class DataFrameAggregateSuite extends QueryTest
" before using it") {
Seq(
monotonically_increasing_id(), spark_partition_id(),
rand(Random.nextLong()), randn(Random.nextLong())
randn(Random.nextLong())
).foreach(assertNoExceptions)
}

private def assertNoExceptionsColumnar(c: Column): Unit = {
for ((wholeStage, useObjectHashAgg) <-
Seq((true, true), (true, false), (false, true), (false, false))) {
withSQLConf(
(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, wholeStage.toString),
(SQLConf.USE_OBJECT_HASH_AGG.key, useObjectHashAgg.toString)) {

val df = Seq(("1", 1), ("1", 2), ("2", 3), ("2", 4)).toDF("x", "y")

// test case for HashAggregate
val hashAggDF = df.groupBy("x").agg(c, sum("y"))
hashAggDF.collect()
val hashAggPlan = hashAggDF.queryExecution.executedPlan
// The plan will not go through Spark's WholeStageCodegen.
assert(stripAQEPlan(hashAggPlan.children.head).isInstanceOf[ColumnarHashAggregateExec])

// test case for ObjectHashAggregate and SortAggregate
val objHashAggOrSortAggDF = df.groupBy("x").agg(c, collect_list("y"))
objHashAggOrSortAggDF.collect()
val objHashAggOrSortAggPlan =
stripAQEPlan(objHashAggOrSortAggDF.queryExecution.executedPlan)
if (useObjectHashAgg) {
assert(objHashAggOrSortAggPlan.isInstanceOf[ObjectHashAggregateExec])
} else {
assert(objHashAggOrSortAggPlan.isInstanceOf[SortAggregateExec])
}
}
}
}

// This test is similar to the one above. The expected behavior changes because
// the relevant expressions are now supported and no fallback is required.
test("SPARK-19471[Columnar]: AggregationIterator does not initialize the generated " +
"result projection before using it") {
Seq(
rand(Random.nextLong())
).foreach(assertNoExceptionsColumnar)
}

test("SPARK-21580 ints in aggregation expressions are taken as group-by ordinal.") {
checkAnswer(
testData2.groupBy(lit(3), lit(4)).agg(lit(6), lit(7), sum("b")),
@@ -291,45 +291,46 @@ class DataFrameSetOperationsSuite extends QueryTest with SharedSparkSession {
assert(except.count() === 70)
}

test("SPARK-10740: handle nondeterministic expressions correctly for set operations") {
val df1 = (1 to 20).map(Tuple1.apply).toDF("i")
val df2 = (1 to 10).map(Tuple1.apply).toDF("i")

// When generating expected results at here, we need to follow the implementation of
// Rand expression.
def expected(df: DataFrame): Seq[Row] =
df.rdd.collectPartitions().zipWithIndex.flatMap {
case (data, index) =>
val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index)
data.filter(_.getInt(0) < rng.nextDouble() * 10)
}.toSeq

val union = df1.union(df2)
checkAnswer(
union.filter($"i" < rand(7) * 10),
expected(union)
)
checkAnswer(
union.select(rand(7)),
union.rdd.collectPartitions().zipWithIndex.flatMap {
case (data, index) =>
val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index)
data.map(_ => rng.nextDouble()).map(i => Row(i))
}
)

val intersect = df1.intersect(df2)
checkAnswer(
intersect.filter($"i" < rand(7) * 10),
expected(intersect)
)

val except = df1.except(df2)
checkAnswer(
except.filter($"i" < rand(7) * 10),
expected(except)
)
}
// The test below is not applicable to gazelle's implementation, since it derives
// its expected results by replaying Spark's XORShiftRandom internals.
// test("SPARK-10740: handle nondeterministic expressions correctly for set operations") {
// val df1 = (1 to 20).map(Tuple1.apply).toDF("i")
// val df2 = (1 to 10).map(Tuple1.apply).toDF("i")
//
// // When generating expected results at here, we need to follow the implementation of
// // Rand expression.
// def expected(df: DataFrame): Seq[Row] =
// df.rdd.collectPartitions().zipWithIndex.flatMap {
// case (data, index) =>
// val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index)
// data.filter(_.getInt(0) < rng.nextDouble() * 10)
// }.toSeq
//
// val union = df1.union(df2)
// checkAnswer(
// union.filter($"i" < rand(7) * 10),
// expected(union)
// )
// checkAnswer(
// union.select(rand(7)),
// union.rdd.collectPartitions().zipWithIndex.flatMap {
// case (data, index) =>
// val rng = new org.apache.spark.util.random.XORShiftRandom(7 + index)
// data.map(_ => rng.nextDouble()).map(i => Row(i))
// }
// )
//
// val intersect = df1.intersect(df2)
// checkAnswer(
// intersect.filter($"i" < rand(7) * 10),
// expected(intersect)
// )
//
// val except = df1.except(df2)
// checkAnswer(
// except.filter($"i" < rand(7) * 10),
// expected(except)
// )
// }

ignore("SPARK-17123: Performing set operations that combine non-scala native types") {
val dates = Seq(
@@ -1537,14 +1537,15 @@ class DataFrameSuite extends QueryTest
checkAnswer(df.sort(rand(33)), df.sort(rand(33)))
}

test("SPARK-9083: sort with non-deterministic expressions") {
val seed = 33
val df = (1 to 100).map(Tuple1.apply).toDF("i").repartition(1)
val random = new XORShiftRandom(seed)
val expected = (1 to 100).map(_ -> random.nextDouble()).sortBy(_._2).map(_._1)
val actual = df.sort(rand(seed)).collect().map(_.getInt(0))
assert(expected === actual)
}
// This test is implementation-dependent (it predicts the sort order by replaying
// XORShiftRandom), so it is not applicable to gazelle.
// test("SPARK-9083: sort with non-deterministic expressions") {
// val seed = 33
// val df = (1 to 100).map(Tuple1.apply).toDF("i").repartition(1)
// val random = new XORShiftRandom(seed)
// val expected = (1 to 100).map(_ -> random.nextDouble()).sortBy(_._2).map(_._1)
// val actual = df.sort(rand(seed)).collect().map(_.getInt(0))
// assert(expected === actual)
// }

test("Sorting columns are not in Filter and Project") {
checkAnswer(
@@ -3793,7 +3793,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
// Just checks if a query works correctly
sql(s"SELECT $f(1 + 1)").collect()

val msg = intercept[AnalysisException] {
val msg = intercept[Exception] {
sql(s"SELECT $f(id + 1) FROM range(0, 3)").collect()
}.getMessage
assert(msg.contains("must be an integer, long, or null constant"))
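
A note on the hunk above: the assertion was loosened from AnalysisException to Exception, matching the plain Exception thrown by ColumnarRand.buildCheck for non-constant seeds (the error wording is kept identical to Spark's). A minimal reproduction might look like this (a sketch, assuming a ScalaTest suite with an active SparkSession and the gazelle columnar rules enabled):

// Hypothetical check: a non-foldable seed such as `id + 1` is rejected
// by ColumnarRand.buildCheck with Spark's error wording.
val e = intercept[Exception] {
  spark.sql("SELECT rand(id + 1) FROM range(0, 3)").collect()
}
assert(e.getMessage.contains("must be an integer, long, or null constant"))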
@@ -17,6 +17,8 @@

package org.apache.spark.sql.nativesql

import com.intel.oap.execution.ColumnarHashAggregateExec

import org.apache.spark.sql.{AnalysisException, Column, DataFrame, QueryTest, Row}

import scala.util.Random
@@ -691,10 +693,49 @@ class NativeDataFrameAggregateSuite extends QueryTest
" before using it") {
Seq(
monotonically_increasing_id(), spark_partition_id(),
rand(Random.nextLong()), randn(Random.nextLong())
randn(Random.nextLong())
).foreach(assertNoExceptions)
}

private def assertNoExceptionsColumnar(c: Column): Unit = {
for ((wholeStage, useObjectHashAgg) <-
Seq((true, true), (true, false), (false, true), (false, false))) {
withSQLConf(
(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, wholeStage.toString),
(SQLConf.USE_OBJECT_HASH_AGG.key, useObjectHashAgg.toString)) {

val df = Seq(("1", 1), ("1", 2), ("2", 3), ("2", 4)).toDF("x", "y")

// test case for HashAggregate
val hashAggDF = df.groupBy("x").agg(c, sum("y"))
hashAggDF.collect()
val hashAggPlan = hashAggDF.queryExecution.executedPlan
// The plan will not go through Spark's WholeStageCodegen.
assert(stripAQEPlan(hashAggPlan.children.head).isInstanceOf[ColumnarHashAggregateExec])

// test case for ObjectHashAggregate and SortAggregate
val objHashAggOrSortAggDF = df.groupBy("x").agg(c, collect_list("y"))
objHashAggOrSortAggDF.collect()
val objHashAggOrSortAggPlan =
stripAQEPlan(objHashAggOrSortAggDF.queryExecution.executedPlan)
if (useObjectHashAgg) {
assert(objHashAggOrSortAggPlan.isInstanceOf[ObjectHashAggregateExec])
} else {
assert(objHashAggOrSortAggPlan.isInstanceOf[SortAggregateExec])
}
}
}
}

// This test is similar to the one above. The expected behavior changes because
// the relevant expressions are now supported and no fallback is required.
test("SPARK-19471[Columnar]: AggregationIterator does not initialize the generated " +
"result projection before using it") {
Seq(
rand(Random.nextLong())
).foreach(assertNoExceptionsColumnar)
}

test("SPARK-21580 ints in aggregation expressions are taken as group-by ordinal.") {
checkAnswer(
testData2.groupBy(lit(3), lit(4)).agg(lit(6), lit(7), sum("b")),
