From 56084cc2b9bd246ed84dc009d2d5b2b426d0f595 Mon Sep 17 00:00:00 2001 From: William Benton Date: Wed, 18 Jun 2014 15:20:41 -0500 Subject: [PATCH 1/6] Add support for HAVING clauses in Hive queries. --- .../org/apache/spark/sql/hive/HiveQl.scala | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index ec653efcc8c58..67ab17dd1fc20 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -480,6 +480,7 @@ private[hive] object HiveQl { whereClause :: groupByClause :: orderByClause :: + havingClause :: sortByClause :: clusterByClause :: distributeByClause :: @@ -494,6 +495,7 @@ private[hive] object HiveQl { "TOK_WHERE", "TOK_GROUPBY", "TOK_ORDERBY", + "TOK_HAVING", "TOK_SORTBY", "TOK_CLUSTERBY", "TOK_DISTRIBUTEBY", @@ -576,21 +578,26 @@ private[hive] object HiveQl { val withDistinct = if (selectDistinctClause.isDefined) Distinct(withProject) else withProject + val withHaving = havingClause.map { h => + val Seq(havingExpr) = h.getChildren.toSeq + Filter(nodeToExpr(havingExpr), withDistinct) + }.getOrElse(withDistinct) + val withSort = (orderByClause, sortByClause, distributeByClause, clusterByClause) match { case (Some(totalOrdering), None, None, None) => - Sort(totalOrdering.getChildren.map(nodeToSortOrder), withDistinct) + Sort(totalOrdering.getChildren.map(nodeToSortOrder), withHaving) case (None, Some(perPartitionOrdering), None, None) => - SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder), withDistinct) + SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder), withHaving) case (None, None, Some(partitionExprs), None) => - Repartition(partitionExprs.getChildren.map(nodeToExpr), withDistinct) + Repartition(partitionExprs.getChildren.map(nodeToExpr), withHaving) case (None, Some(perPartitionOrdering), Some(partitionExprs), None) => SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder), - Repartition(partitionExprs.getChildren.map(nodeToExpr), withDistinct)) + Repartition(partitionExprs.getChildren.map(nodeToExpr), withHaving)) case (None, None, None, Some(clusterExprs)) => SortPartitions(clusterExprs.getChildren.map(nodeToExpr).map(SortOrder(_, Ascending)), - Repartition(clusterExprs.getChildren.map(nodeToExpr), withDistinct)) - case (None, None, None, None) => withDistinct + Repartition(clusterExprs.getChildren.map(nodeToExpr), withHaving)) + case (None, None, None, None) => withHaving case _ => sys.error("Unsupported set of ordering / distribution clauses.") } From 942428e090b2dd45f51d98a501dccfb971ca8910 Mon Sep 17 00:00:00 2001 From: William Benton Date: Thu, 19 Jun 2014 13:59:54 -0500 Subject: [PATCH 2/6] Added test coverage for SPARK-2180. This is a simple test for HAVING clauses. --- .../sql/hive/execution/HiveQuerySuite.scala | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 9f5cf282f7c48..577a4436b4308 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -224,6 +224,21 @@ class HiveQuerySuite extends HiveComparisonTest { TestHive.reset() } + test("SPARK-2180: HAVING support in GROUP BY clauses") { + val fixture = List(("foo", 2), ("bar", 1), ("foo", 4), ("bar", 3)) + .zipWithIndex.map {case Pair(Pair(value, attr), key) => HavingRow(key, value, attr)} + + TestHive.sparkContext.parallelize(fixture).registerAsTable("having_test") + + val results = hql("SELECT value, max(attr) AS attr FROM having_test GROUP BY value HAVING attr > 3") + .collect() + .map(x => Pair(x.getString(0), x.getInt(1))) + + assert(results === Array(Pair("foo", 4))) + + TestHive.reset() + } + test("Query Hive native command execution result") { val tableName = "test_native_commands" @@ -441,3 +456,6 @@ class HiveQuerySuite extends HiveComparisonTest { // since they modify /clear stuff. } + +// for SPARK-2180 test +case class HavingRow(key: Int, value: String, attr: Int) From b880bef264431f6ab71032b6bd8b63a521f4cb84 Mon Sep 17 00:00:00 2001 From: William Benton Date: Thu, 19 Jun 2014 15:59:08 -0500 Subject: [PATCH 3/6] Added semantic error for HAVING without GROUP BY --- .../src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 67ab17dd1fc20..fe05757753b1f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -204,6 +204,9 @@ private[hive] object HiveQl { class ParseException(sql: String, cause: Throwable) extends Exception(s"Failed to parse: $sql", cause) + class SemanticException(msg: String) + extends Exception(msg) + /** * Returns the AST for the given SQL string. */ @@ -579,6 +582,11 @@ private[hive] object HiveQl { if (selectDistinctClause.isDefined) Distinct(withProject) else withProject val withHaving = havingClause.map { h => + + if (groupByClause == None) { + throw new SemanticException("Error in semantic analysis: HAVING specified without GROUP BY") + } + val Seq(havingExpr) = h.getChildren.toSeq Filter(nodeToExpr(havingExpr), withDistinct) }.getOrElse(withDistinct) From 18387f108bcd7c754979db0207329d5437b86136 Mon Sep 17 00:00:00 2001 From: William Benton Date: Thu, 19 Jun 2014 16:13:09 -0500 Subject: [PATCH 4/6] Add test for HAVING without GROUP BY --- .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 577a4436b4308..92d7027e89371 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -224,7 +224,7 @@ class HiveQuerySuite extends HiveComparisonTest { TestHive.reset() } - test("SPARK-2180: HAVING support in GROUP BY clauses") { + test("SPARK-2180: HAVING support in GROUP BY clauses (positive)") { val fixture = List(("foo", 2), ("bar", 1), ("foo", 4), ("bar", 3)) .zipWithIndex.map {case Pair(Pair(value, attr), key) => HavingRow(key, value, attr)} @@ -239,6 +239,12 @@ class HiveQuerySuite extends HiveComparisonTest { TestHive.reset() } + test("SPARK-2180: HAVING without GROUP BY raises exception") { + intercept[Exception] { + hql("SELECT value, attr FROM having_test HAVING attr > 3") + } + } + test("Query Hive native command execution result") { val tableName = "test_native_commands" From 83f1340c70e4588fe2df3492138877c3453f64b3 Mon Sep 17 00:00:00 2001 From: William Benton Date: Thu, 19 Jun 2014 16:37:32 -0500 Subject: [PATCH 5/6] scalastyle fixes --- .../src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index fe05757753b1f..6901acff5c644 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -205,7 +205,7 @@ private[hive] object HiveQl { extends Exception(s"Failed to parse: $sql", cause) class SemanticException(msg: String) - extends Exception(msg) + extends Exception(s"Error in semantic analysis: $msg") /** * Returns the AST for the given SQL string. @@ -584,7 +584,7 @@ private[hive] object HiveQl { val withHaving = havingClause.map { h => if (groupByClause == None) { - throw new SemanticException("Error in semantic analysis: HAVING specified without GROUP BY") + throw new SemanticException("HAVING specified without GROUP BY") } val Seq(havingExpr) = h.getChildren.toSeq From 3bbaf266c87f599c029560ebce24246513c5eec7 Mon Sep 17 00:00:00 2001 From: William Benton Date: Fri, 20 Jun 2014 08:13:24 -0500 Subject: [PATCH 6/6] Added casts to HAVING expressions --- .../src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 7 +++++-- .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 7 ++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 6901acff5c644..c69e3dba6b467 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -587,8 +587,11 @@ private[hive] object HiveQl { throw new SemanticException("HAVING specified without GROUP BY") } - val Seq(havingExpr) = h.getChildren.toSeq - Filter(nodeToExpr(havingExpr), withDistinct) + val havingExpr = h.getChildren.toSeq match { + case Seq(hexpr) => nodeToExpr(hexpr) + } + + Filter(Cast(havingExpr, BooleanType), withDistinct) }.getOrElse(withDistinct) val withSort = diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 92d7027e89371..80185098bf24f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -230,7 +230,8 @@ class HiveQuerySuite extends HiveComparisonTest { TestHive.sparkContext.parallelize(fixture).registerAsTable("having_test") - val results = hql("SELECT value, max(attr) AS attr FROM having_test GROUP BY value HAVING attr > 3") + val results = + hql("SELECT value, max(attr) AS attr FROM having_test GROUP BY value HAVING attr > 3") .collect() .map(x => Pair(x.getString(0), x.getInt(1))) @@ -244,6 +245,10 @@ class HiveQuerySuite extends HiveComparisonTest { hql("SELECT value, attr FROM having_test HAVING attr > 3") } } + + test("SPARK-2180: HAVING with non-boolean clause raises no exceptions") { + val results = hql("select key, count(*) c from src group by key having c").collect() + } test("Query Hive native command execution result") { val tableName = "test_native_commands"