Skip to content

Commit

Permalink
[SPARK-44550][SQL] Enable correctness fixes for `null IN (empty list)…
Browse files Browse the repository at this point in the history
…` under ANSI

### What changes were proposed in this pull request?
Enables the correctness fixes for `null IN (empty list)` expressions by default when ANSI is enabled. Under non-ANSI the old behavior remains the default for now. After soaking for some time under ANSI, we should switch the new behavior to default in both cases.

Prior to this, `null IN (empty list)` incorrectly evaluated to null, when it should evaluate to false. (The reason it should be false is because a IN (b1, b2) is defined as a = b1 OR a = b2, and an empty IN list is treated as an empty OR which is false. This is specified by ANSI SQL.)

Many places in Spark execution (In, InSet, InSubquery) and optimization (OptimizeIn, NullPropagation) implemented this wrong behavior. This is a longstanding correctness issue which has existed since null support for IN expressions was first added to Spark.

See previous PRs where the fixes were implemented: #42007 and #42163.

See [this doc](https://docs.google.com/document/d/1k8AY8oyT-GI04SnP7eXttPDnDj-Ek-c3luF2zL6DPNU/edit) for more information.

### Why are the changes needed?
Fix wrong SQL semantics

### Does this PR introduce _any_ user-facing change?
Yes, fix wrong SQL semantics

### How was this patch tested?
Unit tests

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #43068 from jchen5/null-in-empty-enable.

Authored-by: Jack Chen <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
jchen5 authored and cloud-fan committed Sep 26, 2023
1 parent c1b12bd commit 7741fe7
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate {

final override val nodePatterns: Seq[TreePattern] = Seq(IN)
private val legacyNullInEmptyBehavior =
SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
SQLConf.get.legacyNullInEmptyBehavior

override lazy val canonicalized: Expression = {
val basic = withNewChildren(children.map(_.canonicalized)).asInstanceOf[In]
Expand Down Expand Up @@ -626,7 +626,7 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with

final override val nodePatterns: Seq[TreePattern] = Seq(INSET)
private val legacyNullInEmptyBehavior =
SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
SQLConf.get.legacyNullInEmptyBehavior

override def eval(input: InternalRow): Any = {
if (hset.isEmpty && !legacyNullInEmptyBehavior) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ object OptimizeIn extends Rule[LogicalPlan] {
case In(v, list) if list.isEmpty =>
// IN (empty list) is always false under current behavior.
// Under legacy behavior it's null if the left side is null, otherwise false (SPARK-44550).
if (!SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)) {
if (!SQLConf.get.legacyNullInEmptyBehavior) {
FalseLiteral
} else {
If(IsNotNull(v), FalseLiteral, Literal(null, BooleanType))
Expand Down Expand Up @@ -845,20 +845,20 @@ object NullPropagation extends Rule[LogicalPlan] {

// If the list is empty, transform the In expression to false literal.
case In(_, list)
if list.isEmpty && !SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR) =>
if list.isEmpty && !SQLConf.get.legacyNullInEmptyBehavior =>
Literal.create(false, BooleanType)
// If the value expression is NULL (and the list is non-empty), then transform the
// In expression to null literal.
// If the legacy flag is set, then it becomes null even if the list is empty (which is
// incorrect legacy behavior)
case In(Literal(null, _), list)
if list.nonEmpty || SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
if list.nonEmpty || SQLConf.get.legacyNullInEmptyBehavior
=> Literal.create(null, BooleanType)
case InSubquery(Seq(Literal(null, _)), _)
if SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR) =>
if SQLConf.get.legacyNullInEmptyBehavior =>
Literal.create(null, BooleanType)
case InSubquery(Seq(Literal(null, _)), ListQuery(sub, _, _, _, conditions, _))
if !SQLConf.get.getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR)
if !SQLConf.get.legacyNullInEmptyBehavior
&& conditions.isEmpty =>
If(Exists(sub), Literal(null, BooleanType), FalseLiteral)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4361,6 +4361,8 @@ object SQLConf {
.booleanConf
.createWithDefault(false)

// Default is false (new, correct behavior) when ANSI is on, true (legacy, incorrect behavior)
// when ANSI is off. See legacyNullInEmptyBehavior.
val LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR =
buildConf("spark.sql.legacy.nullInEmptyListBehavior")
.internal()
Expand All @@ -4370,7 +4372,7 @@ object SQLConf {
"incorrectly evaluates to null in the legacy behavior.")
.version("3.5.0")
.booleanConf
.createWithDefault(true)
.createOptional

val ERROR_MESSAGE_FORMAT = buildConf("spark.sql.error.messageFormat")
.doc("When PRETTY, the error message consists of textual representation of error class, " +
Expand Down Expand Up @@ -5185,6 +5187,10 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
getConf(SQLConf.LEGACY_SIZE_OF_NULL) && !getConf(ANSI_ENABLED)
}

def legacyNullInEmptyBehavior: Boolean = {
getConf(SQLConf.LEGACY_NULL_IN_EMPTY_LIST_BEHAVIOR).getOrElse(!ansiEnabled)
}

def isReplEagerEvalEnabled: Boolean = getConf(SQLConf.REPL_EAGER_EVAL_ENABLED)

def replEagerEvalMaxNumRows: Int = getConf(SQLConf.REPL_EAGER_EVAL_MAX_NUM_ROWS)
Expand Down
19 changes: 19 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/EmptyInSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,23 @@ with SharedSparkSession {
}
}
}

test("IN empty list behavior conf defaults") {
// Currently the fixed behavior is enabled when ANSI is on, and the legacy behavior when
// ANSI is off.
Seq(true, false).foreach { ansiEnabled =>
withSQLConf(SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) {
val legacyNullInBehavior = !ansiEnabled
assert(SQLConf.get.legacyNullInEmptyBehavior == legacyNullInBehavior)

val emptylist = Seq.empty[Literal]
val df = t.select(col("a"), col("a").isin(emptylist: _*))
val expectedResultForNullInEmpty =
if (legacyNullInBehavior) null else false
checkAnswer(
df,
Row(1, false) :: Row(null, expectedResultForNullInEmpty) :: Nil)
}
}
}
}

0 comments on commit 7741fe7

Please sign in to comment.