From 30858355a6fbf73d667a9fc80b45878daebee31d Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 5 Nov 2024 11:59:05 +0100 Subject: [PATCH 1/3] Fix incorrect `... LIKE '%'` simplification `expr LIKE '%'` was previously simplified to `true`, but the expression returns `NULL` when `expr` is null. The conversion was conditional on `!is_null(expr)` which means "is not always true, i.e. is not a null literal". This commit adds correct simplification logic. It additionally expands the rule coverage to include string view (Utf8View) and large string (LargeUtf8). This allows writing shared test cases even despite `utf8_view LIKE '%'` returning incorrect results at execution time (tracked by https://github.com/apache/datafusion/issues/12637). I.e. the simplification masks the bug for cases where pattern is statically known. --- .../simplify_expressions/expr_simplifier.rs | 63 ++++++++++++++----- .../sqllogictest/test_files/string/string.slt | 8 +++ .../test_files/string/string_query.slt.part | 52 +++++++++++++++ 3 files changed, 109 insertions(+), 14 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 40be1f85391d..fbab39fb3f3a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1476,13 +1476,28 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { negated, escape_char: _, case_insensitive: _, - }) if !is_null(&expr) - && matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%" - ) => + }) if matches!( + pattern.as_ref(), + Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%" + ) || matches!( + pattern.as_ref(), + Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%" + ) || matches!( + pattern.as_ref(), + Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%" + ) => { - Transformed::yes(lit(!negated)) + // exp LIKE '%' is + // - when exp is not NULL, it's true + // - when exp is NULL, it's NULL + // exp NOT LIKE '%' is + // - when exp is not NULL, it's false + // - when exp is NULL, it's NULL + Transformed::yes(Expr::Case(Case { + expr: Some(Box::new(Expr::IsNotNull(expr))), + when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))], + else_expr: None, + })) } // a is not null/unknown --> true (if a is not nullable) @@ -2777,10 +2792,22 @@ mod tests { assert_no_change(regex_match(col("c1"), lit("f_o"))); // empty cases - assert_change(regex_match(col("c1"), lit("")), lit(true)); - assert_change(regex_not_match(col("c1"), lit("")), lit(false)); - assert_change(regex_imatch(col("c1"), lit("")), lit(true)); - assert_change(regex_not_imatch(col("c1"), lit("")), lit(false)); + assert_change( + regex_match(col("c1"), lit("")), + if_not_null(col("c1"), true), + ); + assert_change( + regex_not_match(col("c1"), lit("")), + if_not_null(col("c1"), false), + ); + assert_change( + regex_imatch(col("c1"), lit("")), + if_not_null(col("c1"), true), + ); + assert_change( + regex_not_imatch(col("c1"), lit("")), + if_not_null(col("c1"), false), + ); // single character assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%")); @@ -3608,16 +3635,16 @@ mod tests { fn test_like_and_ilke() { // test non-null values let expr = like(col("c1"), "%"); - assert_eq!(simplify(expr), lit(true)); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); let expr = not_like(col("c1"), "%"); - assert_eq!(simplify(expr), lit(false)); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); let expr = ilike(col("c1"), "%"); - assert_eq!(simplify(expr), lit(true)); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); let expr = not_ilike(col("c1"), "%"); - assert_eq!(simplify(expr), lit(false)); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); // test null values let null = lit(ScalarValue::Utf8(None)); @@ -4043,4 +4070,12 @@ mod tests { ); } } + + fn if_not_null(expr: Expr, then: bool) -> Expr { + Expr::Case(Case { + expr: Some(expr.is_not_null().into()), + when_then_expr: vec![(lit(true).into(), lit(then).into())], + else_expr: None, + }) + } } diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 9e97712b6871..68111ba34696 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -34,6 +34,14 @@ statement ok create table test_substr as select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; +query BBB +SELECT + NULL LIKE '%', + '' LIKE '%', + 'a' LIKE '%' +---- +NULL true true + # TODO: move it back to `string_query.slt.part` after fixing the issue # see detail: https://github.com/apache/datafusion/issues/12637 # Test pattern with wildcard characters diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index c4975b5b8c8d..57fb09bca9e4 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -873,6 +873,58 @@ NULL NULL NULL NULL NULL #Raphael datafusionДатаФусион false false false false #NULL NULL NULL NULL NULL NULL +# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections +query TTBB +SELECT ascii_1, unicode_1, + ascii_1 LIKE '%' AS ascii_1_like_percent, + unicode_1 LIKE '%' AS unicode_1_like_percent + -- ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + -- unicode_1 LIKE '%%' AS unicode_1_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 +FROM test_basic_operator +---- +Andrew datafusion📊🔥 true true +Xiangpeng datafusion数据融合 true true +Raphael datafusionДатаФусион true true +under_score un iść core true true +percent pan Tadeusz ma iść w kąt true true +(empty) (empty) true true +NULL NULL NULL NULL +NULL NULL NULL NULL + +# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections +query TTBB +SELECT ascii_1, unicode_1, + ascii_1 NOT LIKE '%' AS ascii_1_not_like_percent, + unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent + -- ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + -- unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 +FROM test_basic_operator +---- +Andrew datafusion📊🔥 false false +Xiangpeng datafusion数据融合 false false +Raphael datafusionДатаФусион false false +under_score un iść core false false +percent pan Tadeusz ma iść w kąt false false +(empty) (empty) false false +NULL NULL NULL NULL +NULL NULL NULL NULL + +query T +SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 LIKE '%' +---- +Andrew +Xiangpeng +Raphael +under_score +percent +(empty) + +# TODO: move it back to `string_query.slt.part` after fixing the issue +# see detail: https://github.com/apache/datafusion/issues/12637 +query T +SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 NOT LIKE '%' +---- + # Test pattern without wildcard characters query TTBBBB select ascii_1, unicode_1, From 4d7896a8dcfbc872df9343c208cb5d566c37ba72 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 5 Nov 2024 12:40:05 +0100 Subject: [PATCH 2/3] fixup! Fix incorrect `... LIKE '%'` simplification --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index fbab39fb3f3a..e0df6a3a68ce 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -3633,7 +3633,7 @@ mod tests { #[test] fn test_like_and_ilke() { - // test non-null values + // LIKE '%' let expr = like(col("c1"), "%"); assert_eq!(simplify(expr), if_not_null(col("c1"), true)); @@ -3646,7 +3646,7 @@ mod tests { let expr = not_ilike(col("c1"), "%"); assert_eq!(simplify(expr), if_not_null(col("c1"), false)); - // test null values + // null_constant LIKE '%' let null = lit(ScalarValue::Utf8(None)); let expr = like(null.clone(), "%"); assert_eq!(simplify(expr), lit_bool_null()); From f96698c0c625fad9df35405837e549072c7a6c03 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 5 Nov 2024 16:20:23 +0100 Subject: [PATCH 3/3] fix tests (re review comments) --- datafusion/sqllogictest/test_files/string/string.slt | 8 -------- .../sqllogictest/test_files/string/string_literal.slt | 8 ++++++++ .../sqllogictest/test_files/string/string_query.slt.part | 2 -- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 68111ba34696..9e97712b6871 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -34,14 +34,6 @@ statement ok create table test_substr as select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; -query BBB -SELECT - NULL LIKE '%', - '' LIKE '%', - 'a' LIKE '%' ----- -NULL true true - # TODO: move it back to `string_query.slt.part` after fixing the issue # see detail: https://github.com/apache/datafusion/issues/12637 # Test pattern with wildcard characters diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index 80bd7fc59c00..6d12d8377394 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -821,3 +821,11 @@ query TT select ' ', '|' ---- | + +query BBB +SELECT + NULL LIKE '%', + '' LIKE '%', + 'a' LIKE '%' +---- +NULL true true diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 57fb09bca9e4..4960a02540be 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -919,8 +919,6 @@ under_score percent (empty) -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12637 query T SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 NOT LIKE '%' ----