From 88715131ff367b8eaa863598f6ff11cbe67be64d Mon Sep 17 00:00:00 2001 From: blaginin Date: Mon, 30 Sep 2024 20:52:05 +0100 Subject: [PATCH 1/3] Fix Regex signature types --- datafusion/functions/src/regex/regexplike.rs | 4 ++-- datafusion/functions/src/regex/regexpmatch.rs | 6 ++--- datafusion/sqllogictest/test_files/regexp.slt | 23 +++++++++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 20029ba005c4..8cd26a824acc 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -48,9 +48,9 @@ impl RegexpLikeFunc { signature: Signature::one_of( vec![ Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8]), + Exact(vec![LargeUtf8, LargeUtf8]), Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8, Utf8]), + Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index bf40eff11d30..498b591620ee 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -54,9 +54,9 @@ impl RegexpMatchFunc { // If that fails, it proceeds to `(LargeUtf8, Utf8)`. // TODO: Native support Utf8View for regexp_match. 
Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8]), + Exact(vec![LargeUtf8, LargeUtf8]), Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8, Utf8]), + Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), ], Volatility::Immutable, ), @@ -131,7 +131,7 @@ pub fn regexp_match(args: &[ArrayRef]) -> Result { let flags = as_generic_string_array::(&args[2])?; if flags.iter().any(|s| s == Some("g")) { - return plan_err!("regexp_match() does not support the \"global\" option") + return plan_err!("regexp_match() does not support the \"global\" option"); } regexp::regexp_match(values, regex, Some(flags)) diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index eedc3ddb6d59..7bd45f359c1d 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -174,6 +174,18 @@ select regexp_like('aaa-555', '.*-(\d*)'); ---- true +query B +select regexp_like(arrow_cast('foobar', 'LargeUtf8'), 'b.r'); +---- +true + + +query B +select regexp_like(arrow_cast('foobar', 'LargeUtf8'), 'B.R', 'i'); +---- +true + + # # regexp_match tests # @@ -269,6 +281,17 @@ SELECT regexp_match('(?<=[A-Z]\w )Smith', 'John Smith', 'i'); ---- NULL +query ? +select regexp_match(arrow_cast('foobar', 'LargeUtf8'), 'b.r'); +---- +[bar] + +query ? +select regexp_match(arrow_cast('foobar', 'LargeUtf8'), 'B.R', 'i'); +---- +[bar] + + # ported test query ? 
SELECT regexp_match('aaa-555', '.*-(\d*)'); From 849b9854205cb8265d0a003d9668e64ec9af2d3a Mon Sep 17 00:00:00 2001 From: blaginin Date: Tue, 1 Oct 2024 19:14:37 +0100 Subject: [PATCH 2/3] Uncomment the shared tests in string_query.slt.part and remove the test copies everywhere else --- .../test_files/string/dictionary_utf8.slt | 30 ----------- .../sqllogictest/test_files/string/string.slt | 30 ----------- .../test_files/string/string_query.slt.part | 52 +++++++++---------- .../test_files/string/string_view.slt | 30 ----------- 4 files changed, 24 insertions(+), 118 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt index ea3c9b8eb6ca..c181f613ee9a 100644 --- a/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt +++ b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt @@ -53,36 +53,6 @@ Xiangpeng datafusion数据融合 false true false true Raphael datafusionДатаФусион false false false false NULL NULL NULL NULL NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query BBBB -SELECT - REGEXP_LIKE(ascii_1, 'an'), - REGEXP_LIKE(unicode_1, 'таФ'), - REGEXP_LIKE(ascii_1, NULL), - REGEXP_LIKE(unicode_1, NULL) -FROM test_basic_operator; ----- -false false NULL NULL -true false NULL NULL -false true NULL NULL -NULL NULL NULL NULL - -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query ???? 
-SELECT - REGEXP_MATCH(ascii_1, 'an'), - REGEXP_MATCH(unicode_1, 'таФ'), - REGEXP_MATCH(ascii_1, NULL), - REGEXP_MATCH(unicode_1, NULL) -FROM test_basic_operator; ----- -NULL NULL NULL NULL -[an] NULL NULL NULL -NULL [таФ] NULL NULL -NULL NULL NULL NULL - # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 6b89147c5c4f..f4e83966f78f 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -63,36 +63,6 @@ Xiangpeng datafusion数据融合 false true false true Raphael datafusionДатаФусион false false false false NULL NULL NULL NULL NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query BBBB -SELECT - REGEXP_LIKE(ascii_1, 'an'), - REGEXP_LIKE(unicode_1, 'таФ'), - REGEXP_LIKE(ascii_1, NULL), - REGEXP_LIKE(unicode_1, NULL) -FROM test_basic_operator; ----- -false false NULL NULL -true false NULL NULL -false true NULL NULL -NULL NULL NULL NULL - -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query ???? 
-SELECT - REGEXP_MATCH(ascii_1, 'an'), - REGEXP_MATCH(unicode_1, 'таФ'), - REGEXP_MATCH(ascii_1, NULL), - REGEXP_MATCH(unicode_1, NULL) -FROM test_basic_operator; ----- -NULL NULL NULL NULL -[an] NULL NULL NULL -NULL [таФ] NULL NULL -NULL NULL NULL NULL - # TODO: move it back to `string_query.slt.part` after fixing the issue # see detail: https://github.com/apache/datafusion/issues/12670 query IIIIII diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 0af0a6a642b2..e475d3fb2a13 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -856,39 +856,35 @@ NULL NULL # Test REGEXP_LIKE # -------------------------------------- -# TODO: LargeString does not support REGEXP_LIKE. Enable this after fixing the issue -# see issue: https://github.com/apache/datafusion/issues/12664 -#query BBBB -#SELECT -# REGEXP_LIKE(ascii_1, 'an'), -# REGEXP_LIKE(unicode_1, 'таФ'), -# REGEXP_LIKE(ascii_1, NULL), -# REGEXP_LIKE(unicode_1, NULL) -#FROM test_basic_operator; -#---- -#false false NULL NULL -#true false NULL NULL -#false true NULL NULL -#NULL NULL NULL NULL +query BBBB +SELECT + REGEXP_LIKE(ascii_1, 'an'), + REGEXP_LIKE(unicode_1, 'таФ'), + REGEXP_LIKE(ascii_1, NULL), + REGEXP_LIKE(unicode_1, NULL) +FROM test_basic_operator; +---- +false false NULL NULL +true false NULL NULL +false true NULL NULL +NULL NULL NULL NULL # -------------------------------------- # Test REGEXP_MATCH # -------------------------------------- -# TODO: LargeString does not support REGEXP_MATCH. Enable this after fixing the issue -# see issue: https://github.com/apache/datafusion/issues/12664 -#query ???? 
-#SELECT -# REGEXP_MATCH(ascii_1, 'an'), -# REGEXP_MATCH(unicode_1, 'таФ'), -# REGEXP_MATCH(ascii_1, NULL), -# REGEXP_MATCH(unicode_1, NULL) -#FROM test_basic_operator; -#---- -#NULL NULL NULL NULL -#[an] NULL NULL NULL -#NULL [таФ] NULL NULL -#NULL NULL NULL NULL +query ???? +SELECT + REGEXP_MATCH(ascii_1, 'an'), + REGEXP_MATCH(unicode_1, 'таФ'), + REGEXP_MATCH(ascii_1, NULL), + REGEXP_MATCH(unicode_1, NULL) +FROM test_basic_operator; +---- +NULL NULL NULL NULL +[an] NULL NULL NULL +NULL [таФ] NULL NULL +NULL NULL NULL NULL # -------------------------------------- # Test REPEAT diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index fb82726e3a9d..4e7857ad804b 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -50,36 +50,6 @@ false false false true NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query BBBB -SELECT - REGEXP_LIKE(ascii_1, 'an'), - REGEXP_LIKE(unicode_1, 'таФ'), - REGEXP_LIKE(ascii_1, NULL), - REGEXP_LIKE(unicode_1, NULL) -FROM test_basic_operator; ----- -false false NULL NULL -true false NULL NULL -false true NULL NULL -NULL NULL NULL NULL - -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query ???? 
-SELECT - REGEXP_MATCH(ascii_1, 'an'), - REGEXP_MATCH(unicode_1, 'таФ'), - REGEXP_MATCH(ascii_1, NULL), - REGEXP_MATCH(unicode_1, NULL) -FROM test_basic_operator; ----- -NULL NULL NULL NULL -[an] NULL NULL NULL -NULL [таФ] NULL NULL -NULL NULL NULL NULL - # TODO: move it back to `string_query.slt.part` after fixing the issue # see detail: https://github.com/apache/datafusion/issues/12670 query IIIIII From dbdb2fbb0dd0bb74e0f3ee3cab00a7585747421b Mon Sep 17 00:00:00 2001 From: blaginin Date: Tue, 1 Oct 2024 19:39:04 +0100 Subject: [PATCH 3/3] Test `LIKE` and `MATCH` with flags; Remove new tests from regexp.slt --- datafusion/sqllogictest/test_files/regexp.slt | 23 ---------- .../test_files/string/string_query.slt.part | 44 ++++++++++++------- 2 files changed, 28 insertions(+), 39 deletions(-) diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index 7bd45f359c1d..eedc3ddb6d59 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -174,18 +174,6 @@ select regexp_like('aaa-555', '.*-(\d*)'); ---- true -query B -select regexp_like(arrow_cast('foobar', 'LargeUtf8'), 'b.r'); ----- -true - - -query B -select regexp_like(arrow_cast('foobar', 'LargeUtf8'), 'B.R', 'i'); ----- -true - - # # regexp_match tests # @@ -281,17 +269,6 @@ SELECT regexp_match('(?<=[A-Z]\w )Smith', 'John Smith', 'i'); ---- NULL -query ? -select regexp_match(arrow_cast('foobar', 'LargeUtf8'), 'b.r'); ----- -[bar] - -query ? -select regexp_match(arrow_cast('foobar', 'LargeUtf8'), 'B.R', 'i'); ----- -[bar] - - # ported test query ? 
SELECT regexp_match('aaa-555', '.*-(\d*)'); diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index e475d3fb2a13..3ba2b31bbab2 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -856,35 +856,47 @@ NULL NULL # Test REGEXP_LIKE # -------------------------------------- -query BBBB +query BBBBBBBB SELECT + -- without flags REGEXP_LIKE(ascii_1, 'an'), REGEXP_LIKE(unicode_1, 'таФ'), REGEXP_LIKE(ascii_1, NULL), - REGEXP_LIKE(unicode_1, NULL) -FROM test_basic_operator; + REGEXP_LIKE(unicode_1, NULL), + -- with flags + REGEXP_LIKE(ascii_1, 'AN', 'i'), + REGEXP_LIKE(unicode_1, 'ТаФ', 'i'), + REGEXP_LIKE(ascii_1, NULL, 'i'), + REGEXP_LIKE(unicode_1, NULL, 'i') + FROM test_basic_operator; ---- -false false NULL NULL -true false NULL NULL -false true NULL NULL -NULL NULL NULL NULL +false false NULL NULL true false NULL NULL +true false NULL NULL true false NULL NULL +false true NULL NULL false true NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL # -------------------------------------- # Test REGEXP_MATCH # -------------------------------------- -query ???? +query ???????? 
SELECT + -- without flags REGEXP_MATCH(ascii_1, 'an'), - REGEXP_MATCH(unicode_1, 'таФ'), + REGEXP_MATCH(unicode_1, 'ТаФ'), REGEXP_MATCH(ascii_1, NULL), - REGEXP_MATCH(unicode_1, NULL) -FROM test_basic_operator; ----- -NULL NULL NULL NULL -[an] NULL NULL NULL -NULL [таФ] NULL NULL -NULL NULL NULL NULL + REGEXP_MATCH(unicode_1, NULL), + -- with flags + REGEXP_MATCH(ascii_1, 'AN', 'i'), + REGEXP_MATCH(unicode_1, 'таФ', 'i'), + REGEXP_MATCH(ascii_1, NULL, 'i'), + REGEXP_MATCH(unicode_1, NULL, 'i') +FROM test_basic_operator; +---- +NULL NULL NULL NULL [An] NULL NULL NULL +[an] NULL NULL NULL [an] NULL NULL NULL +NULL NULL NULL NULL NULL [таФ] NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL # -------------------------------------- # Test REPEAT