From 1bf07f129f0d34c631112bde244b1ae1c92cc7a1 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Fri, 2 Aug 2024 10:39:50 -0700 Subject: [PATCH 01/12] feat: support `Utf8View` for `starts_with` --- .../functions/src/string/starts_with.rs | 132 +++++++++++++++--- 1 file changed, 110 insertions(+), 22 deletions(-) diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 05bd960ff14b..85b98b51ebb5 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -18,7 +18,7 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, OffsetSizeTrait}; +use arrow::array::{ArrayRef, AsArray, OffsetSizeTrait}; use arrow::datatypes::DataType; use datafusion_common::{cast::as_generic_string_array, internal_err, Result}; @@ -29,14 +29,45 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use crate::utils::make_scalar_function; /// Returns true if string starts with prefix. 
-/// starts_with('alphabet', 'alph') = 't' +/// starts_with('alphabet', 'alph') = TRUE pub fn starts_with(args: &[ArrayRef]) -> Result { - let left = as_generic_string_array::(&args[0])?; - let right = as_generic_string_array::(&args[1])?; + let bool_result = match (args[0].data_type(), args[1].data_type()) { + (DataType::Utf8View, DataType::Utf8View) => { + let left = args[0].as_string_view(); + let right = args[1].as_string_view(); - let result = arrow::compute::kernels::comparison::starts_with(left, right)?; + let result = arrow::compute::kernels::comparison::starts_with(left, right)?; - Ok(Arc::new(result) as ArrayRef) + result + } + (DataType::Utf8View, DataType::Utf8 | DataType::LargeUtf8) => { + let left = args[0].as_string_view(); + let right = as_generic_string_array::(args[1].as_ref())?; + + let result = arrow::compute::kernels::comparison::starts_with(left, right)?; + + result + } + (DataType::Utf8 | DataType::LargeUtf8, DataType::Utf8View) => { + let left = as_generic_string_array::(args[0].as_ref())?; + let right = args[1].as_string_view(); + + let result = arrow::compute::kernels::comparison::starts_with(left, right)?; + + result + } + (DataType::Utf8 | DataType::LargeUtf8, DataType::Utf8 | DataType::LargeUtf8) => { + let left = as_generic_string_array::(args[0].as_ref())?; + let right = as_generic_string_array::(args[1].as_ref())?; + + let result = arrow::compute::kernels::comparison::starts_with(left, right)?; + + result + } + _ => internal_err!("Unsupported data types for starts_with")?, + }; + + Ok(Arc::new(bool_result) as ArrayRef) } #[derive(Debug)] @@ -53,16 +84,18 @@ impl Default for StartsWithFunc { impl StartsWithFunc { pub fn new() -> Self { use DataType::*; + + let string_types = vec![Utf8, LargeUtf8, Utf8View]; + let mut type_signatures = vec![]; + + for left in &string_types { + for right in &string_types { + type_signatures.push(Exact(vec![left.clone(), right.clone()])); + } + } + Self { - signature: Signature::one_of( - vec![ - 
Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8, LargeUtf8]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::one_of(type_signatures, Volatility::Immutable), } } } @@ -81,18 +114,73 @@ impl ScalarUDFImpl for StartsWithFunc { } fn return_type(&self, _arg_types: &[DataType]) -> Result { - use DataType::*; - - Ok(Boolean) + Ok(DataType::Boolean) } fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { DataType::Utf8 => make_scalar_function(starts_with::, vec![])(args), - DataType::LargeUtf8 => { - return make_scalar_function(starts_with::, vec![])(args); - } - _ => internal_err!("Unsupported data type"), + DataType::LargeUtf8 => make_scalar_function(starts_with::, vec![])(args), + DataType::Utf8View => make_scalar_function(starts_with::, vec![])(args), + _ => internal_err!("Unsupported data types for starts_with")?, } } } + +#[cfg(test)] +mod tests { + use crate::utils::test::test_function; + use arrow::array::{Array, BooleanArray}; + use arrow::datatypes::DataType::Boolean; + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use super::*; + + #[test] + fn test_functions() -> Result<()> { + // Generate test cases for starts_with + let test_cases = vec![ + (Some("alphabet"), Some("alph"), Some(true)), + (Some("alphabet"), Some("bet"), Some(false)), + ( + Some("somewhat large string"), + Some("somewhat large"), + Some(true), + ), + (Some("somewhat large string"), Some("large"), Some(false)), + ] + .into_iter() + .flat_map(|(a, b, c)| { + let utf_8_args = vec![ + ColumnarValue::Scalar(ScalarValue::Utf8(a.map(|s| s.to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(b.map(|s| s.to_string()))), + ]; + + let large_utf_8_args = vec![ + ColumnarValue::Scalar(ScalarValue::LargeUtf8(a.map(|s| s.to_string()))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(b.map(|s| s.to_string()))), + ]; + + let utf_8_view_args = 
vec![ + ColumnarValue::Scalar(ScalarValue::Utf8View(a.map(|s| s.to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8View(b.map(|s| s.to_string()))), + ]; + + vec![(utf_8_args, c), (large_utf_8_args, c), (utf_8_view_args, c)] + }); + + for (args, expected) in test_cases { + test_function!( + StartsWithFunc::new(), + &args, + Ok(expected), + bool, + Boolean, + BooleanArray + ); + } + + Ok(()) + } +} From 2c1dda9f60f6eb6fa055661a85ecf2e32d2365d5 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Fri, 2 Aug 2024 10:58:15 -0700 Subject: [PATCH 02/12] style: clippy --- datafusion/functions/src/string/starts_with.rs | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 85b98b51ebb5..16f32b6986d5 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -29,40 +29,32 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use crate::utils::make_scalar_function; /// Returns true if string starts with prefix. -/// starts_with('alphabet', 'alph') = TRUE +/// starts_with('alphabet', 'alph') = t pub fn starts_with(args: &[ArrayRef]) -> Result { let bool_result = match (args[0].data_type(), args[1].data_type()) { (DataType::Utf8View, DataType::Utf8View) => { let left = args[0].as_string_view(); let right = args[1].as_string_view(); - let result = arrow::compute::kernels::comparison::starts_with(left, right)?; - - result + arrow::compute::kernels::comparison::starts_with(left, right)? } (DataType::Utf8View, DataType::Utf8 | DataType::LargeUtf8) => { let left = args[0].as_string_view(); let right = as_generic_string_array::(args[1].as_ref())?; - let result = arrow::compute::kernels::comparison::starts_with(left, right)?; - - result + arrow::compute::kernels::comparison::starts_with(left, right)? 
} (DataType::Utf8 | DataType::LargeUtf8, DataType::Utf8View) => { let left = as_generic_string_array::(args[0].as_ref())?; let right = args[1].as_string_view(); - let result = arrow::compute::kernels::comparison::starts_with(left, right)?; - - result + arrow::compute::kernels::comparison::starts_with(left, right)? } (DataType::Utf8 | DataType::LargeUtf8, DataType::Utf8 | DataType::LargeUtf8) => { let left = as_generic_string_array::(args[0].as_ref())?; let right = as_generic_string_array::(args[1].as_ref())?; - let result = arrow::compute::kernels::comparison::starts_with(left, right)?; - - result + arrow::compute::kernels::comparison::starts_with(left, right)? } _ => internal_err!("Unsupported data types for starts_with")?, }; From 65ba700ebd3ed7504967a9fb2782da05e7d70a9e Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Fri, 2 Aug 2024 13:28:50 -0700 Subject: [PATCH 03/12] simplify string view handling --- .../functions/src/string/starts_with.rs | 65 +++++-------------- .../sqllogictest/test_files/string_view.slt | 12 ++++ 2 files changed, 29 insertions(+), 48 deletions(-) diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 16f32b6986d5..019956abe1c6 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -18,10 +18,10 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, AsArray, OffsetSizeTrait}; +use arrow::array::ArrayRef; use arrow::datatypes::DataType; -use datafusion_common::{cast::as_generic_string_array, internal_err, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -29,37 +29,10 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use crate::utils::make_scalar_function; /// Returns true if string starts with prefix. 
-/// starts_with('alphabet', 'alph') = t -pub fn starts_with(args: &[ArrayRef]) -> Result { - let bool_result = match (args[0].data_type(), args[1].data_type()) { - (DataType::Utf8View, DataType::Utf8View) => { - let left = args[0].as_string_view(); - let right = args[1].as_string_view(); - - arrow::compute::kernels::comparison::starts_with(left, right)? - } - (DataType::Utf8View, DataType::Utf8 | DataType::LargeUtf8) => { - let left = args[0].as_string_view(); - let right = as_generic_string_array::(args[1].as_ref())?; - - arrow::compute::kernels::comparison::starts_with(left, right)? - } - (DataType::Utf8 | DataType::LargeUtf8, DataType::Utf8View) => { - let left = as_generic_string_array::(args[0].as_ref())?; - let right = args[1].as_string_view(); - - arrow::compute::kernels::comparison::starts_with(left, right)? - } - (DataType::Utf8 | DataType::LargeUtf8, DataType::Utf8 | DataType::LargeUtf8) => { - let left = as_generic_string_array::(args[0].as_ref())?; - let right = as_generic_string_array::(args[1].as_ref())?; - - arrow::compute::kernels::comparison::starts_with(left, right)? 
- } - _ => internal_err!("Unsupported data types for starts_with")?, - }; - - Ok(Arc::new(bool_result) as ArrayRef) +/// starts_with('alphabet', 'alph') = 't' +pub fn starts_with(args: &[ArrayRef]) -> Result { + let result = arrow::compute::kernels::comparison::starts_with(&args[0], &args[1])?; + Ok(Arc::new(result) as ArrayRef) } #[derive(Debug)] @@ -75,19 +48,15 @@ impl Default for StartsWithFunc { impl StartsWithFunc { pub fn new() -> Self { - use DataType::*; - - let string_types = vec![Utf8, LargeUtf8, Utf8View]; - let mut type_signatures = vec![]; - - for left in &string_types { - for right in &string_types { - type_signatures.push(Exact(vec![left.clone(), right.clone()])); - } - } - Self { - signature: Signature::one_of(type_signatures, Volatility::Immutable), + signature: Signature::one_of( + vec![ + Exact(vec![DataType::Utf8View, DataType::Utf8View]), + Exact(vec![DataType::Utf8, DataType::Utf8]), + Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), + ], + Volatility::Immutable, + ), } } } @@ -111,9 +80,9 @@ impl ScalarUDFImpl for StartsWithFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function(starts_with::, vec![])(args), - DataType::LargeUtf8 => make_scalar_function(starts_with::, vec![])(args), - DataType::Utf8View => make_scalar_function(starts_with::, vec![])(args), + DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => { + make_scalar_function(starts_with, vec![])(args) + } _ => internal_err!("Unsupported data types for starts_with")?, } } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 763b4e99c614..0ba7a3950161 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -321,6 +321,7 @@ logical_plan 02)--Filter: CAST(test.column2_utf8 AS Utf8View) = test.column1_utf8view 03)----TableScan: test projection=[column1_utf8, 
column2_utf8, column1_utf8view] +<<<<<<< HEAD ## Test distinct aggregates query III SELECT @@ -355,6 +356,17 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]] 02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict] +======= +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8view, 'foo') as c, + STARTS_WITH(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: starts_with(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] +>>>>>>> 222263c15 (simplify string view handling) statement ok drop table test; From f38b363546a236061d981b8a81358bdec3dd133f Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Fri, 2 Aug 2024 15:55:44 -0700 Subject: [PATCH 04/12] fix: allow utf8 and largeutf8 to be cast into utf8view --- datafusion/expr/src/expr_schema.rs | 1 + datafusion/expr/src/type_coercion/functions.rs | 14 ++++++++++++++ datafusion/sqllogictest/test_files/string_view.slt | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 5e0571f712ee..12c8421eda8f 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -148,6 +148,7 @@ impl ExprSchemable for Expr { .iter() .map(|e| e.get_type(schema)) .collect::>>()?; + // verify that function is invoked with correct number and type of arguments as defined in `TypeSignature` data_types_with_scalar_udf(&arg_data_types, func).map_err(|err| { plan_datafusion_err!( diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index ef52a01e0598..147978b06f8d 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ 
b/datafusion/expr/src/type_coercion/functions.rs @@ -573,6 +573,8 @@ fn coerced_from<'a>( (Interval(_), _) if matches!(type_from, Utf8 | LargeUtf8) => { Some(type_into.clone()) } + // We can go into a Utf8View from a Utf8 or LargeUtf8 + (Utf8View, _) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into.clone()), // Any type can be coerced into strings (Utf8 | LargeUtf8, _) => Some(type_into.clone()), (Null, _) if can_cast_types(type_from, type_into) => Some(type_into.clone()), @@ -636,6 +638,18 @@ mod tests { use super::*; use arrow::datatypes::Field; + #[test] + fn test_string_conversion() { + let cases = vec![ + (DataType::Utf8View, DataType::Utf8, true), + (DataType::Utf8View, DataType::LargeUtf8, true), + ]; + + for case in cases { + assert_eq!(can_coerce_from(&case.0, &case.1), case.2); + } + } + #[test] fn test_maybe_data_types() { // this vec contains: arg1, arg2, expected result diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 0ba7a3950161..70abe3ca1cf7 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -364,7 +364,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: starts_with(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 +01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] >>>>>>> 222263c15 (simplify string view handling) From 96c05528a2760207c0d6a6d7430010a4b7e9ee2a Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Fri, 2 Aug 2024 16:00:50 -0700 Subject: [PATCH 05/12] fix: fix test --- datafusion/sqllogictest/test_files/string_view.slt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt 
b/datafusion/sqllogictest/test_files/string_view.slt index 70abe3ca1cf7..ed29d81f5352 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -321,7 +321,6 @@ logical_plan 02)--Filter: CAST(test.column2_utf8 AS Utf8View) = test.column1_utf8view 03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view] -<<<<<<< HEAD ## Test distinct aggregates query III SELECT @@ -356,7 +355,6 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]] 02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict] -======= query TT EXPLAIN SELECT STARTS_WITH(column1_utf8view, 'foo') as c, @@ -366,7 +364,6 @@ FROM test; logical_plan 01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ->>>>>>> 222263c15 (simplify string view handling) statement ok drop table test; @@ -388,6 +385,5 @@ select t.dt from dates t where arrow_cast('2024-01-01', 'Utf8View') < t.dt; ---- 2024-01-23 - statement ok drop table dates; From f203553d389c991b2915d094019da30d3baedd23 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sat, 3 Aug 2024 10:13:53 -0700 Subject: [PATCH 06/12] Apply suggestions from code review Co-authored-by: Yongting You <2010youy01@gmail.com> Co-authored-by: Andrew Lamb --- datafusion/functions/src/string/starts_with.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 019956abe1c6..642ee97a5002 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -51,6 +51,9 @@ impl StartsWithFunc { Self { signature: Signature::one_of( vec![ + // Planner attempts 
coercion to the target type starting with the most preferred candidate. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. + // If that fails, it proceeds to `(Utf8, Utf8)`. Exact(vec![DataType::Utf8View, DataType::Utf8View]), Exact(vec![DataType::Utf8, DataType::Utf8]), Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), @@ -83,7 +86,7 @@ impl ScalarUDFImpl for StartsWithFunc { DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => { make_scalar_function(starts_with, vec![])(args) } - _ => internal_err!("Unsupported data types for starts_with")?, + _ => internal_err!("Unsupported data types for starts_with. Expected Utf8, LargeUtf8 or Utf8View")?, } } } From 2a15df3a7f0289f401503d0fe8276a80703d43bd Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sat, 3 Aug 2024 10:16:46 -0700 Subject: [PATCH 07/12] style: fix format --- datafusion/functions/src/string/starts_with.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 642ee97a5002..8450697cbf30 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -51,7 +51,7 @@ impl StartsWithFunc { Self { signature: Signature::one_of( vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. + // Planner attempts coercion to the target type starting with the most preferred candidate. // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. // If that fails, it proceeds to `(Utf8, Utf8)`. 
Exact(vec![DataType::Utf8View, DataType::Utf8View]), From 221dc7677a5a9cafcbda0704f05914acce01b943 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sat, 3 Aug 2024 11:56:38 -0700 Subject: [PATCH 08/12] feat: add additional tests --- .../sqllogictest/test_files/string_view.slt | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index ed29d81f5352..a3cad075948c 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -355,15 +355,30 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]] 02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict] +# Test STARTS_WITH with utf8view against utf8view, dict, utf8, and largeutf8 query TT EXPLAIN SELECT - STARTS_WITH(column1_utf8view, 'foo') as c, - STARTS_WITH(column1_utf8view, column2_utf8view) as c2 + STARTS_WITH(column1_utf8view, column2_utf8view) as c1, + STARTS_WITH(column1_utf8view, column2_utf8) as c3, + STARTS_WITH(column1_utf8view, column2_large_utf8) as c4 FROM test; ---- logical_plan -01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 -02)--TableScan: test projection=[column1_utf8view, column2_utf8view] +01)Projection: starts_with(test.column1_utf8view, test.column2_utf8view) AS c1, starts_with(test.column1_utf8view, CAST(test.column2_utf8 AS Utf8View)) AS c3, starts_with(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c4 +02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view, column2_utf8view] + +# Test STARTS_WITH with utf8 against utf8view, dict, utf8, and largeutf8 +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8, column2_utf8view) as c1, + 
STARTS_WITH(column1_utf8, column2_utf8) as c3, + STARTS_WITH(column1_utf8, column2_large_utf8) as c4 +FROM test; +---- +logical_plan +01)Projection: starts_with(__common_expr_1, test.column2_utf8view) AS c1, starts_with(test.column1_utf8, test.column2_utf8) AS c3, starts_with(__common_expr_1, CAST(test.column2_large_utf8 AS Utf8View)) AS c4 +02)--Projection: CAST(test.column1_utf8 AS Utf8View) AS __common_expr_1, test.column1_utf8, test.column2_utf8, test.column2_large_utf8, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view] statement ok drop table test; From a80e1bc68b266ecc6ef4ff57c4067e5748d642e3 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 4 Aug 2024 11:40:12 -0700 Subject: [PATCH 09/12] tests: improve tests --- .../sqllogictest/test_files/string_view.slt | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index a3cad075948c..dbb0500f26b0 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -355,19 +355,19 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]] 02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict] -# Test STARTS_WITH with utf8view against utf8view, dict, utf8, and largeutf8 +# Test STARTS_WITH with utf8view against utf8view, utf8, and largeutf8 query TT EXPLAIN SELECT STARTS_WITH(column1_utf8view, column2_utf8view) as c1, - STARTS_WITH(column1_utf8view, column2_utf8) as c3, - STARTS_WITH(column1_utf8view, column2_large_utf8) as c4 + STARTS_WITH(column1_utf8view, column2_utf8) as c2, + STARTS_WITH(column1_utf8view, column2_large_utf8) as c3 FROM test; ---- logical_plan -01)Projection: starts_with(test.column1_utf8view, 
test.column2_utf8view) AS c1, starts_with(test.column1_utf8view, CAST(test.column2_utf8 AS Utf8View)) AS c3, starts_with(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c4 +01)Projection: starts_with(test.column1_utf8view, test.column2_utf8view) AS c1, starts_with(test.column1_utf8view, CAST(test.column2_utf8 AS Utf8View)) AS c2, starts_with(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c3 02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view, column2_utf8view] -# Test STARTS_WITH with utf8 against utf8view, dict, utf8, and largeutf8 +# Test STARTS_WITH with utf8 against utf8view, utf8, and largeutf8 query TT EXPLAIN SELECT STARTS_WITH(column1_utf8, column2_utf8view) as c1, @@ -380,6 +380,18 @@ logical_plan 02)--Projection: CAST(test.column1_utf8 AS Utf8View) AS __common_expr_1, test.column1_utf8, test.column2_utf8, test.column2_large_utf8, test.column2_utf8view 03)----TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view] +# Test STARTS_WITH with utf8view against various literals +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8view, 'äöüß') as c1, + STARTS_WITH(column1_utf8view, '') as c2, + STARTS_WITH(column1_utf8view, NULL) as c3 +FROM test; +---- +logical_plan +01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(CAST(test.column1_utf8view AS Utf8), Utf8(NULL)) AS c3 +02)--TableScan: test projection=[column1_utf8view] + statement ok drop table test; From f06f34286db9c965e59710802de8da0cfab2c42c Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 4 Aug 2024 11:47:55 -0700 Subject: [PATCH 10/12] fix: fix null case --- datafusion/expr/src/type_coercion/functions.rs | 4 +++- datafusion/sqllogictest/test_files/string_view.slt | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs 
b/datafusion/expr/src/type_coercion/functions.rs index 147978b06f8d..af50f297bf3c 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -574,7 +574,9 @@ fn coerced_from<'a>( Some(type_into.clone()) } // We can go into a Utf8View from a Utf8 or LargeUtf8 - (Utf8View, _) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into.clone()), + (Utf8View, _) if matches!(type_from, Utf8 | LargeUtf8 | Null) => { + Some(type_into.clone()) + } // Any type can be coerced into strings (Utf8 | LargeUtf8, _) => Some(type_into.clone()), (Null, _) if can_cast_types(type_from, type_into) => Some(type_into.clone()), diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index dbb0500f26b0..e4cdf4a46267 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -389,7 +389,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(CAST(test.column1_utf8view AS Utf8), Utf8(NULL)) AS c3 +01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3 02)--TableScan: test projection=[column1_utf8view] statement ok From 5806155c295be3378d5d53fa8b74c485ad80a72a Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Sun, 4 Aug 2024 11:50:07 -0700 Subject: [PATCH 11/12] tests: one more null test --- datafusion/sqllogictest/test_files/string_view.slt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index e4cdf4a46267..fc795ccef3a6 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ 
b/datafusion/sqllogictest/test_files/string_view.slt @@ -385,11 +385,12 @@ query TT EXPLAIN SELECT STARTS_WITH(column1_utf8view, 'äöüß') as c1, STARTS_WITH(column1_utf8view, '') as c2, - STARTS_WITH(column1_utf8view, NULL) as c3 + STARTS_WITH(column1_utf8view, NULL) as c3, + STARTS_WITH(NULL, column1_utf8view) as c4 FROM test; ---- logical_plan -01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3 +01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 02)--TableScan: test projection=[column1_utf8view] statement ok From 82ce1fa37f625b5fcf6f8587d78515984598bb2c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 5 Aug 2024 16:02:44 -0400 Subject: [PATCH 12/12] Test comments and execution tests --- .../sqllogictest/test_files/string_view.slt | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index fc795ccef3a6..584d3b330690 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -355,7 +355,10 @@ logical_plan 01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]] 02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict] +### `STARTS_WITH` + # Test STARTS_WITH with utf8view against utf8view, utf8, and largeutf8 +# (utf8view arguments need no cast; utf8 and largeutf8 arguments are cast to utf8view) query TT EXPLAIN SELECT STARTS_WITH(column1_utf8view, column2_utf8view) as c1, @@ -367,7 +370,21 @@ logical_plan 01)Projection: starts_with(test.column1_utf8view, test.column2_utf8view) AS c1, 
starts_with(test.column1_utf8view, CAST(test.column2_utf8 AS Utf8View)) AS c2, starts_with(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c3 02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view, column2_utf8view] +query BBB +SELECT + STARTS_WITH(column1_utf8view, column2_utf8view) as c1, + STARTS_WITH(column1_utf8view, column2_utf8) as c2, + STARTS_WITH(column1_utf8view, column2_large_utf8) as c3 +FROM test; +---- +false false false +true true true +true true true +NULL NULL NULL + # Test STARTS_WITH with utf8 against utf8view, utf8, and largeutf8 +# Should work, but will have to cast to common types +# should cast utf8 -> utf8view and largeutf8 -> utf8view query TT EXPLAIN SELECT STARTS_WITH(column1_utf8, column2_utf8view) as c1, @@ -380,7 +397,22 @@ logical_plan 02)--Projection: CAST(test.column1_utf8 AS Utf8View) AS __common_expr_1, test.column1_utf8, test.column2_utf8, test.column2_large_utf8, test.column2_utf8view 03)----TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view] -# Test STARTS_WITH with utf8view against various literals +query BBB + SELECT + STARTS_WITH(column1_utf8, column2_utf8view) as c1, + STARTS_WITH(column1_utf8, column2_utf8) as c3, + STARTS_WITH(column1_utf8, column2_large_utf8) as c4 +FROM test; +---- +false false false +true true true +true true true +NULL NULL NULL + + +# Test STARTS_WITH with utf8view against literals +# In this case, the literals should be cast to utf8view. The columns +# should not be cast to utf8. query TT EXPLAIN SELECT STARTS_WITH(column1_utf8view, 'äöüß') as c1,