Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support Utf8View type in starts_with function #11787

Merged
merged 13 commits into from
Aug 6, 2024
1 change: 1 addition & 0 deletions datafusion/expr/src/expr_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ impl ExprSchemable for Expr {
.iter()
.map(|e| e.get_type(schema))
.collect::<Result<Vec<_>>>()?;

// verify that function is invoked with correct number and type of arguments as defined in `TypeSignature`
data_types_with_scalar_udf(&arg_data_types, func).map_err(|err| {
plan_datafusion_err!(
Expand Down
16 changes: 16 additions & 0 deletions datafusion/expr/src/type_coercion/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,10 @@ fn coerced_from<'a>(
(Interval(_), _) if matches!(type_from, Utf8 | LargeUtf8) => {
Some(type_into.clone())
}
// We can go into a Utf8View from a Utf8 or LargeUtf8
(Utf8View, _) if matches!(type_from, Utf8 | LargeUtf8 | Null) => {
Some(type_into.clone())
}
// Any type can be coerced into strings
(Utf8 | LargeUtf8, _) => Some(type_into.clone()),
(Null, _) if can_cast_types(type_from, type_into) => Some(type_into.clone()),
Expand Down Expand Up @@ -636,6 +640,18 @@ mod tests {
use super::*;
use arrow::datatypes::Field;

#[test]
fn test_string_conversion() {
let cases = vec![
(DataType::Utf8View, DataType::Utf8, true),
(DataType::Utf8View, DataType::LargeUtf8, true),
];

for case in cases {
assert_eq!(can_coerce_from(&case.0, &case.1), case.2);
}
}

#[test]
fn test_maybe_data_types() {
// this vec contains: arg1, arg2, expected result
Expand Down
92 changes: 72 additions & 20 deletions datafusion/functions/src/string/starts_with.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
use std::any::Any;
use std::sync::Arc;

use arrow::array::{ArrayRef, OffsetSizeTrait};
use arrow::array::ArrayRef;
use arrow::datatypes::DataType;

use datafusion_common::{cast::as_generic_string_array, internal_err, Result};
use datafusion_common::{internal_err, Result};
use datafusion_expr::ColumnarValue;
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
Expand All @@ -30,12 +30,8 @@ use crate::utils::make_scalar_function;

/// Returns true if string starts with prefix.
/// starts_with('alphabet', 'alph') = 't'
pub fn starts_with<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let left = as_generic_string_array::<T>(&args[0])?;
let right = as_generic_string_array::<T>(&args[1])?;

let result = arrow::compute::kernels::comparison::starts_with(left, right)?;

pub fn starts_with(args: &[ArrayRef]) -> Result<ArrayRef> {
let result = arrow::compute::kernels::comparison::starts_with(&args[0], &args[1])?;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Ok(Arc::new(result) as ArrayRef)
}

Expand All @@ -52,14 +48,15 @@ impl Default for StartsWithFunc {

impl StartsWithFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::one_of(
vec![
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8, LargeUtf8]),
Exact(vec![LargeUtf8, Utf8]),
Exact(vec![LargeUtf8, LargeUtf8]),
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![DataType::Utf8View, DataType::Utf8View]),
tshauck marked this conversation as resolved.
Show resolved Hide resolved
Exact(vec![DataType::Utf8, DataType::Utf8]),
Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]),
],
Volatility::Immutable,
),
Expand All @@ -81,18 +78,73 @@ impl ScalarUDFImpl for StartsWithFunc {
}

fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
use DataType::*;

Ok(Boolean)
Ok(DataType::Boolean)
}

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(starts_with::<i32>, vec![])(args),
DataType::LargeUtf8 => {
return make_scalar_function(starts_with::<i64>, vec![])(args);
DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => {
make_scalar_function(starts_with, vec![])(args)
}
_ => internal_err!("Unsupported data type"),
_ => internal_err!("Unsupported data types for starts_with. Expected Utf8, LargeUtf8 or Utf8View")?,
}
}
}

#[cfg(test)]
mod tests {
use crate::utils::test::test_function;
use arrow::array::{Array, BooleanArray};
use arrow::datatypes::DataType::Boolean;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

use super::*;

#[test]
fn test_functions() -> Result<()> {
// Generate test cases for starts_with
let test_cases = vec![
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this is the best way to test this change. It tests the function but doesn't a view wouldn't be coerced.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have tests for the coercion in #11753 -- you beat me to filing the ticket ❤️ -- I plan to make an epic / start filing items for the various other functions

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recommend using a slt test (example of sql test above)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to throw it out there, if you make the top level epic, I'd be happy to work though that PR and make individual issues for the problematic functions.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deal! I filed #11790

What I recommend doing is starting with a single ticket and making it a complete template / description with the idea being that that someone that is not familar DataFusion or Utf8View can work on it

Then when we file tickets for the rest of the functions we can reuse the same description over again. This

  1. Reduces the barrier to contribution
  2. I have found people love working on the code when there is a clear description

I took a shot at updating #11786 with background for use a template. Let me know what you think

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me. I'll have a go at replicating one of the other functions following #11786, and then if we're happy with it, I can grind through the others.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good -- thank you. What I have found works well in the past is to do like 10 at a time (if you make more than that the number of parallel threads / work I found overwhelmed our ability to review them)

(Some("alphabet"), Some("alph"), Some(true)),
(Some("alphabet"), Some("bet"), Some(false)),
(
Some("somewhat large string"),
Some("somewhat large"),
Some(true),
),
(Some("somewhat large string"), Some("large"), Some(false)),
]
.into_iter()
.flat_map(|(a, b, c)| {
let utf_8_args = vec![
ColumnarValue::Scalar(ScalarValue::Utf8(a.map(|s| s.to_string()))),
ColumnarValue::Scalar(ScalarValue::Utf8(b.map(|s| s.to_string()))),
];

let large_utf_8_args = vec![
ColumnarValue::Scalar(ScalarValue::LargeUtf8(a.map(|s| s.to_string()))),
ColumnarValue::Scalar(ScalarValue::LargeUtf8(b.map(|s| s.to_string()))),
];

let utf_8_view_args = vec![
ColumnarValue::Scalar(ScalarValue::Utf8View(a.map(|s| s.to_string()))),
ColumnarValue::Scalar(ScalarValue::Utf8View(b.map(|s| s.to_string()))),
];

vec![(utf_8_args, c), (large_utf_8_args, c), (utf_8_view_args, c)]
});

for (args, expected) in test_cases {
test_function!(
StartsWithFunc::new(),
&args,
Ok(expected),
bool,
Boolean,
BooleanArray
);
}

Ok(())
}
}
70 changes: 69 additions & 1 deletion datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,75 @@ logical_plan
01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]]
02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict]

### `STARTS_WITH`

# Test STARTS_WITH with utf8view against utf8view, utf8, and largeutf8
# (should be no casts)
query TT
EXPLAIN SELECT
STARTS_WITH(column1_utf8view, column2_utf8view) as c1,
STARTS_WITH(column1_utf8view, column2_utf8) as c2,
STARTS_WITH(column1_utf8view, column2_large_utf8) as c3
FROM test;
----
logical_plan
01)Projection: starts_with(test.column1_utf8view, test.column2_utf8view) AS c1, starts_with(test.column1_utf8view, CAST(test.column2_utf8 AS Utf8View)) AS c2, starts_with(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c3
02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view, column2_utf8view]

query BBB
SELECT
STARTS_WITH(column1_utf8view, column2_utf8view) as c1,
STARTS_WITH(column1_utf8view, column2_utf8) as c2,
STARTS_WITH(column1_utf8view, column2_large_utf8) as c3
FROM test;
----
false false false
true true true
true true true
NULL NULL NULL

# Test STARTS_WITH with utf8 against utf8view, utf8, and largeutf8
# Should work, but will have to cast to common types
# should cast utf8 -> utf8view and largeutf8 -> utf8view
query TT
EXPLAIN SELECT
STARTS_WITH(column1_utf8, column2_utf8view) as c1,
STARTS_WITH(column1_utf8, column2_utf8) as c3,
STARTS_WITH(column1_utf8, column2_large_utf8) as c4
FROM test;
----
logical_plan
01)Projection: starts_with(__common_expr_1, test.column2_utf8view) AS c1, starts_with(test.column1_utf8, test.column2_utf8) AS c3, starts_with(__common_expr_1, CAST(test.column2_large_utf8 AS Utf8View)) AS c4
02)--Projection: CAST(test.column1_utf8 AS Utf8View) AS __common_expr_1, test.column1_utf8, test.column2_utf8, test.column2_large_utf8, test.column2_utf8view
03)----TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view]

query BBB
SELECT
STARTS_WITH(column1_utf8, column2_utf8view) as c1,
STARTS_WITH(column1_utf8, column2_utf8) as c3,
STARTS_WITH(column1_utf8, column2_large_utf8) as c4
FROM test;
----
false false false
true true true
true true true
NULL NULL NULL


# Test STARTS_WITH with utf8view against literals
# In this case, the literals should be cast to utf8view. The columns
# should not be cast to utf8.
query TT
EXPLAIN SELECT
STARTS_WITH(column1_utf8view, 'äöüß') as c1,
STARTS_WITH(column1_utf8view, '') as c2,
STARTS_WITH(column1_utf8view, NULL) as c3,
STARTS_WITH(NULL, column1_utf8view) as c4
FROM test;
----
logical_plan
01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
02)--TableScan: test projection=[column1_utf8view]

statement ok
drop table test;
Expand All @@ -376,6 +445,5 @@ select t.dt from dates t where arrow_cast('2024-01-01', 'Utf8View') < t.dt;
----
2024-01-23


statement ok
drop table dates;