From cda52307700446e8a75c8c70e3c7ec02eb9e4d66 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 28 Mar 2024 19:15:07 -0400 Subject: [PATCH] move the Translate, SubstrIndex, FindInSet functions to new datafusion-functions crate --- datafusion-cli/Cargo.lock | 1 + datafusion/expr/src/built_in_function.rs | 40 ---- datafusion/expr/src/expr_fn.rs | 7 - datafusion/functions/Cargo.toml | 3 +- .../functions/src/unicode/find_in_set.rs | 129 +++++++++++ datafusion/functions/src/unicode/mod.rs | 24 ++ .../functions/src/unicode/substrindex.rs | 148 ++++++++++++ datafusion/functions/src/unicode/translate.rs | 218 ++++++++++++++++++ datafusion/physical-expr/src/functions.rs | 148 +----------- datafusion/physical-expr/src/lib.rs | 2 - .../physical-expr/src/unicode_expressions.rs | 172 -------------- datafusion/proto/proto/datafusion.proto | 6 +- datafusion/proto/src/generated/pbjson.rs | 9 - datafusion/proto/src/generated/prost.rs | 12 +- .../proto/src/logical_plan/from_proto.rs | 26 +-- datafusion/proto/src/logical_plan/to_proto.rs | 3 - 16 files changed, 534 insertions(+), 414 deletions(-) create mode 100644 datafusion/functions/src/unicode/find_in_set.rs create mode 100644 datafusion/functions/src/unicode/substrindex.rs create mode 100644 datafusion/functions/src/unicode/translate.rs delete mode 100644 datafusion/physical-expr/src/unicode_expressions.rs diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index ba60c04cea55..522531a83179 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1267,6 +1267,7 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-expr", + "hashbrown 0.14.3", "hex", "itertools", "log", diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 423fc11c1d8c..487bb893016f 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -113,12 +113,6 @@ pub enum BuiltinScalarFunction { InitCap, /// random Random, - /// translate - Translate, - /// substr_index - SubstrIndex, - /// find_in_set - FindInSet, } /// Maps the sql function name to `BuiltinScalarFunction` @@ -207,9 +201,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::EndsWith => Volatility::Immutable, BuiltinScalarFunction::InitCap => Volatility::Immutable, BuiltinScalarFunction::Radians => Volatility::Immutable, - BuiltinScalarFunction::Translate => Volatility::Immutable, - BuiltinScalarFunction::SubstrIndex => Volatility::Immutable, - BuiltinScalarFunction::FindInSet => Volatility::Immutable, // Volatile builtin functions BuiltinScalarFunction::Random => Volatility::Volatile, @@ -246,15 +237,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Pi => Ok(Float64), BuiltinScalarFunction::Random => Ok(Float64), BuiltinScalarFunction::EndsWith => Ok(Boolean), - BuiltinScalarFunction::SubstrIndex => { - utf8_to_str_type(&input_expr_types[0], "substr_index") - } - BuiltinScalarFunction::FindInSet => { - utf8_to_int_type(&input_expr_types[0], "find_in_set") - } - BuiltinScalarFunction::Translate => { - utf8_to_str_type(&input_expr_types[0], "translate") - } BuiltinScalarFunction::Factorial | BuiltinScalarFunction::Gcd @@ -338,22 +320,6 @@ impl BuiltinScalarFunction { ], self.volatility(), ), - - BuiltinScalarFunction::SubstrIndex => Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64]), - ], - self.volatility(), - ), - BuiltinScalarFunction::FindInSet => Signature::one_of( - vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], - self.volatility(), - ), - - BuiltinScalarFunction::Translate => { - Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility()) - } BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()), BuiltinScalarFunction::Random => Signature::exact(vec![], self.volatility()), BuiltinScalarFunction::Power => Signature::one_of( @@ -513,9 +479,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"], BuiltinScalarFunction::EndsWith => &["ends_with"], BuiltinScalarFunction::InitCap => &["initcap"], - BuiltinScalarFunction::Translate => &["translate"], - BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"], - BuiltinScalarFunction::FindInSet => &["find_in_set"], } } } @@ -580,9 +543,6 @@ macro_rules! get_optimal_return_type { // `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size. get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8); -// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size. -get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32); - #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 09170ae639ff..6cbe32b81e9d 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -579,7 +579,6 @@ scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`"); scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase"); scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`"); -scalar_expr!(Translate, translate, string from to, "replaces the characters in `from` with the counterpart in `to`"); nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which evaluates to the value of the first [Expr] which is not NULL"); //there is a func concat_ws before, so use concat_ws_expr as name.c nary_scalar_expr!( @@ -596,9 +595,6 @@ scalar_expr!( "returns true if a given number is +0.0 or -0.0 otherwise returns false" ); -scalar_expr!(SubstrIndex, substr_index, string delimiter count, "Returns the substring from str before count occurrences of the delimiter"); -scalar_expr!(FindInSet, find_in_set, str strlist, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings"); - /// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression. pub fn case(expr: Expr) -> CaseBuilder { CaseBuilder::new(Some(Box::new(expr)), vec![], vec![], None) @@ -1012,8 +1008,5 @@ mod test { test_scalar_expr!(Lcm, lcm, arg_1, arg_2); test_scalar_expr!(InitCap, initcap, string); test_scalar_expr!(EndsWith, ends_with, string, characters); - test_scalar_expr!(Translate, translate, string, from, to); - test_scalar_expr!(SubstrIndex, substr_index, string, delimiter, count); - test_scalar_expr!(FindInSet, find_in_set, string, stringlist); } } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 0cab0276ff4b..70c6b3e238d6 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -54,7 +54,7 @@ regex_expressions = ["regex"] # enable string functions string_expressions = [] # enable unicode functions -unicode_expressions = ["unicode-segmentation"] +unicode_expressions = ["hashbrown", "unicode-segmentation"] [lib] name = "datafusion_functions" @@ -72,6 +72,7 @@ datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-physical-expr = { workspace = true, default-features = true } +hashbrown = { version = "0.14", features = ["raw"], optional = true } hex = { version = "0.4", optional = true } itertools = { workspace = true } log = { workspace = true } diff --git a/datafusion/functions/src/unicode/find_in_set.rs b/datafusion/functions/src/unicode/find_in_set.rs new file mode 100644 index 000000000000..9a9ab09de2f3 --- /dev/null +++ b/datafusion/functions/src/unicode/find_in_set.rs @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ + ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray, +}; +use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; + +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_int_type}; + +#[derive(Debug)] +pub(super) struct FindInSetFunc { + signature: Signature, +} + +impl FindInSetFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for FindInSetFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "find_in_set" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_int_type(&arg_types[0], "find_in_set") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => { + make_scalar_function(find_in_set::, vec![])(args) + } + DataType::LargeUtf8 => { + make_scalar_function(find_in_set::, vec![])(args) + } + other => { + exec_err!("Unsupported data type {other:?} for function find_in_set") + } + } + } +} + +///Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings +///A string list is a string composed of substrings separated by , characters. +pub fn find_in_set(args: &[ArrayRef]) -> Result +where + T::Native: OffsetSizeTrait, +{ + if args.len() != 2 { + return exec_err!( + "find_in_set was called with {} arguments. It requires 2.", + args.len() + ); + } + + let str_array: &GenericStringArray = + as_generic_string_array::(&args[0])?; + let str_list_array: &GenericStringArray = + as_generic_string_array::(&args[1])?; + + let result = str_array + .iter() + .zip(str_list_array.iter()) + .map(|(string, str_list)| match (string, str_list) { + (Some(string), Some(str_list)) => { + let mut res = 0; + let str_set: Vec<&str> = str_list.split(',').collect(); + for (idx, str) in str_set.iter().enumerate() { + if str == &string { + res = idx + 1; + break; + } + } + T::Native::from_usize(res) + } + _ => None, + }) + .collect::>(); + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use datafusion_common::Result; + + #[test] + fn test_functions() -> Result<()> { + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs index ddab0d1e27c9..eba4cd5048eb 100644 --- a/datafusion/functions/src/unicode/mod.rs +++ b/datafusion/functions/src/unicode/mod.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use datafusion_expr::ScalarUDF; mod character_length; +mod find_in_set; mod left; mod lpad; mod reverse; @@ -29,6 +30,8 @@ mod right; mod rpad; mod strpos; mod substr; +mod substrindex; +mod translate; // create UDFs make_udf_function!( @@ -36,6 +39,7 @@ make_udf_function!( CHARACTER_LENGTH, character_length ); +make_udf_function!(find_in_set::FindInSetFunc, FIND_IN_SET, find_in_set); make_udf_function!(left::LeftFunc, LEFT, left); make_udf_function!(lpad::LPadFunc, LPAD, lpad); make_udf_function!(right::RightFunc, RIGHT, right); @@ -43,6 +47,8 @@ make_udf_function!(reverse::ReverseFunc, REVERSE, reverse); make_udf_function!(rpad::RPadFunc, RPAD, rpad); make_udf_function!(strpos::StrposFunc, STRPOS, strpos); make_udf_function!(substr::SubstrFunc, SUBSTR, substr); +make_udf_function!(substrindex::SubstrIndexFunc, SUBSTR_INDEX, substr_index); +make_udf_function!(translate::TranslateFunc, TRANSLATE, translate); pub mod expr_fn { use datafusion_expr::Expr; @@ -57,6 +63,11 @@ pub mod expr_fn { super::character_length().call(vec![string]) } + #[doc = "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings"] + pub fn find_in_set(string: Expr, strlist: Expr) -> Expr { + super::find_in_set().call(vec![string, strlist]) + } + #[doc = "finds the position from where the `substring` matches the `string`"] pub fn instr(string: Expr, substring: Expr) -> Expr { strpos(string, substring) @@ -111,12 +122,23 @@ pub mod expr_fn { pub fn substring(string: Expr, position: Expr, length: Expr) -> Expr { super::substr().call(vec![string, position, length]) } + + #[doc = "Returns the substring from str before count occurrences of the delimiter"] + pub fn substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr { + super::substr_index().call(vec![string, delimiter, count]) + } + + #[doc = "replaces the characters in `from` with the counterpart in `to`"] + pub fn translate(string: Expr, from: Expr, to: Expr) -> Expr { + super::translate().call(vec![string, from, to]) + } } /// Return a list of all functions in this package pub fn functions() -> Vec> { vec![ character_length(), + find_in_set(), left(), lpad(), reverse(), @@ -124,5 +146,7 @@ pub fn functions() -> Vec> { rpad(), strpos(), substr(), + substr_index(), + translate(), ] } diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs new file mode 100644 index 000000000000..d115a31cb54f --- /dev/null +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct SubstrIndexFunc { + signature: Signature, + aliases: Vec, +} + +impl SubstrIndexFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Utf8, Int64]), + Exact(vec![LargeUtf8, LargeUtf8, Int64]), + ], + Volatility::Immutable, + ), + aliases: vec![String::from("substring_index")], + } + } +} + +impl ScalarUDFImpl for SubstrIndexFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "substr_index" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "substr_index") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(substr_index::, vec![])(args), + DataType::LargeUtf8 => { + make_scalar_function(substr_index::, vec![])(args) + } + other => { + exec_err!("Unsupported data type {other:?} for function substr_index") + } + } + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +/// Returns the substring from str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. +/// SUBSTRING_INDEX('www.apache.org', '.', 1) = www +/// SUBSTRING_INDEX('www.apache.org', '.', 2) = www.apache +/// SUBSTRING_INDEX('www.apache.org', '.', -2) = apache.org +/// SUBSTRING_INDEX('www.apache.org', '.', -1) = org +pub fn substr_index(args: &[ArrayRef]) -> Result { + if args.len() != 3 { + return exec_err!( + "substr_index was called with {} arguments. It requires 3.", + args.len() + ); + } + + let string_array = as_generic_string_array::(&args[0])?; + let delimiter_array = as_generic_string_array::(&args[1])?; + let count_array = as_int64_array(&args[2])?; + + let result = string_array + .iter() + .zip(delimiter_array.iter()) + .zip(count_array.iter()) + .map(|((string, delimiter), n)| match (string, delimiter, n) { + (Some(string), Some(delimiter), Some(n)) => { + // In MySQL, these cases will return an empty string. + if n == 0 || string.is_empty() || delimiter.is_empty() { + return Some(String::new()); + } + + let splitted: Box> = if n > 0 { + Box::new(string.split(delimiter)) + } else { + Box::new(string.rsplit(delimiter)) + }; + let occurrences = usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX); + // The length of the substring covered by substr_index. + let length = splitted + .take(occurrences) // at least 1 element, since n != 0 + .map(|s| s.len() + delimiter.len()) + .sum::() + - delimiter.len(); + if n > 0 { + Some(string[..length].to_owned()) + } else { + Some(string[string.len() - length..].to_owned()) + } + } + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use datafusion_common::Result; + + #[test] + fn test_functions() -> Result<()> { + Ok(()) + } +} diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs new file mode 100644 index 000000000000..25f6aed55fd1 --- /dev/null +++ b/datafusion/functions/src/unicode/translate.rs @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; +use hashbrown::HashMap; +use unicode_segmentation::UnicodeSegmentation; + +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct TranslateFunc { + signature: Signature, +} + +impl TranslateFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Utf8, Utf8])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for TranslateFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "translate" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "translate") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(translate::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(translate::, vec![])(args), + other => { + exec_err!("Unsupported data type {other:?} for function translate") + } + } + } +} + +/// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. +/// translate('12345', '143', 'ax') = 'a2x5' +fn translate(args: &[ArrayRef]) -> Result { + let string_array = as_generic_string_array::(&args[0])?; + let from_array = as_generic_string_array::(&args[1])?; + let to_array = as_generic_string_array::(&args[2])?; + + let result = string_array + .iter() + .zip(from_array.iter()) + .zip(to_array.iter()) + .map(|((string, from), to)| match (string, from, to) { + (Some(string), Some(from), Some(to)) => { + // create a hashmap of [char, index] to change from O(n) to O(1) for from list + let from_map: HashMap<&str, usize> = from + .graphemes(true) + .collect::>() + .iter() + .enumerate() + .map(|(index, c)| (c.to_owned(), index)) + .collect(); + + let to = to.graphemes(true).collect::>(); + + Some( + string + .graphemes(true) + .collect::>() + .iter() + .flat_map(|c| match from_map.get(*c) { + Some(n) => to.get(*n).copied(), + None => Some(*c), + }) + .collect::>() + .concat(), + ) + } + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::translate::TranslateFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() -> Result<()> { + #[cfg(feature = "unicode_expressions")] + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("12345")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("143")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("ax")))) + ], + Ok(Some("a2x5")), + &str, + Utf8, + StringArray + ); + #[cfg(feature = "unicode_expressions")] + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("143")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("ax")))) + ], + Ok(None), + &str, + Utf8, + StringArray + ); + #[cfg(feature = "unicode_expressions")] + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("12345")))), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("ax")))) + ], + Ok(None), + &str, + Utf8, + StringArray + ); + #[cfg(feature = "unicode_expressions")] + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("12345")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("143")))), + ColumnarValue::Scalar(ScalarValue::Utf8(None)) + ], + Ok(None), + &str, + Utf8, + StringArray + ); + #[cfg(feature = "unicode_expressions")] + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("é2íñ5")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("éñí")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("óü")))), + ], + Ok(Some("ó2ü5")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("12345")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("143")))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("ax")))), + ], + internal_err!( + "function translate requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 513dd71d4074..3717ab6b3cad 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -35,7 +35,7 @@ use std::sync::Arc; use arrow::{ array::ArrayRef, - datatypes::{DataType, Int32Type, Int64Type, Schema}, + datatypes::{DataType, Schema}, }; use arrow_array::Array; @@ -85,26 +85,6 @@ pub fn create_physical_expr( ))) } -#[cfg(feature = "unicode_expressions")] -macro_rules! invoke_if_unicode_expressions_feature_flag { - ($FUNC:ident, $T:tt, $NAME:expr) => {{ - use crate::unicode_expressions; - unicode_expressions::$FUNC::<$T> - }}; -} - -#[cfg(not(feature = "unicode_expressions"))] -macro_rules! invoke_if_unicode_expressions_feature_flag { - ($FUNC:ident, $T:tt, $NAME:expr) => { - |_: &[ArrayRef]| -> Result { - internal_err!( - "function {} requires compilation with feature flag: unicode_expressions.", - $NAME - ) - } - }; -} - #[derive(Debug, Clone, Copy)] pub enum Hint { /// Indicates the argument needs to be padded if it is scalar @@ -281,71 +261,6 @@ pub fn create_physical_fun( exec_err!("Unsupported data type {other:?} for function ends_with") } }), - BuiltinScalarFunction::Translate => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - translate, - i32, - "translate" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - translate, - i64, - "translate" - ); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function translate") - } - }), - BuiltinScalarFunction::SubstrIndex => { - Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - substr_index, - i32, - "substr_index" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - substr_index, - i64, - "substr_index" - ); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function substr_index") - } - }) - } - BuiltinScalarFunction::FindInSet => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - find_in_set, - Int32Type, - "find_in_set" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - find_in_set, - Int64Type, - "find_in_set" - ); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function find_in_set") - } - }), }) } @@ -634,66 +549,7 @@ mod tests { Boolean, BooleanArray ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit("12345"), lit("143"), lit("ax"),], - Ok(Some("a2x5")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit(ScalarValue::Utf8(None)), lit("143"), lit("ax"),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit("12345"), lit(ScalarValue::Utf8(None)), lit("ax"),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit("12345"), lit("143"), lit(ScalarValue::Utf8(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit("é2íñ5"), lit("éñí"), lit("óü"),], - Ok(Some("ó2ü5")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Translate, - &[ - lit("12345"), - lit("143"), - lit("ax"), - ], - internal_err!( - "function translate requires compilation with feature flag: unicode_expressions." - ), - &str, - Utf8, - StringArray - ); + Ok(()) } diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 1dead099540b..7819d5116160 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -33,8 +33,6 @@ pub mod sort_properties; pub mod string_expressions; pub mod tree_node; pub mod udf; -#[cfg(feature = "unicode_expressions")] -pub mod unicode_expressions; pub mod utils; pub mod window; diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs deleted file mode 100644 index ecbd1ea320d4..000000000000 --- a/datafusion/physical-expr/src/unicode_expressions.rs +++ /dev/null @@ -1,172 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Some of these functions reference the Postgres documentation -// or implementation to ensure compatibility and are subject to -// the Postgres license. - -//! Unicode expressions - -use std::sync::Arc; - -use arrow::{ - array::{ArrayRef, GenericStringArray, OffsetSizeTrait, PrimitiveArray}, - datatypes::{ArrowNativeType, ArrowPrimitiveType}, -}; -use hashbrown::HashMap; -use unicode_segmentation::UnicodeSegmentation; - -use datafusion_common::{ - cast::{as_generic_string_array, as_int64_array}, - exec_err, Result, -}; - -/// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. -/// translate('12345', '143', 'ax') = 'a2x5' -pub fn translate(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let from_array = as_generic_string_array::(&args[1])?; - let to_array = as_generic_string_array::(&args[2])?; - - let result = string_array - .iter() - .zip(from_array.iter()) - .zip(to_array.iter()) - .map(|((string, from), to)| match (string, from, to) { - (Some(string), Some(from), Some(to)) => { - // create a hashmap of [char, index] to change from O(n) to O(1) for from list - let from_map: HashMap<&str, usize> = from - .graphemes(true) - .collect::>() - .iter() - .enumerate() - .map(|(index, c)| (c.to_owned(), index)) - .collect(); - - let to = to.graphemes(true).collect::>(); - - Some( - string - .graphemes(true) - .collect::>() - .iter() - .flat_map(|c| match from_map.get(*c) { - Some(n) => to.get(*n).copied(), - None => Some(*c), - }) - .collect::>() - .concat(), - ) - } - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Returns the substring from str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. -/// SUBSTRING_INDEX('www.apache.org', '.', 1) = www -/// SUBSTRING_INDEX('www.apache.org', '.', 2) = www.apache -/// SUBSTRING_INDEX('www.apache.org', '.', -2) = apache.org -/// SUBSTRING_INDEX('www.apache.org', '.', -1) = org -pub fn substr_index(args: &[ArrayRef]) -> Result { - if args.len() != 3 { - return exec_err!( - "substr_index was called with {} arguments. It requires 3.", - args.len() - ); - } - - let string_array = as_generic_string_array::(&args[0])?; - let delimiter_array = as_generic_string_array::(&args[1])?; - let count_array = as_int64_array(&args[2])?; - - let result = string_array - .iter() - .zip(delimiter_array.iter()) - .zip(count_array.iter()) - .map(|((string, delimiter), n)| match (string, delimiter, n) { - (Some(string), Some(delimiter), Some(n)) => { - // In MySQL, these cases will return an empty string. - if n == 0 || string.is_empty() || delimiter.is_empty() { - return Some(String::new()); - } - - let splitted: Box> = if n > 0 { - Box::new(string.split(delimiter)) - } else { - Box::new(string.rsplit(delimiter)) - }; - let occurrences = usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX); - // The length of the substring covered by substr_index. - let length = splitted - .take(occurrences) // at least 1 element, since n != 0 - .map(|s| s.len() + delimiter.len()) - .sum::() - - delimiter.len(); - if n > 0 { - Some(string[..length].to_owned()) - } else { - Some(string[string.len() - length..].to_owned()) - } - } - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -///Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings -///A string list is a string composed of substrings separated by , characters. -pub fn find_in_set(args: &[ArrayRef]) -> Result -where - T::Native: OffsetSizeTrait, -{ - if args.len() != 2 { - return exec_err!( - "find_in_set was called with {} arguments. It requires 2.", - args.len() - ); - } - - let str_array: &GenericStringArray = - as_generic_string_array::(&args[0])?; - let str_list_array: &GenericStringArray = - as_generic_string_array::(&args[1])?; - - let result = str_array - .iter() - .zip(str_list_array.iter()) - .map(|(string, str_list)| match (string, str_list) { - (Some(string), Some(str_list)) => { - let mut res = 0; - let str_set: Vec<&str> = str_list.split(',').collect(); - for (idx, str) in str_set.iter().enumerate() { - if str == &string { - res = idx + 1; - break; - } - } - T::Native::from_usize(res) - } - _ => None, - }) - .collect::>(); - Ok(Arc::new(result) as ArrayRef) -} diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 3a187eabe836..b9fc92e1eeb0 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -601,7 +601,7 @@ enum ScalarFunction { // 57 was ToTimestampMicros // 58 was ToTimestampSeconds // 59 was Now - Translate = 60; + // 60 was Translate // Trim = 61; // Upper = 62; Coalesce = 63; @@ -665,8 +665,8 @@ enum ScalarFunction { // 123 is ArrayExcept // 124 was ArrayPopFront // 125 was Levenshtein - SubstrIndex = 126; - FindInSet = 127; + // 126 was SubstrIndex + // 127 was FindInSet // 128 was ArraySort // 129 was ArrayDistinct // 130 was ArrayResize diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 07b91b26d60b..179fb0aaf824 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22932,7 +22932,6 @@ impl serde::Serialize for ScalarFunction { Self::ConcatWithSeparator => "ConcatWithSeparator", Self::InitCap => "InitCap", Self::Random => "Random", - Self::Translate => "Translate", Self::Coalesce => "Coalesce", Self::Power => "Power", Self::Atan2 => "Atan2", @@ -22951,8 +22950,6 @@ impl serde::Serialize for ScalarFunction { Self::Cot => "Cot", Self::Nanvl => "Nanvl", Self::Iszero => "Iszero", - Self::SubstrIndex => "SubstrIndex", - Self::FindInSet => "FindInSet", Self::EndsWith => "EndsWith", }; serializer.serialize_str(variant) @@ -22984,7 +22981,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ConcatWithSeparator", "InitCap", "Random", - "Translate", "Coalesce", "Power", "Atan2", @@ -23003,8 +22999,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Cot", "Nanvl", "Iszero", - "SubstrIndex", - "FindInSet", "EndsWith", ]; @@ -23065,7 +23059,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ConcatWithSeparator" => Ok(ScalarFunction::ConcatWithSeparator), "InitCap" => Ok(ScalarFunction::InitCap), "Random" => Ok(ScalarFunction::Random), - "Translate" => Ok(ScalarFunction::Translate), "Coalesce" => Ok(ScalarFunction::Coalesce), "Power" => Ok(ScalarFunction::Power), "Atan2" => Ok(ScalarFunction::Atan2), @@ -23084,8 +23077,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Cot" => Ok(ScalarFunction::Cot), "Nanvl" => Ok(ScalarFunction::Nanvl), "Iszero" => Ok(ScalarFunction::Iszero), - "SubstrIndex" => Ok(ScalarFunction::SubstrIndex), - "FindInSet" => Ok(ScalarFunction::FindInSet), "EndsWith" => Ok(ScalarFunction::EndsWith), _ => Err(serde::de::Error::unknown_variant(value, FIELDS)), } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index babeccec595f..34610c0f6477 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2900,7 +2900,7 @@ pub enum ScalarFunction { /// 57 was ToTimestampMicros /// 58 was ToTimestampSeconds /// 59 was Now - Translate = 60, + /// 60 was Translate /// Trim = 61; /// Upper = 62; Coalesce = 63, @@ -2964,8 +2964,8 @@ pub enum ScalarFunction { /// 123 is ArrayExcept /// 124 was ArrayPopFront /// 125 was Levenshtein - SubstrIndex = 126, - FindInSet = 127, + /// 126 was SubstrIndex + /// 127 was FindInSet /// 128 was ArraySort /// 129 was ArrayDistinct /// 130 was ArrayResize @@ -3005,7 +3005,6 @@ impl ScalarFunction { ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator", ScalarFunction::InitCap => "InitCap", ScalarFunction::Random => "Random", - ScalarFunction::Translate => "Translate", ScalarFunction::Coalesce => "Coalesce", ScalarFunction::Power => "Power", ScalarFunction::Atan2 => "Atan2", @@ -3024,8 +3023,6 @@ impl ScalarFunction { ScalarFunction::Cot => "Cot", ScalarFunction::Nanvl => "Nanvl", ScalarFunction::Iszero => "Iszero", - ScalarFunction::SubstrIndex => "SubstrIndex", - ScalarFunction::FindInSet => "FindInSet", ScalarFunction::EndsWith => "EndsWith", } } @@ -3051,7 +3048,6 @@ impl ScalarFunction { "ConcatWithSeparator" => Some(Self::ConcatWithSeparator), "InitCap" => Some(Self::InitCap), "Random" => Some(Self::Random), - "Translate" => Some(Self::Translate), "Coalesce" => Some(Self::Coalesce), "Power" => Some(Self::Power), "Atan2" => Some(Self::Atan2), @@ -3070,8 +3066,6 @@ impl ScalarFunction { "Cot" => Some(Self::Cot), "Nanvl" => Some(Self::Nanvl), "Iszero" => Some(Self::Iszero), - "SubstrIndex" => Some(Self::SubstrIndex), - "FindInSet" => Some(Self::FindInSet), "EndsWith" => Some(Self::EndsWith), _ => None, } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index ff3d6773d512..46732e596629 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -40,12 +40,11 @@ use datafusion_expr::{ acosh, asinh, atan, atan2, atanh, cbrt, ceil, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, degrees, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, - factorial, find_in_set, floor, gcd, initcap, iszero, lcm, ln, log, log10, log2, + factorial, floor, gcd, initcap, iszero, lcm, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, - nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, substr_index, - translate, trunc, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, - BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField, - GroupingSet, + nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, trunc, + AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, + Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -455,15 +454,12 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::EndsWith => Self::EndsWith, ScalarFunction::InitCap => Self::InitCap, ScalarFunction::Random => Self::Random, - ScalarFunction::Translate => Self::Translate, ScalarFunction::Coalesce => Self::Coalesce, ScalarFunction::Pi => Self::Pi, ScalarFunction::Power => Self::Power, ScalarFunction::Atan2 => Self::Atan2, ScalarFunction::Nanvl => Self::Nanvl, ScalarFunction::Iszero => Self::Iszero, - ScalarFunction::SubstrIndex => Self::SubstrIndex, - ScalarFunction::FindInSet => Self::FindInSet, } } } @@ -1387,11 +1383,6 @@ pub fn parse_expr( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, )), - ScalarFunction::Translate => Ok(translate( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - parse_expr(&args[2], registry, codec)?, - )), ScalarFunction::Coalesce => { Ok(coalesce(parse_exprs(args, registry, codec)?)) } @@ -1416,15 +1407,6 @@ pub fn parse_expr( ScalarFunction::Iszero => { Ok(iszero(parse_expr(&args[0], registry, codec)?)) } - ScalarFunction::SubstrIndex => Ok(substr_index( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - parse_expr(&args[2], registry, codec)?, - )), - ScalarFunction::FindInSet => Ok(find_in_set( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), } } ExprType::ScalarUdfExpr(protobuf::ScalarUdfExprNode { diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 89d49c5658a2..d7ef3d0d3f40 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1446,15 +1446,12 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::EndsWith => Self::EndsWith, BuiltinScalarFunction::InitCap => Self::InitCap, BuiltinScalarFunction::Random => Self::Random, - BuiltinScalarFunction::Translate => Self::Translate, BuiltinScalarFunction::Coalesce => Self::Coalesce, BuiltinScalarFunction::Pi => Self::Pi, BuiltinScalarFunction::Power => Self::Power, BuiltinScalarFunction::Atan2 => Self::Atan2, BuiltinScalarFunction::Nanvl => Self::Nanvl, BuiltinScalarFunction::Iszero => Self::Iszero, - BuiltinScalarFunction::SubstrIndex => Self::SubstrIndex, - BuiltinScalarFunction::FindInSet => Self::FindInSet, }; Ok(scalar_function)