diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 2bbe89f24bbe..3be92221d3ee 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1268,6 +1268,7 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-expr", + "hashbrown 0.14.3", "hex", "itertools", "log", @@ -1342,7 +1343,6 @@ dependencies = [ "rand", "regex", "sha2", - "unicode-segmentation", ] [[package]] diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index fbbe047880b1..1206871aceda 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -67,8 +67,6 @@ regex_expressions = [ ] serde = ["arrow-schema/serde"] unicode_expressions = [ - "datafusion-physical-expr/unicode_expressions", - "datafusion-optimizer/unicode_expressions", "datafusion-sql/unicode_expressions", "datafusion-functions/unicode_expressions", ] diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index f07e84027552..f8d16f465091 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -107,12 +107,6 @@ pub enum BuiltinScalarFunction { InitCap, /// random Random, - /// translate - Translate, - /// substr_index - SubstrIndex, - /// find_in_set - FindInSet, } /// Maps the sql function name to `BuiltinScalarFunction` @@ -198,9 +192,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::EndsWith => Volatility::Immutable, BuiltinScalarFunction::InitCap => Volatility::Immutable, BuiltinScalarFunction::Radians => Volatility::Immutable, - BuiltinScalarFunction::Translate => Volatility::Immutable, - BuiltinScalarFunction::SubstrIndex => Volatility::Immutable, - BuiltinScalarFunction::FindInSet => Volatility::Immutable, // Volatile builtin functions BuiltinScalarFunction::Random => Volatility::Volatile, @@ -237,15 +228,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Pi => Ok(Float64), BuiltinScalarFunction::Random => Ok(Float64), BuiltinScalarFunction::EndsWith => Ok(Boolean), - BuiltinScalarFunction::SubstrIndex => { - utf8_to_str_type(&input_expr_types[0], "substr_index") - } - BuiltinScalarFunction::FindInSet => { - utf8_to_int_type(&input_expr_types[0], "find_in_set") - } - BuiltinScalarFunction::Translate => { - utf8_to_str_type(&input_expr_types[0], "translate") - } BuiltinScalarFunction::Factorial | BuiltinScalarFunction::Gcd @@ -326,22 +308,6 @@ impl BuiltinScalarFunction { ], self.volatility(), ), - - BuiltinScalarFunction::SubstrIndex => Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64]), - ], - self.volatility(), - ), - BuiltinScalarFunction::FindInSet => Signature::one_of( - vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], - self.volatility(), - ), - - BuiltinScalarFunction::Translate => { - Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility()) - } BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()), BuiltinScalarFunction::Random => Signature::exact(vec![], self.volatility()), BuiltinScalarFunction::Power => Signature::one_of( @@ -492,9 +458,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"], BuiltinScalarFunction::EndsWith => &["ends_with"], BuiltinScalarFunction::InitCap => &["initcap"], - BuiltinScalarFunction::Translate => &["translate"], - BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"], - BuiltinScalarFunction::FindInSet => &["find_in_set"], } } } @@ -559,9 +522,6 @@ macro_rules! get_optimal_return_type { // `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size. get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8); -// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size. -get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32); - #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index e216e4e86dc1..ab5628fece12 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -576,7 +576,6 @@ scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`"); scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase"); scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`"); -scalar_expr!(Translate, translate, string from to, "replaces the characters in `from` with the counterpart in `to`"); nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which evaluates to the value of the first [Expr] which is not NULL"); //there is a func concat_ws before, so use concat_ws_expr as name.c nary_scalar_expr!( @@ -593,9 +592,6 @@ scalar_expr!( "returns true if a given number is +0.0 or -0.0 otherwise returns false" ); -scalar_expr!(SubstrIndex, substr_index, string delimiter count, "Returns the substring from str before count occurrences of the delimiter"); -scalar_expr!(FindInSet, find_in_set, str strlist, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings"); - /// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression. pub fn case(expr: Expr) -> CaseBuilder { CaseBuilder::new(Some(Box::new(expr)), vec![], vec![], None) @@ -1006,8 +1002,5 @@ mod test { test_scalar_expr!(Lcm, lcm, arg_1, arg_2); test_scalar_expr!(InitCap, initcap, string); test_scalar_expr!(EndsWith, ends_with, string, characters); - test_scalar_expr!(Translate, translate, string, from, to); - test_scalar_expr!(SubstrIndex, substr_index, string, delimiter, count); - test_scalar_expr!(FindInSet, find_in_set, string, stringlist); } } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 3ae3061012e0..b9fae321a49c 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -54,7 +54,7 @@ regex_expressions = ["regex"] # enable string functions string_expressions = ["uuid"] # enable unicode functions -unicode_expressions = ["unicode-segmentation"] +unicode_expressions = ["hashbrown", "unicode-segmentation"] [lib] name = "datafusion_functions" @@ -72,6 +72,7 @@ datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-physical-expr = { workspace = true, default-features = true } +hashbrown = { version = "0.14", features = ["raw"], optional = true } hex = { version = "0.4", optional = true } itertools = { workspace = true } log = { workspace = true } diff --git a/datafusion/functions/src/unicode/find_in_set.rs b/datafusion/functions/src/unicode/find_in_set.rs new file mode 100644 index 000000000000..7e0306d49454 --- /dev/null +++ b/datafusion/functions/src/unicode/find_in_set.rs @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ + ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray, +}; +use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; + +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_int_type}; + +#[derive(Debug)] +pub(super) struct FindInSetFunc { + signature: Signature, +} + +impl FindInSetFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for FindInSetFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "find_in_set" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_int_type(&arg_types[0], "find_in_set") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => { + make_scalar_function(find_in_set::, vec![])(args) + } + DataType::LargeUtf8 => { + make_scalar_function(find_in_set::, vec![])(args) + } + other => { + exec_err!("Unsupported data type {other:?} for function find_in_set") + } + } + } +} + +///Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings +///A string list is a string composed of substrings separated by , characters. +pub fn find_in_set(args: &[ArrayRef]) -> Result +where + T::Native: OffsetSizeTrait, +{ + if args.len() != 2 { + return exec_err!( + "find_in_set was called with {} arguments. It requires 2.", + args.len() + ); + } + + let str_array: &GenericStringArray = + as_generic_string_array::(&args[0])?; + let str_list_array: &GenericStringArray = + as_generic_string_array::(&args[1])?; + + let result = str_array + .iter() + .zip(str_list_array.iter()) + .map(|(string, str_list)| match (string, str_list) { + (Some(string), Some(str_list)) => { + let mut res = 0; + let str_set: Vec<&str> = str_list.split(',').collect(); + for (idx, str) in str_set.iter().enumerate() { + if str == &string { + res = idx + 1; + break; + } + } + T::Native::from_usize(res) + } + _ => None, + }) + .collect::>(); + Ok(Arc::new(result) as ArrayRef) +} diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs index ddab0d1e27c9..eba4cd5048eb 100644 --- a/datafusion/functions/src/unicode/mod.rs +++ b/datafusion/functions/src/unicode/mod.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use datafusion_expr::ScalarUDF; mod character_length; +mod find_in_set; mod left; mod lpad; mod reverse; @@ -29,6 +30,8 @@ mod right; mod rpad; mod strpos; mod substr; +mod substrindex; +mod translate; // create UDFs make_udf_function!( @@ -36,6 +39,7 @@ make_udf_function!( CHARACTER_LENGTH, character_length ); +make_udf_function!(find_in_set::FindInSetFunc, FIND_IN_SET, find_in_set); make_udf_function!(left::LeftFunc, LEFT, left); make_udf_function!(lpad::LPadFunc, LPAD, lpad); make_udf_function!(right::RightFunc, RIGHT, right); @@ -43,6 +47,8 @@ make_udf_function!(reverse::ReverseFunc, REVERSE, reverse); make_udf_function!(rpad::RPadFunc, RPAD, rpad); make_udf_function!(strpos::StrposFunc, STRPOS, strpos); make_udf_function!(substr::SubstrFunc, SUBSTR, substr); +make_udf_function!(substrindex::SubstrIndexFunc, SUBSTR_INDEX, substr_index); +make_udf_function!(translate::TranslateFunc, TRANSLATE, translate); pub mod expr_fn { use datafusion_expr::Expr; @@ -57,6 +63,11 @@ pub mod expr_fn { super::character_length().call(vec![string]) } + #[doc = "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings"] + pub fn find_in_set(string: Expr, strlist: Expr) -> Expr { + super::find_in_set().call(vec![string, strlist]) + } + #[doc = "finds the position from where the `substring` matches the `string`"] pub fn instr(string: Expr, substring: Expr) -> Expr { strpos(string, substring) @@ -111,12 +122,23 @@ pub mod expr_fn { pub fn substring(string: Expr, position: Expr, length: Expr) -> Expr { super::substr().call(vec![string, position, length]) } + + #[doc = "Returns the substring from str before count occurrences of the delimiter"] + pub fn substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr { + super::substr_index().call(vec![string, delimiter, count]) + } + + #[doc = "replaces the characters in `from` with the counterpart in `to`"] + pub fn translate(string: Expr, from: Expr, to: Expr) -> Expr { + super::translate().call(vec![string, from, to]) + } } /// Return a list of all functions in this package pub fn functions() -> Vec> { vec![ character_length(), + find_in_set(), left(), lpad(), reverse(), @@ -124,5 +146,7 @@ pub fn functions() -> Vec> { rpad(), strpos(), substr(), + substr_index(), + translate(), ] } diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs new file mode 100644 index 000000000000..77e8116fff4c --- /dev/null +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct SubstrIndexFunc { + signature: Signature, + aliases: Vec, +} + +impl SubstrIndexFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Utf8, Int64]), + Exact(vec![LargeUtf8, LargeUtf8, Int64]), + ], + Volatility::Immutable, + ), + aliases: vec![String::from("substring_index")], + } + } +} + +impl ScalarUDFImpl for SubstrIndexFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "substr_index" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "substr_index") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(substr_index::, vec![])(args), + DataType::LargeUtf8 => { + make_scalar_function(substr_index::, vec![])(args) + } + other => { + exec_err!("Unsupported data type {other:?} for function substr_index") + } + } + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +/// Returns the substring from str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. +/// SUBSTRING_INDEX('www.apache.org', '.', 1) = www +/// SUBSTRING_INDEX('www.apache.org', '.', 2) = www.apache +/// SUBSTRING_INDEX('www.apache.org', '.', -2) = apache.org +/// SUBSTRING_INDEX('www.apache.org', '.', -1) = org +pub fn substr_index(args: &[ArrayRef]) -> Result { + if args.len() != 3 { + return exec_err!( + "substr_index was called with {} arguments. It requires 3.", + args.len() + ); + } + + let string_array = as_generic_string_array::(&args[0])?; + let delimiter_array = as_generic_string_array::(&args[1])?; + let count_array = as_int64_array(&args[2])?; + + let result = string_array + .iter() + .zip(delimiter_array.iter()) + .zip(count_array.iter()) + .map(|((string, delimiter), n)| match (string, delimiter, n) { + (Some(string), Some(delimiter), Some(n)) => { + // In MySQL, these cases will return an empty string. + if n == 0 || string.is_empty() || delimiter.is_empty() { + return Some(String::new()); + } + + let splitted: Box> = if n > 0 { + Box::new(string.split(delimiter)) + } else { + Box::new(string.rsplit(delimiter)) + }; + let occurrences = usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX); + // The length of the substring covered by substr_index. + let length = splitted + .take(occurrences) // at least 1 element, since n != 0 + .map(|s| s.len() + delimiter.len()) + .sum::() + - delimiter.len(); + if n > 0 { + Some(string[..length].to_owned()) + } else { + Some(string[string.len() - length..].to_owned()) + } + } + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs new file mode 100644 index 000000000000..bc1836700304 --- /dev/null +++ b/datafusion/functions/src/unicode/translate.rs @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; +use hashbrown::HashMap; +use unicode_segmentation::UnicodeSegmentation; + +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +use crate::utils::{make_scalar_function, utf8_to_str_type}; + +#[derive(Debug)] +pub(super) struct TranslateFunc { + signature: Signature, +} + +impl TranslateFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Utf8, Utf8])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for TranslateFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "translate" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "translate") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(translate::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(translate::, vec![])(args), + other => { + exec_err!("Unsupported data type {other:?} for function translate") + } + } + } +} + +/// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. +/// translate('12345', '143', 'ax') = 'a2x5' +fn translate(args: &[ArrayRef]) -> Result { + let string_array = as_generic_string_array::(&args[0])?; + let from_array = as_generic_string_array::(&args[1])?; + let to_array = as_generic_string_array::(&args[2])?; + + let result = string_array + .iter() + .zip(from_array.iter()) + .zip(to_array.iter()) + .map(|((string, from), to)| match (string, from, to) { + (Some(string), Some(from), Some(to)) => { + // create a hashmap of [char, index] to change from O(n) to O(1) for from list + let from_map: HashMap<&str, usize> = from + .graphemes(true) + .collect::>() + .iter() + .enumerate() + .map(|(index, c)| (c.to_owned(), index)) + .collect(); + + let to = to.graphemes(true).collect::>(); + + Some( + string + .graphemes(true) + .collect::>() + .iter() + .flat_map(|c| match from_map.get(*c) { + Some(n) => to.get(*n).copied(), + None => Some(*c), + }) + .collect::>() + .concat(), + ) + } + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +#[cfg(test)] +mod tests { + use arrow::array::{Array, StringArray}; + use arrow::datatypes::DataType::Utf8; + + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + use crate::unicode::translate::TranslateFunc; + use crate::utils::test::test_function; + + #[test] + fn test_functions() -> Result<()> { + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("12345")), + ColumnarValue::Scalar(ScalarValue::from("143")), + ColumnarValue::Scalar(ScalarValue::from("ax")) + ], + Ok(Some("a2x5")), + &str, + Utf8, + StringArray + ); + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from("143")), + ColumnarValue::Scalar(ScalarValue::from("ax")) + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("12345")), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), + ColumnarValue::Scalar(ScalarValue::from("ax")) + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("12345")), + ColumnarValue::Scalar(ScalarValue::from("143")), + ColumnarValue::Scalar(ScalarValue::Utf8(None)) + ], + Ok(None), + &str, + Utf8, + StringArray + ); + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("é2íñ5")), + ColumnarValue::Scalar(ScalarValue::from("éñí")), + ColumnarValue::Scalar(ScalarValue::from("óü")), + ], + Ok(Some("ó2ü5")), + &str, + Utf8, + StringArray + ); + #[cfg(not(feature = "unicode_expressions"))] + test_function!( + TranslateFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::from("12345")), + ColumnarValue::Scalar(ScalarValue::from("143")), + ColumnarValue::Scalar(ScalarValue::from("ax")), + ], + internal_err!( + "function translate requires compilation with feature flag: unicode_expressions." + ), + &str, + Utf8, + StringArray + ); + + Ok(()) + } +} diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 861715b351a6..1d64a22f1463 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -34,9 +34,8 @@ path = "src/lib.rs" [features] crypto_expressions = ["datafusion-physical-expr/crypto_expressions"] -default = ["unicode_expressions", "crypto_expressions", "regex_expressions"] +default = ["crypto_expressions", "regex_expressions"] regex_expressions = ["datafusion-physical-expr/regex_expressions"] -unicode_expressions = ["datafusion-physical-expr/unicode_expressions"] [dependencies] arrow = { workspace = true } diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 24b831e7c575..baca00bea724 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -37,12 +37,10 @@ crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] default = [ "crypto_expressions", "regex_expressions", - "unicode_expressions", "encoding_expressions", ] encoding_expressions = ["base64", "hex"] regex_expressions = ["regex"] -unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = { version = "0.8", default-features = false, features = [ @@ -73,7 +71,6 @@ petgraph = "0.6.2" rand = { workspace = true } regex = { version = "1.8", optional = true } sha2 = { version = "^0.10.1", optional = true } -unicode-segmentation = { version = "^1.7.1", optional = true } [dev-dependencies] criterion = "0.5" diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 515511b15fbb..5b9b46c3991b 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -35,7 +35,7 @@ use std::sync::Arc; use arrow::{ array::ArrayRef, - datatypes::{DataType, Int32Type, Int64Type, Schema}, + datatypes::{DataType, Schema}, }; use arrow_array::Array; @@ -85,26 +85,6 @@ pub fn create_physical_expr( ))) } -#[cfg(feature = "unicode_expressions")] -macro_rules! invoke_if_unicode_expressions_feature_flag { - ($FUNC:ident, $T:tt, $NAME:expr) => {{ - use crate::unicode_expressions; - unicode_expressions::$FUNC::<$T> - }}; -} - -#[cfg(not(feature = "unicode_expressions"))] -macro_rules! invoke_if_unicode_expressions_feature_flag { - ($FUNC:ident, $T:tt, $NAME:expr) => { - |_: &[ArrayRef]| -> Result { - internal_err!( - "function {} requires compilation with feature flag: unicode_expressions.", - $NAME - ) - } - }; -} - #[derive(Debug, Clone, Copy)] pub enum Hint { /// Indicates the argument needs to be padded if it is scalar @@ -278,71 +258,6 @@ pub fn create_physical_fun( exec_err!("Unsupported data type {other:?} for function ends_with") } }), - BuiltinScalarFunction::Translate => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - translate, - i32, - "translate" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - translate, - i64, - "translate" - ); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function translate") - } - }), - BuiltinScalarFunction::SubstrIndex => { - Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - substr_index, - i32, - "substr_index" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - substr_index, - i64, - "substr_index" - ); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function substr_index") - } - }) - } - BuiltinScalarFunction::FindInSet => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - find_in_set, - Int32Type, - "find_in_set" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - find_in_set, - Int64Type, - "find_in_set" - ); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function find_in_set") - } - }), }) } @@ -631,66 +546,7 @@ mod tests { Boolean, BooleanArray ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit("12345"), lit("143"), lit("ax"),], - Ok(Some("a2x5")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit(ScalarValue::Utf8(None)), lit("143"), lit("ax"),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit("12345"), lit(ScalarValue::Utf8(None)), lit("ax"),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit("12345"), lit("143"), lit(ScalarValue::Utf8(None)),], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "unicode_expressions")] - test_function!( - Translate, - &[lit("é2íñ5"), lit("éñí"), lit("óü"),], - Ok(Some("ó2ü5")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "unicode_expressions"))] - test_function!( - Translate, - &[ - lit("12345"), - lit("143"), - lit("ax"), - ], - internal_err!( - "function translate requires compilation with feature flag: unicode_expressions." - ), - &str, - Utf8, - StringArray - ); + Ok(()) } diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 1dead099540b..7819d5116160 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -33,8 +33,6 @@ pub mod sort_properties; pub mod string_expressions; pub mod tree_node; pub mod udf; -#[cfg(feature = "unicode_expressions")] -pub mod unicode_expressions; pub mod utils; pub mod window; diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs deleted file mode 100644 index ecbd1ea320d4..000000000000 --- a/datafusion/physical-expr/src/unicode_expressions.rs +++ /dev/null @@ -1,172 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Some of these functions reference the Postgres documentation -// or implementation to ensure compatibility and are subject to -// the Postgres license. - -//! Unicode expressions - -use std::sync::Arc; - -use arrow::{ - array::{ArrayRef, GenericStringArray, OffsetSizeTrait, PrimitiveArray}, - datatypes::{ArrowNativeType, ArrowPrimitiveType}, -}; -use hashbrown::HashMap; -use unicode_segmentation::UnicodeSegmentation; - -use datafusion_common::{ - cast::{as_generic_string_array, as_int64_array}, - exec_err, Result, -}; - -/// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. -/// translate('12345', '143', 'ax') = 'a2x5' -pub fn translate(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let from_array = as_generic_string_array::(&args[1])?; - let to_array = as_generic_string_array::(&args[2])?; - - let result = string_array - .iter() - .zip(from_array.iter()) - .zip(to_array.iter()) - .map(|((string, from), to)| match (string, from, to) { - (Some(string), Some(from), Some(to)) => { - // create a hashmap of [char, index] to change from O(n) to O(1) for from list - let from_map: HashMap<&str, usize> = from - .graphemes(true) - .collect::>() - .iter() - .enumerate() - .map(|(index, c)| (c.to_owned(), index)) - .collect(); - - let to = to.graphemes(true).collect::>(); - - Some( - string - .graphemes(true) - .collect::>() - .iter() - .flat_map(|c| match from_map.get(*c) { - Some(n) => to.get(*n).copied(), - None => Some(*c), - }) - .collect::>() - .concat(), - ) - } - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// Returns the substring from str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. -/// SUBSTRING_INDEX('www.apache.org', '.', 1) = www -/// SUBSTRING_INDEX('www.apache.org', '.', 2) = www.apache -/// SUBSTRING_INDEX('www.apache.org', '.', -2) = apache.org -/// SUBSTRING_INDEX('www.apache.org', '.', -1) = org -pub fn substr_index(args: &[ArrayRef]) -> Result { - if args.len() != 3 { - return exec_err!( - "substr_index was called with {} arguments. It requires 3.", - args.len() - ); - } - - let string_array = as_generic_string_array::(&args[0])?; - let delimiter_array = as_generic_string_array::(&args[1])?; - let count_array = as_int64_array(&args[2])?; - - let result = string_array - .iter() - .zip(delimiter_array.iter()) - .zip(count_array.iter()) - .map(|((string, delimiter), n)| match (string, delimiter, n) { - (Some(string), Some(delimiter), Some(n)) => { - // In MySQL, these cases will return an empty string. - if n == 0 || string.is_empty() || delimiter.is_empty() { - return Some(String::new()); - } - - let splitted: Box> = if n > 0 { - Box::new(string.split(delimiter)) - } else { - Box::new(string.rsplit(delimiter)) - }; - let occurrences = usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX); - // The length of the substring covered by substr_index. - let length = splitted - .take(occurrences) // at least 1 element, since n != 0 - .map(|s| s.len() + delimiter.len()) - .sum::() - - delimiter.len(); - if n > 0 { - Some(string[..length].to_owned()) - } else { - Some(string[string.len() - length..].to_owned()) - } - } - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -///Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings -///A string list is a string composed of substrings separated by , characters. -pub fn find_in_set(args: &[ArrayRef]) -> Result -where - T::Native: OffsetSizeTrait, -{ - if args.len() != 2 { - return exec_err!( - "find_in_set was called with {} arguments. It requires 2.", - args.len() - ); - } - - let str_array: &GenericStringArray = - as_generic_string_array::(&args[0])?; - let str_list_array: &GenericStringArray = - as_generic_string_array::(&args[1])?; - - let result = str_array - .iter() - .zip(str_list_array.iter()) - .map(|(string, str_list)| match (string, str_list) { - (Some(string), Some(str_list)) => { - let mut res = 0; - let str_set: Vec<&str> = str_list.split(',').collect(); - for (idx, str) in str_set.iter().enumerate() { - if str == &string { - res = idx + 1; - break; - } - } - T::Native::from_usize(res) - } - _ => None, - }) - .collect::>(); - Ok(Arc::new(result) as ArrayRef) -} diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 81451e40aa50..b756e0575d71 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -601,7 +601,7 @@ enum ScalarFunction { // 57 was ToTimestampMicros // 58 was ToTimestampSeconds // 59 was Now - Translate = 60; + // 60 was Translate // Trim = 61; // Upper = 62; Coalesce = 63; @@ -665,8 +665,8 @@ enum ScalarFunction { // 123 is ArrayExcept // 124 was ArrayPopFront // 125 was Levenshtein - SubstrIndex = 126; - FindInSet = 127; + // 126 was SubstrIndex + // 127 was FindInSet // 128 was ArraySort // 129 was ArrayDistinct // 130 was ArrayResize diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 2949ab807e04..3c3d60300786 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22929,7 +22929,6 @@ impl serde::Serialize for ScalarFunction { Self::ConcatWithSeparator => "ConcatWithSeparator", Self::InitCap => "InitCap", Self::Random => "Random", - Self::Translate => "Translate", Self::Coalesce => "Coalesce", Self::Power => "Power", Self::Atan2 => "Atan2", @@ -22948,8 +22947,6 @@ impl serde::Serialize for ScalarFunction { Self::Cot => "Cot", Self::Nanvl => "Nanvl", Self::Iszero => "Iszero", - Self::SubstrIndex => "SubstrIndex", - Self::FindInSet => "FindInSet", Self::EndsWith => "EndsWith", }; serializer.serialize_str(variant) @@ -22978,7 +22975,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ConcatWithSeparator", "InitCap", "Random", - "Translate", "Coalesce", "Power", "Atan2", @@ -22997,8 +22993,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Cot", "Nanvl", "Iszero", - "SubstrIndex", - "FindInSet", "EndsWith", ]; @@ -23056,7 +23050,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ConcatWithSeparator" => Ok(ScalarFunction::ConcatWithSeparator), "InitCap" => Ok(ScalarFunction::InitCap), "Random" => Ok(ScalarFunction::Random), - "Translate" => Ok(ScalarFunction::Translate), "Coalesce" => Ok(ScalarFunction::Coalesce), "Power" => Ok(ScalarFunction::Power), "Atan2" => Ok(ScalarFunction::Atan2), @@ -23075,8 +23068,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Cot" => Ok(ScalarFunction::Cot), "Nanvl" => Ok(ScalarFunction::Nanvl), "Iszero" => Ok(ScalarFunction::Iszero), - "SubstrIndex" => Ok(ScalarFunction::SubstrIndex), - "FindInSet" => Ok(ScalarFunction::FindInSet), "EndsWith" => Ok(ScalarFunction::EndsWith), _ => Err(serde::de::Error::unknown_variant(value, FIELDS)), } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 6f7e8a9789a6..9860587d3eca 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2900,7 +2900,7 @@ pub enum ScalarFunction { /// 57 was ToTimestampMicros /// 58 was ToTimestampSeconds /// 59 was Now - Translate = 60, + /// 60 was Translate /// Trim = 61; /// Upper = 62; Coalesce = 63, @@ -2964,8 +2964,8 @@ pub enum ScalarFunction { /// 123 is ArrayExcept /// 124 was ArrayPopFront /// 125 was Levenshtein - SubstrIndex = 126, - FindInSet = 127, + /// 126 was SubstrIndex + /// 127 was FindInSet /// 128 was ArraySort /// 129 was ArrayDistinct /// 130 was ArrayResize @@ -3002,7 +3002,6 @@ impl ScalarFunction { ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator", ScalarFunction::InitCap => "InitCap", ScalarFunction::Random => "Random", - ScalarFunction::Translate => "Translate", ScalarFunction::Coalesce => "Coalesce", ScalarFunction::Power => "Power", ScalarFunction::Atan2 => "Atan2", @@ -3021,8 +3020,6 @@ impl ScalarFunction { ScalarFunction::Cot => "Cot", ScalarFunction::Nanvl => "Nanvl", ScalarFunction::Iszero => "Iszero", - ScalarFunction::SubstrIndex => "SubstrIndex", - ScalarFunction::FindInSet => "FindInSet", ScalarFunction::EndsWith => "EndsWith", } } @@ -3045,7 +3042,6 @@ impl ScalarFunction { "ConcatWithSeparator" => Some(Self::ConcatWithSeparator), "InitCap" => Some(Self::InitCap), "Random" => Some(Self::Random), - "Translate" => Some(Self::Translate), "Coalesce" => Some(Self::Coalesce), "Power" => Some(Self::Power), "Atan2" => Some(Self::Atan2), @@ -3064,8 +3060,6 @@ impl ScalarFunction { "Cot" => Some(Self::Cot), "Nanvl" => Some(Self::Nanvl), "Iszero" => Some(Self::Iszero), - "SubstrIndex" => Some(Self::SubstrIndex), - "FindInSet" => Some(Self::FindInSet), "EndsWith" => Some(Self::EndsWith), _ => None, } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index d372cb428c73..c068cfd46c1f 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -40,12 +40,11 @@ use datafusion_expr::{ acosh, asinh, atan, atan2, atanh, cbrt, ceil, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, degrees, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, - factorial, find_in_set, floor, gcd, initcap, iszero, lcm, log, + factorial, floor, gcd, initcap, iszero, lcm, log, logical_plan::{PlanType, StringifiedPlan}, - nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, substr_index, - translate, trunc, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, - BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField, - GroupingSet, + nanvl, pi, power, radians, random, round, signum, sin, sinh, sqrt, trunc, + AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, + Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -452,15 +451,12 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::EndsWith => Self::EndsWith, ScalarFunction::InitCap => Self::InitCap, ScalarFunction::Random => Self::Random, - ScalarFunction::Translate => Self::Translate, ScalarFunction::Coalesce => Self::Coalesce, ScalarFunction::Pi => Self::Pi, ScalarFunction::Power => Self::Power, ScalarFunction::Atan2 => Self::Atan2, ScalarFunction::Nanvl => Self::Nanvl, ScalarFunction::Iszero => Self::Iszero, - ScalarFunction::SubstrIndex => Self::SubstrIndex, - ScalarFunction::FindInSet => Self::FindInSet, } } } @@ -1379,11 +1375,6 @@ pub fn parse_expr( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, )), - ScalarFunction::Translate => Ok(translate( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - parse_expr(&args[2], registry, codec)?, - )), ScalarFunction::Coalesce => { Ok(coalesce(parse_exprs(args, registry, codec)?)) } @@ -1408,15 +1399,6 @@ pub fn parse_expr( ScalarFunction::Iszero => { Ok(iszero(parse_expr(&args[0], registry, codec)?)) } - ScalarFunction::SubstrIndex => Ok(substr_index( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - parse_expr(&args[2], registry, codec)?, - )), - ScalarFunction::FindInSet => Ok(find_in_set( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), } } ExprType::ScalarUdfExpr(protobuf::ScalarUdfExprNode { diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 1e4e85c51f70..9d433bb6ff97 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1443,15 +1443,12 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::EndsWith => Self::EndsWith, BuiltinScalarFunction::InitCap => Self::InitCap, BuiltinScalarFunction::Random => Self::Random, - BuiltinScalarFunction::Translate => Self::Translate, BuiltinScalarFunction::Coalesce => Self::Coalesce, BuiltinScalarFunction::Pi => Self::Pi, BuiltinScalarFunction::Power => Self::Power, BuiltinScalarFunction::Atan2 => Self::Atan2, BuiltinScalarFunction::Nanvl => Self::Nanvl, BuiltinScalarFunction::Iszero => Self::Iszero, - BuiltinScalarFunction::SubstrIndex => Self::SubstrIndex, - BuiltinScalarFunction::FindInSet => Self::FindInSet, }; Ok(scalar_function)