diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 51cccf60a1e4..2f1d95d639d4 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1273,6 +1273,7 @@ dependencies = [ "md-5", "regex", "sha2", + "uuid", ] [[package]] @@ -1340,7 +1341,6 @@ dependencies = [ "regex", "sha2", "unicode-segmentation", - "uuid", ] [[package]] @@ -1370,7 +1370,6 @@ dependencies = [ "pin-project-lite", "rand", "tokio", - "uuid", ] [[package]] diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 2b6e869ec500..4966143782ba 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -76,4 +76,4 @@ tempfile = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } tonic = "0.11" url = { workspace = true } -uuid = "1.2" +uuid = "1.7" diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index a3570834fdb7..1e5c0d748e3d 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -122,7 +122,7 @@ tempfile = { workspace = true } tokio = { workspace = true } tokio-util = { version = "0.7.4", features = ["io"], optional = true } url = { workspace = true } -uuid = { version = "1.0", features = ["v4"] } +uuid = { version = "1.7", features = ["v4"] } xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { version = "0.13", optional = true, default-features = false } diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index d0ec1326c49e..1904d58cfc92 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -141,12 +141,6 @@ pub enum BuiltinScalarFunction { Substr, /// translate Translate, - /// uuid - Uuid, - /// overlay - OverLay, - /// levenshtein - Levenshtein, /// substr_index SubstrIndex, /// find_in_set @@ -253,14 +247,11 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Strpos => Volatility::Immutable, BuiltinScalarFunction::Substr => Volatility::Immutable, BuiltinScalarFunction::Translate => Volatility::Immutable, - BuiltinScalarFunction::OverLay => Volatility::Immutable, - BuiltinScalarFunction::Levenshtein => Volatility::Immutable, BuiltinScalarFunction::SubstrIndex => Volatility::Immutable, BuiltinScalarFunction::FindInSet => Volatility::Immutable, // Volatile builtin functions BuiltinScalarFunction::Random => Volatility::Volatile, - BuiltinScalarFunction::Uuid => Volatility::Volatile, } } @@ -302,7 +293,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Lpad => utf8_to_str_type(&input_expr_types[0], "lpad"), BuiltinScalarFunction::Pi => Ok(Float64), BuiltinScalarFunction::Random => Ok(Float64), - BuiltinScalarFunction::Uuid => Ok(Utf8), BuiltinScalarFunction::Repeat => { utf8_to_str_type(&input_expr_types[0], "repeat") } @@ -362,14 +352,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Iszero => Ok(Boolean), - BuiltinScalarFunction::OverLay => { - utf8_to_str_type(&input_expr_types[0], "overlay") - } - - BuiltinScalarFunction::Levenshtein => { - utf8_to_int_type(&input_expr_types[0], "levenshtein") - } - BuiltinScalarFunction::Atan | BuiltinScalarFunction::Acosh | BuiltinScalarFunction::Asinh @@ -490,7 +472,6 @@ impl BuiltinScalarFunction { } BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()), BuiltinScalarFunction::Random => Signature::exact(vec![], self.volatility()), - BuiltinScalarFunction::Uuid => Signature::exact(vec![], self.volatility()), BuiltinScalarFunction::Power => Signature::one_of( vec![Exact(vec![Int64, Int64]), Exact(vec![Float64, Float64])], self.volatility(), @@ -536,19 +517,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Gcd | BuiltinScalarFunction::Lcm => { Signature::uniform(2, vec![Int64], self.volatility()) } - BuiltinScalarFunction::OverLay => Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8, Int64, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]), - Exact(vec![Utf8, Utf8, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64]), - ], - self.volatility(), - ), - BuiltinScalarFunction::Levenshtein => Signature::one_of( - vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], - self.volatility(), - ), BuiltinScalarFunction::Atan | BuiltinScalarFunction::Acosh | BuiltinScalarFunction::Asinh @@ -678,11 +646,8 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"], BuiltinScalarFunction::Substr => &["substr"], BuiltinScalarFunction::Translate => &["translate"], - BuiltinScalarFunction::Uuid => &["uuid"], - BuiltinScalarFunction::Levenshtein => &["levenshtein"], BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"], BuiltinScalarFunction::FindInSet => &["find_in_set"], - BuiltinScalarFunction::OverLay => &["overlay"], } } } diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index e1ab11c5b778..60db21e5f5fe 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -575,7 +575,6 @@ scalar_expr!(Log10, log10, num, "base 10 logarithm of number"); scalar_expr!(Ln, ln, num, "natural logarithm (base e) of number"); scalar_expr!(Power, power, base exponent, "`base` raised to the power of `exponent`"); scalar_expr!(Atan2, atan2, y x, "inverse tangent of a division given in the argument"); -scalar_expr!(Uuid, uuid, , "returns uuid v4 as a string value"); scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`"); // string functions @@ -628,12 +627,6 @@ nary_scalar_expr!( "concatenates several strings, placing a seperator between each one" ); nary_scalar_expr!(Concat, concat_expr, "concatenates several strings"); -nary_scalar_expr!( - OverLay, - overlay, - "replace the substring of string that starts at the start'th character and extends for count characters with new substring" -); - scalar_expr!(Nanvl, nanvl, x y, "returns x if x is not NaN otherwise returns y"); scalar_expr!( Iszero, @@ -642,7 +635,6 @@ scalar_expr!( "returns true if a given number is +0.0 or -0.0 otherwise returns false" ); -scalar_expr!(Levenshtein, levenshtein, string1 string2, "Returns the Levenshtein distance between the two given strings"); scalar_expr!(SubstrIndex, substr_index, string delimiter count, "Returns the substring from str before count occurrences of the delimiter"); scalar_expr!(FindInSet, find_in_set, str strlist, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings"); @@ -1076,25 +1068,7 @@ mod test { test_scalar_expr!(Substr, substr, string, position); test_scalar_expr!(Substr, substring, string, position, count); test_scalar_expr!(Translate, translate, string, from, to); - test_nary_scalar_expr!(OverLay, overlay, string, characters, position, len); - test_nary_scalar_expr!(OverLay, overlay, string, characters, position); - test_scalar_expr!(Levenshtein, levenshtein, string1, string2); test_scalar_expr!(SubstrIndex, substr_index, string, delimiter, count); test_scalar_expr!(FindInSet, find_in_set, string, stringlist); } - - #[test] - fn uuid_function_definitions() { - if let Expr::ScalarFunction(ScalarFunction { - func_def: ScalarFunctionDefinition::BuiltIn(fun), - args, - }) = uuid() - { - let name = BuiltinScalarFunction::Uuid; - assert_eq!(name, fun); - assert_eq!(0, args.len()); - } else { - unreachable!(); - } - } } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 0410d89d123f..81050dfddf66 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -75,6 +75,8 @@ log = { workspace = true } md-5 = { version = "^0.10.0", optional = true } regex = { version = "1.8", optional = true } sha2 = { version = "^0.10.1", optional = true } +uuid = { version = "1.7", features = ["v4"] } + [dev-dependencies] criterion = "0.5" rand = { workspace = true } diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs new file mode 100644 index 000000000000..b5de4b28948f --- /dev/null +++ b/datafusion/functions/src/string/levenshtein.rs @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::utils::datafusion_strsim; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::ColumnarValue; +use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; + +use crate::string::common::{make_scalar_function, utf8_to_int_type}; + +#[derive(Debug)] +pub(super) struct LevenshteinFunc { + signature: Signature, +} + +impl LevenshteinFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for LevenshteinFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "levenshtein" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_int_type(&arg_types[0], "levenshtein") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(levenshtein::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(levenshtein::, vec![])(args), + other => { + exec_err!("Unsupported data type {other:?} for function levenshtein") + } + } + } +} + +///Returns the Levenshtein distance between the two given strings. +/// LEVENSHTEIN('kitten', 'sitting') = 3 +pub fn levenshtein(args: &[ArrayRef]) -> Result { + if args.len() != 2 { + return exec_err!( + "levenshtein function requires two arguments, got {}", + args.len() + ); + } + let str1_array = as_generic_string_array::(&args[0])?; + let str2_array = as_generic_string_array::(&args[1])?; + match args[0].data_type() { + DataType::Utf8 => { + let result = str1_array + .iter() + .zip(str2_array.iter()) + .map(|(string1, string2)| match (string1, string2) { + (Some(string1), Some(string2)) => { + Some(datafusion_strsim::levenshtein(string1, string2) as i32) + } + _ => None, + }) + .collect::(); + Ok(Arc::new(result) as ArrayRef) + } + DataType::LargeUtf8 => { + let result = str1_array + .iter() + .zip(str2_array.iter()) + .map(|(string1, string2)| match (string1, string2) { + (Some(string1), Some(string2)) => { + Some(datafusion_strsim::levenshtein(string1, string2) as i64) + } + _ => None, + }) + .collect::(); + Ok(Arc::new(result) as ArrayRef) + } + other => { + exec_err!( + "levenshtein was called with {other} datatype arguments. It requires Utf8 or LargeUtf8." + ) + } + } +} + +#[cfg(test)] +mod tests { + use arrow::array::{Int32Array, StringArray}; + + use datafusion_common::cast::as_int32_array; + + use super::*; + + #[test] + fn to_levenshtein() -> Result<()> { + let string1_array = + Arc::new(StringArray::from(vec!["123", "abc", "xyz", "kitten"])); + let string2_array = + Arc::new(StringArray::from(vec!["321", "def", "zyx", "sitting"])); + let res = levenshtein::(&[string1_array, string2_array]).unwrap(); + let result = + as_int32_array(&res).expect("failed to initialized function levenshtein"); + let expected = Int32Array::from(vec![2, 3, 2, 3]); + assert_eq!(&expected, result); + + Ok(()) + } +} diff --git a/datafusion/functions/src/string/mod.rs b/datafusion/functions/src/string/mod.rs index a6d844932655..165a7c660404 100644 --- a/datafusion/functions/src/string/mod.rs +++ b/datafusion/functions/src/string/mod.rs @@ -24,24 +24,30 @@ use datafusion_expr::ScalarUDF; mod ascii; mod btrim; mod common; +mod levenshtein; mod lower; mod ltrim; mod octet_length; +mod overlay; mod rtrim; mod starts_with; mod to_hex; mod upper; +mod uuid; // create UDFs make_udf_function!(ascii::AsciiFunc, ASCII, ascii); make_udf_function!(btrim::BTrimFunc, BTRIM, btrim); +make_udf_function!(levenshtein::LevenshteinFunc, LEVENSHTEIN, levenshtein); make_udf_function!(ltrim::LtrimFunc, LTRIM, ltrim); make_udf_function!(lower::LowerFunc, LOWER, lower); make_udf_function!(octet_length::OctetLengthFunc, OCTET_LENGTH, octet_length); +make_udf_function!(overlay::OverlayFunc, OVERLAY, overlay); make_udf_function!(rtrim::RtrimFunc, RTRIM, rtrim); make_udf_function!(starts_with::StartsWithFunc, STARTS_WITH, starts_with); make_udf_function!(to_hex::ToHexFunc, TO_HEX, to_hex); make_udf_function!(upper::UpperFunc, UPPER, upper); +make_udf_function!(uuid::UuidFunc, UUID, uuid); pub mod expr_fn { use datafusion_expr::Expr; @@ -56,6 +62,11 @@ pub mod expr_fn { super::btrim().call(args) } + #[doc = "Returns the Levenshtein distance between the two given strings"] + pub fn levenshtein(arg1: Expr, arg2: Expr) -> Expr { + super::levenshtein().call(vec![arg1, arg2]) + } + #[doc = "Converts a string to lowercase."] pub fn lower(arg1: Expr) -> Expr { super::lower().call(vec![arg1]) @@ -71,6 +82,11 @@ pub mod expr_fn { super::octet_length().call(args) } + #[doc = "replace the substring of string that starts at the start'th character and extends for count characters with new substring"] + pub fn overlay(args: Vec) -> Expr { + super::overlay().call(args) + } + #[doc = "Removes all characters, spaces by default, from the end of a string"] pub fn rtrim(args: Vec) -> Expr { super::rtrim().call(args) @@ -95,6 +111,11 @@ pub mod expr_fn { pub fn upper(arg1: Expr) -> Expr { super::upper().call(vec![arg1]) } + + #[doc = "returns uuid v4 as a string value"] + pub fn uuid() -> Expr { + super::uuid().call(vec![]) + } } /// Return a list of all functions in this package @@ -102,12 +123,15 @@ pub fn functions() -> Vec> { vec![ ascii(), btrim(), + levenshtein(), lower(), ltrim(), octet_length(), + overlay(), rtrim(), starts_with(), to_hex(), upper(), + uuid(), ] } diff --git a/datafusion/functions/src/string/overlay.rs b/datafusion/functions/src/string/overlay.rs new file mode 100644 index 000000000000..d7cc0da8068e --- /dev/null +++ b/datafusion/functions/src/string/overlay.rs @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::datatypes::DataType; + +use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ScalarUDFImpl, Signature}; + +use crate::string::common::*; + +#[derive(Debug)] +pub(super) struct OverlayFunc { + signature: Signature, +} + +impl OverlayFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Utf8, Int64, Int64]), + Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]), + Exact(vec![Utf8, Utf8, Int64]), + Exact(vec![LargeUtf8, LargeUtf8, Int64]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for OverlayFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "overlay" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + utf8_to_str_type(&arg_types[0], "overlay") + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => make_scalar_function(overlay::, vec![])(args), + DataType::LargeUtf8 => make_scalar_function(overlay::, vec![])(args), + other => exec_err!("Unsupported data type {other:?} for function overlay"), + } + } +} + +/// OVERLAY(string1 PLACING string2 FROM integer FOR integer2) +/// Replaces a substring of string1 with string2 starting at the integer bit +/// pgsql overlay('Txxxxas' placing 'hom' from 2 for 4) → Thomas +/// overlay('Txxxxas' placing 'hom' from 2) -> Thomxas, without for option, str2's len is instead +pub fn overlay(args: &[ArrayRef]) -> Result { + match args.len() { + 3 => { + let string_array = as_generic_string_array::(&args[0])?; + let characters_array = as_generic_string_array::(&args[1])?; + let pos_num = as_int64_array(&args[2])?; + + let result = string_array + .iter() + .zip(characters_array.iter()) + .zip(pos_num.iter()) + .map(|((string, characters), start_pos)| { + match (string, characters, start_pos) { + (Some(string), Some(characters), Some(start_pos)) => { + let string_len = string.chars().count(); + let characters_len = characters.chars().count(); + let replace_len = characters_len as i64; + let mut res = + String::with_capacity(string_len.max(characters_len)); + + //as sql replace index start from 1 while string index start from 0 + if start_pos > 1 && start_pos - 1 < string_len as i64 { + let start = (start_pos - 1) as usize; + res.push_str(&string[..start]); + } + res.push_str(characters); + // if start + replace_len - 1 >= string_length, just to string end + if start_pos + replace_len - 1 < string_len as i64 { + let end = (start_pos + replace_len - 1) as usize; + res.push_str(&string[end..]); + } + Ok(Some(res)) + } + _ => Ok(None), + } + }) + .collect::>>()?; + Ok(Arc::new(result) as ArrayRef) + } + 4 => { + let string_array = as_generic_string_array::(&args[0])?; + let characters_array = as_generic_string_array::(&args[1])?; + let pos_num = as_int64_array(&args[2])?; + let len_num = as_int64_array(&args[3])?; + + let result = string_array + .iter() + .zip(characters_array.iter()) + .zip(pos_num.iter()) + .zip(len_num.iter()) + .map(|(((string, characters), start_pos), len)| { + match (string, characters, start_pos, len) { + (Some(string), Some(characters), Some(start_pos), Some(len)) => { + let string_len = string.chars().count(); + let characters_len = characters.chars().count(); + let replace_len = len.min(string_len as i64); + let mut res = + String::with_capacity(string_len.max(characters_len)); + + //as sql replace index start from 1 while string index start from 0 + if start_pos > 1 && start_pos - 1 < string_len as i64 { + let start = (start_pos - 1) as usize; + res.push_str(&string[..start]); + } + res.push_str(characters); + // if start + replace_len - 1 >= string_length, just to string end + if start_pos + replace_len - 1 < string_len as i64 { + let end = (start_pos + replace_len - 1) as usize; + res.push_str(&string[end..]); + } + Ok(Some(res)) + } + _ => Ok(None), + } + }) + .collect::>>()?; + Ok(Arc::new(result) as ArrayRef) + } + other => { + exec_err!("overlay was called with {other} arguments. It requires 3 or 4.") + } + } +} + +#[cfg(test)] +mod tests { + use arrow::array::{Int64Array, StringArray}; + + use super::*; + + #[test] + fn to_overlay() -> Result<()> { + let string = + Arc::new(StringArray::from(vec!["123", "abcdefg", "xyz", "Txxxxas"])); + let replace_string = + Arc::new(StringArray::from(vec!["abc", "qwertyasdfg", "ijk", "hom"])); + let start = Arc::new(Int64Array::from(vec![4, 1, 1, 2])); // start + let end = Arc::new(Int64Array::from(vec![5, 7, 2, 4])); // replace len + + let res = overlay::(&[string, replace_string, start, end]).unwrap(); + let result = as_generic_string_array::(&res).unwrap(); + let expected = StringArray::from(vec!["abc", "qwertyasdfg", "ijkz", "Thomas"]); + assert_eq!(&expected, result); + + Ok(()) + } +} diff --git a/datafusion/functions/src/string/uuid.rs b/datafusion/functions/src/string/uuid.rs new file mode 100644 index 000000000000..791ad6d3c4f3 --- /dev/null +++ b/datafusion/functions/src/string/uuid.rs @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::iter; +use std::sync::Arc; + +use arrow::array::GenericStringArray; +use arrow::datatypes::DataType; +use arrow::datatypes::DataType::Utf8; +use uuid::Uuid; + +use datafusion_common::{exec_err, Result}; +use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ScalarUDFImpl, Signature}; + +#[derive(Debug)] +pub(super) struct UuidFunc { + signature: Signature, +} + +impl UuidFunc { + pub fn new() -> Self { + Self { + signature: Signature::exact(vec![], Volatility::Volatile), + } + } +} + +impl ScalarUDFImpl for UuidFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "uuid" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Utf8) + } + + /// Prints random (v4) uuid values per row + /// uuid() = 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11' + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let len: usize = match &args[0] { + ColumnarValue::Array(array) => array.len(), + _ => return exec_err!("Expect uuid function to take no param"), + }; + + let values = iter::repeat_with(|| Uuid::new_v4().to_string()).take(len); + let array = GenericStringArray::::from_iter_values(values); + Ok(ColumnarValue::Array(Arc::new(array))) + } +} diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index d63ad9bb4a3a..24b831e7c575 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -74,7 +74,6 @@ rand = { workspace = true } regex = { version = "1.8", optional = true } sha2 = { version = "^0.10.1", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } -uuid = { version = "^1.2", features = ["v4"] } [dev-dependencies] criterion = "0.5" diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 2436fa24d4ef..8759adc89b40 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -458,29 +458,6 @@ pub fn create_physical_fun( exec_err!("Unsupported data type {other:?} for function translate") } }), - BuiltinScalarFunction::Uuid => Arc::new(string_expressions::uuid), - BuiltinScalarFunction::OverLay => Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - make_scalar_function_inner(string_expressions::overlay::)(args) - } - DataType::LargeUtf8 => { - make_scalar_function_inner(string_expressions::overlay::)(args) - } - other => exec_err!("Unsupported data type {other:?} for function overlay"), - }), - BuiltinScalarFunction::Levenshtein => { - Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => make_scalar_function_inner( - string_expressions::levenshtein::, - )(args), - DataType::LargeUtf8 => make_scalar_function_inner( - string_expressions::levenshtein::, - )(args), - other => { - exec_err!("Unsupported data type {other:?} for function levenshtein") - } - }) - } BuiltinScalarFunction::SubstrIndex => { Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -1868,11 +1845,7 @@ mod tests { let execution_props = ExecutionProps::new(); let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let funs = [ - BuiltinScalarFunction::Pi, - BuiltinScalarFunction::Random, - BuiltinScalarFunction::Uuid, - ]; + let funs = [BuiltinScalarFunction::Pi, BuiltinScalarFunction::Random]; for fun in funs.iter() { create_physical_expr_with_type_coercion(fun, &[], &schema, &execution_props)?; diff --git a/datafusion/physical-expr/src/string_expressions.rs b/datafusion/physical-expr/src/string_expressions.rs index 13e4ce77e0ac..766e167a9426 100644 --- a/datafusion/physical-expr/src/string_expressions.rs +++ b/datafusion/physical-expr/src/string_expressions.rs @@ -21,7 +21,6 @@ //! String expressions -use std::iter; use std::sync::Arc; use arrow::{ @@ -31,9 +30,7 @@ use arrow::{ }, datatypes::DataType, }; -use uuid::Uuid; -use datafusion_common::utils::datafusion_strsim; use datafusion_common::Result; use datafusion_common::{ cast::{as_generic_string_array, as_int64_array, as_string_array}, @@ -333,192 +330,3 @@ pub fn ends_with(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } - -/// Prints random (v4) uuid values per row -/// uuid() = 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11' -pub fn uuid(args: &[ColumnarValue]) -> Result { - let len: usize = match &args[0] { - ColumnarValue::Array(array) => array.len(), - _ => return exec_err!("Expect uuid function to take no param"), - }; - - let values = iter::repeat_with(|| Uuid::new_v4().to_string()).take(len); - let array = GenericStringArray::::from_iter_values(values); - Ok(ColumnarValue::Array(Arc::new(array))) -} - -/// OVERLAY(string1 PLACING string2 FROM integer FOR integer2) -/// Replaces a substring of string1 with string2 starting at the integer bit -/// pgsql overlay('Txxxxas' placing 'hom' from 2 for 4) → Thomas -/// overlay('Txxxxas' placing 'hom' from 2) -> Thomxas, without for option, str2's len is instead -pub fn overlay(args: &[ArrayRef]) -> Result { - match args.len() { - 3 => { - let string_array = as_generic_string_array::(&args[0])?; - let characters_array = as_generic_string_array::(&args[1])?; - let pos_num = as_int64_array(&args[2])?; - - let result = string_array - .iter() - .zip(characters_array.iter()) - .zip(pos_num.iter()) - .map(|((string, characters), start_pos)| { - match (string, characters, start_pos) { - (Some(string), Some(characters), Some(start_pos)) => { - let string_len = string.chars().count(); - let characters_len = characters.chars().count(); - let replace_len = characters_len as i64; - let mut res = - String::with_capacity(string_len.max(characters_len)); - - //as sql replace index start from 1 while string index start from 0 - if start_pos > 1 && start_pos - 1 < string_len as i64 { - let start = (start_pos - 1) as usize; - res.push_str(&string[..start]); - } - res.push_str(characters); - // if start + replace_len - 1 >= string_length, just to string end - if start_pos + replace_len - 1 < string_len as i64 { - let end = (start_pos + replace_len - 1) as usize; - res.push_str(&string[end..]); - } - Ok(Some(res)) - } - _ => Ok(None), - } - }) - .collect::>>()?; - Ok(Arc::new(result) as ArrayRef) - } - 4 => { - let string_array = as_generic_string_array::(&args[0])?; - let characters_array = as_generic_string_array::(&args[1])?; - let pos_num = as_int64_array(&args[2])?; - let len_num = as_int64_array(&args[3])?; - - let result = string_array - .iter() - .zip(characters_array.iter()) - .zip(pos_num.iter()) - .zip(len_num.iter()) - .map(|(((string, characters), start_pos), len)| { - match (string, characters, start_pos, len) { - (Some(string), Some(characters), Some(start_pos), Some(len)) => { - let string_len = string.chars().count(); - let characters_len = characters.chars().count(); - let replace_len = len.min(string_len as i64); - let mut res = - String::with_capacity(string_len.max(characters_len)); - - //as sql replace index start from 1 while string index start from 0 - if start_pos > 1 && start_pos - 1 < string_len as i64 { - let start = (start_pos - 1) as usize; - res.push_str(&string[..start]); - } - res.push_str(characters); - // if start + replace_len - 1 >= string_length, just to string end - if start_pos + replace_len - 1 < string_len as i64 { - let end = (start_pos + replace_len - 1) as usize; - res.push_str(&string[end..]); - } - Ok(Some(res)) - } - _ => Ok(None), - } - }) - .collect::>>()?; - Ok(Arc::new(result) as ArrayRef) - } - other => { - exec_err!("overlay was called with {other} arguments. It requires 3 or 4.") - } - } -} - -///Returns the Levenshtein distance between the two given strings. -/// LEVENSHTEIN('kitten', 'sitting') = 3 -pub fn levenshtein(args: &[ArrayRef]) -> Result { - if args.len() != 2 { - return exec_err!( - "levenshtein function requires two arguments, got {}", - args.len() - ); - } - let str1_array = as_generic_string_array::(&args[0])?; - let str2_array = as_generic_string_array::(&args[1])?; - match args[0].data_type() { - DataType::Utf8 => { - let result = str1_array - .iter() - .zip(str2_array.iter()) - .map(|(string1, string2)| match (string1, string2) { - (Some(string1), Some(string2)) => { - Some(datafusion_strsim::levenshtein(string1, string2) as i32) - } - _ => None, - }) - .collect::(); - Ok(Arc::new(result) as ArrayRef) - } - DataType::LargeUtf8 => { - let result = str1_array - .iter() - .zip(str2_array.iter()) - .map(|(string1, string2)| match (string1, string2) { - (Some(string1), Some(string2)) => { - Some(datafusion_strsim::levenshtein(string1, string2) as i64) - } - _ => None, - }) - .collect::(); - Ok(Arc::new(result) as ArrayRef) - } - other => { - exec_err!( - "levenshtein was called with {other} datatype arguments. It requires Utf8 or LargeUtf8." - ) - } - } -} - -#[cfg(test)] -mod tests { - use arrow::array::Int32Array; - use arrow_array::Int64Array; - - use datafusion_common::cast::as_int32_array; - - use super::*; - - #[test] - fn to_overlay() -> Result<()> { - let string = - Arc::new(StringArray::from(vec!["123", "abcdefg", "xyz", "Txxxxas"])); - let replace_string = - Arc::new(StringArray::from(vec!["abc", "qwertyasdfg", "ijk", "hom"])); - let start = Arc::new(Int64Array::from(vec![4, 1, 1, 2])); // start - let end = Arc::new(Int64Array::from(vec![5, 7, 2, 4])); // replace len - - let res = overlay::(&[string, replace_string, start, end]).unwrap(); - let result = as_generic_string_array::(&res).unwrap(); - let expected = StringArray::from(vec!["abc", "qwertyasdfg", "ijkz", "Thomas"]); - assert_eq!(&expected, result); - - Ok(()) - } - - #[test] - fn to_levenshtein() -> Result<()> { - let string1_array = - Arc::new(StringArray::from(vec!["123", "abc", "xyz", "kitten"])); - let string2_array = - Arc::new(StringArray::from(vec!["321", "def", "zyx", "sitting"])); - let res = levenshtein::(&[string1_array, string2_array]).unwrap(); - let result = - as_int32_array(&res).expect("failed to initialized function levenshtein"); - let expected = Int32Array::from(vec![2, 3, 2, 3]); - assert_eq!(&expected, result); - - Ok(()) - } -} diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 72ee4fb3ef7e..1ba32bff746e 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -58,7 +58,6 @@ parking_lot = { workspace = true } pin-project-lite = "^0.2.7" rand = { workspace = true } tokio = { workspace = true } -uuid = { version = "^1.2", features = ["v4"] } [dev-dependencies] rstest = { workspace = true } diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index e4953283b184..795995ce2c46 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -613,7 +613,7 @@ enum ScalarFunction { // 69 was ArrowTypeof // 70 was CurrentDate // 71 was CurrentTime - Uuid = 72; + // 72 was Uuid Cbrt = 73; Acosh = 74; Asinh = 75; @@ -660,11 +660,11 @@ enum ScalarFunction { // 118 was ToTimestampNanos // 119 was ArrayIntersect // 120 was ArrayUnion - OverLay = 121; + // 121 was OverLay // 122 is Range // 123 is ArrayExcept // 124 was ArrayPopFront - Levenshtein = 125; + // 125 was Levenshtein SubstrIndex = 126; FindInSet = 127; // 128 was ArraySort diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 7cdebdf85944..3941171e4fe6 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22949,7 +22949,6 @@ impl serde::Serialize for ScalarFunction { Self::Coalesce => "Coalesce", Self::Power => "Power", Self::Atan2 => "Atan2", - Self::Uuid => "Uuid", Self::Cbrt => "Cbrt", Self::Acosh => "Acosh", Self::Asinh => "Asinh", @@ -22965,8 +22964,6 @@ impl serde::Serialize for ScalarFunction { Self::Cot => "Cot", Self::Nanvl => "Nanvl", Self::Iszero => "Iszero", - Self::OverLay => "OverLay", - Self::Levenshtein => "Levenshtein", Self::SubstrIndex => "SubstrIndex", Self::FindInSet => "FindInSet", Self::EndsWith => "EndsWith", @@ -23017,7 +23014,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Coalesce", "Power", "Atan2", - "Uuid", "Cbrt", "Acosh", "Asinh", @@ -23033,8 +23029,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Cot", "Nanvl", "Iszero", - "OverLay", - "Levenshtein", "SubstrIndex", "FindInSet", "EndsWith", @@ -23114,7 +23108,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Coalesce" => Ok(ScalarFunction::Coalesce), "Power" => Ok(ScalarFunction::Power), "Atan2" => Ok(ScalarFunction::Atan2), - "Uuid" => Ok(ScalarFunction::Uuid), "Cbrt" => Ok(ScalarFunction::Cbrt), "Acosh" => Ok(ScalarFunction::Acosh), "Asinh" => Ok(ScalarFunction::Asinh), @@ -23130,8 +23123,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Cot" => Ok(ScalarFunction::Cot), "Nanvl" => Ok(ScalarFunction::Nanvl), "Iszero" => Ok(ScalarFunction::Iszero), - "OverLay" => Ok(ScalarFunction::OverLay), - "Levenshtein" => Ok(ScalarFunction::Levenshtein), "SubstrIndex" => Ok(ScalarFunction::SubstrIndex), "FindInSet" => Ok(ScalarFunction::FindInSet), "EndsWith" => Ok(ScalarFunction::EndsWith), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 2932bcf6d93f..58fda7fcb5ad 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2912,7 +2912,7 @@ pub enum ScalarFunction { /// 69 was ArrowTypeof /// 70 was CurrentDate /// 71 was CurrentTime - Uuid = 72, + /// 72 was Uuid Cbrt = 73, Acosh = 74, Asinh = 75, @@ -2959,11 +2959,11 @@ pub enum ScalarFunction { /// 118 was ToTimestampNanos /// 119 was ArrayIntersect /// 120 was ArrayUnion - OverLay = 121, + /// 121 was OverLay /// 122 is Range /// 123 is ArrayExcept /// 124 was ArrayPopFront - Levenshtein = 125, + /// 125 was Levenshtein SubstrIndex = 126, FindInSet = 127, /// 128 was ArraySort @@ -3022,7 +3022,6 @@ impl ScalarFunction { ScalarFunction::Coalesce => "Coalesce", ScalarFunction::Power => "Power", ScalarFunction::Atan2 => "Atan2", - ScalarFunction::Uuid => "Uuid", ScalarFunction::Cbrt => "Cbrt", ScalarFunction::Acosh => "Acosh", ScalarFunction::Asinh => "Asinh", @@ -3038,8 +3037,6 @@ impl ScalarFunction { ScalarFunction::Cot => "Cot", ScalarFunction::Nanvl => "Nanvl", ScalarFunction::Iszero => "Iszero", - ScalarFunction::OverLay => "OverLay", - ScalarFunction::Levenshtein => "Levenshtein", ScalarFunction::SubstrIndex => "SubstrIndex", ScalarFunction::FindInSet => "FindInSet", ScalarFunction::EndsWith => "EndsWith", @@ -3084,7 +3081,6 @@ impl ScalarFunction { "Coalesce" => Some(Self::Coalesce), "Power" => Some(Self::Power), "Atan2" => Some(Self::Atan2), - "Uuid" => Some(Self::Uuid), "Cbrt" => Some(Self::Cbrt), "Acosh" => Some(Self::Acosh), "Asinh" => Some(Self::Asinh), @@ -3100,8 +3096,6 @@ impl ScalarFunction { "Cot" => Some(Self::Cot), "Nanvl" => Some(Self::Nanvl), "Iszero" => Some(Self::Iszero), - "OverLay" => Some(Self::OverLay), - "Levenshtein" => Some(Self::Levenshtein), "SubstrIndex" => Some(Self::SubstrIndex), "FindInSet" => Some(Self::FindInSet), "EndsWith" => Some(Self::EndsWith), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index d00aeeda462b..3b44c1cb276d 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -51,14 +51,13 @@ use datafusion_expr::{ acosh, asinh, atan, atan2, atanh, bit_length, cbrt, ceil, character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, degrees, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, - factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, levenshtein, ln, log, - log10, log2, + factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, - lpad, nanvl, overlay, pi, power, radians, random, repeat, replace, reverse, right, - round, rpad, signum, sin, sinh, split_part, sqrt, strpos, substr, substr_index, - substring, translate, trunc, uuid, AggregateFunction, Between, BinaryExpr, - BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, - GetIndexedField, GroupingSet, + lpad, nanvl, pi, power, radians, random, repeat, replace, reverse, right, round, + rpad, signum, sin, sinh, split_part, sqrt, strpos, substr, substr_index, substring, + translate, trunc, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, + BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField, + GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -477,7 +476,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::SplitPart => Self::SplitPart, ScalarFunction::Strpos => Self::Strpos, ScalarFunction::Substr => Self::Substr, - ScalarFunction::Uuid => Self::Uuid, ScalarFunction::Translate => Self::Translate, ScalarFunction::Coalesce => Self::Coalesce, ScalarFunction::Pi => Self::Pi, @@ -485,8 +483,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Atan2 => Self::Atan2, ScalarFunction::Nanvl => Self::Nanvl, ScalarFunction::Iszero => Self::Iszero, - ScalarFunction::OverLay => Self::OverLay, - ScalarFunction::Levenshtein => Self::Levenshtein, ScalarFunction::SubstrIndex => Self::SubstrIndex, ScalarFunction::FindInSet => Self::FindInSet, } @@ -1449,7 +1445,6 @@ pub fn parse_expr( parse_expr(&args[1], registry, codec)?, )), ScalarFunction::Random => Ok(random()), - ScalarFunction::Uuid => Ok(uuid()), ScalarFunction::Repeat => Ok(repeat( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, @@ -1518,10 +1513,6 @@ pub fn parse_expr( )) } } - ScalarFunction::Levenshtein => Ok(levenshtein( - parse_expr(&args[0], registry, codec)?, - parse_expr(&args[1], registry, codec)?, - )), ScalarFunction::Translate => Ok(translate( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, @@ -1554,12 +1545,6 @@ pub fn parse_expr( ScalarFunction::Iszero => { Ok(iszero(parse_expr(&args[0], registry, codec)?)) } - ScalarFunction::OverLay => Ok(overlay( - args.to_owned() - .iter() - .map(|expr| parse_expr(expr, registry, codec)) - .collect::, _>>()?, - )), ScalarFunction::SubstrIndex => Ok(substr_index( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index edb8c4e4eb01..446a91a39a1b 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1490,7 +1490,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Left => Self::Left, BuiltinScalarFunction::Lpad => Self::Lpad, BuiltinScalarFunction::Random => Self::Random, - BuiltinScalarFunction::Uuid => Self::Uuid, BuiltinScalarFunction::Repeat => Self::Repeat, BuiltinScalarFunction::Replace => Self::Replace, BuiltinScalarFunction::Reverse => Self::Reverse, @@ -1506,8 +1505,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Atan2 => Self::Atan2, BuiltinScalarFunction::Nanvl => Self::Nanvl, BuiltinScalarFunction::Iszero => Self::Iszero, - BuiltinScalarFunction::OverLay => Self::OverLay, - BuiltinScalarFunction::Levenshtein => Self::Levenshtein, BuiltinScalarFunction::SubstrIndex => Self::SubstrIndex, BuiltinScalarFunction::FindInSet => Self::FindInSet, }; diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 04f8001bfc1b..d1fc03194997 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -795,7 +795,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { - let fun = BuiltinScalarFunction::OverLay; + let fun = self + .context_provider + .get_function_meta("overlay") + .ok_or_else(|| { + internal_datafusion_err!("Unable to find expected 'overlay' function") + })?; let arg = self.sql_expr_to_logical_expr(expr, schema, planner_context)?; let what_arg = self.sql_expr_to_logical_expr(overlay_what, schema, planner_context)?; @@ -809,7 +814,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } None => vec![arg, what_arg, from_arg], }; - Ok(Expr::ScalarFunction(ScalarFunction::new(fun, args))) + Ok(Expr::ScalarFunction(ScalarFunction::new_udf(fun, args))) } fn sql_position_to_expr( &self,