From 205777bc2d0d1b9ef7698b2f5f6d21dfdcdb0c00 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 14 Mar 2024 14:54:38 -0400 Subject: [PATCH 1/2] Move make_date, to_char to datafusion-functions (#9601) * Move make_date, to_char to datafusion-functions * Update post merge to remove datetime_expressions.rs * Fix benchmarks. * Cargo fmt. --- datafusion-cli/Cargo.lock | 1 + datafusion/expr/src/built_in_function.rs | 56 +- datafusion/expr/src/expr_fn.rs | 8 - datafusion/functions/Cargo.toml | 9 + .../benches/make_date.rs | 14 +- .../benches/to_char.rs | 11 +- .../functions/src/datetime/make_date.rs | 305 +++++++++++ datafusion/functions/src/datetime/mod.rs | 90 ++++ .../src/datetime/to_char.rs} | 506 +++++------------- datafusion/physical-expr/Cargo.toml | 8 - datafusion/physical-expr/src/functions.rs | 6 +- datafusion/physical-expr/src/lib.rs | 1 - datafusion/proto/proto/datafusion.proto | 4 +- datafusion/proto/src/generated/pbjson.rs | 6 - datafusion/proto/src/generated/prost.rs | 12 +- .../proto/src/logical_plan/from_proto.rs | 22 - datafusion/proto/src/logical_plan/to_proto.rs | 2 - docs/source/contributor-guide/index.md | 2 +- 18 files changed, 572 insertions(+), 491 deletions(-) rename datafusion/{physical-expr => functions}/benches/make_date.rs (89%) rename datafusion/{physical-expr => functions}/benches/to_char.rs (93%) create mode 100644 datafusion/functions/src/datetime/make_date.rs rename datafusion/{physical-expr/src/datetime_expressions.rs => functions/src/datetime/to_char.rs} (55%) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 1c2514811c7d..7c6286abb83e 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1248,6 +1248,7 @@ version = "36.0.0" dependencies = [ "arrow", "arrow-array", + "arrow-schema", "base64 0.22.0", "blake2", "blake3", diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index cf39a244c8db..ae2b46378c26 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -22,11 +22,10 @@ use std::fmt; use std::str::FromStr; use std::sync::{Arc, OnceLock}; -use crate::signature::TIMEZONE_WILDCARD; use crate::type_coercion::functions::data_types; use crate::{FuncMonotonicity, Signature, TypeSignature, Volatility}; -use arrow::datatypes::{DataType, Field, TimeUnit}; +use arrow::datatypes::{DataType, Field}; use datafusion_common::{plan_err, DataFusionError, Result}; use strum::IntoEnumIterator; @@ -190,8 +189,6 @@ pub enum BuiltinScalarFunction { Substr, /// to_hex ToHex, - /// make_date - MakeDate, /// translate Translate, /// trim @@ -208,8 +205,6 @@ pub enum BuiltinScalarFunction { SubstrIndex, /// find_in_set FindInSet, - /// to_char - ToChar, } /// Maps the sql function name to `BuiltinScalarFunction` @@ -335,8 +330,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Strpos => Volatility::Immutable, BuiltinScalarFunction::Substr => Volatility::Immutable, BuiltinScalarFunction::ToHex => Volatility::Immutable, - BuiltinScalarFunction::ToChar => Volatility::Immutable, - BuiltinScalarFunction::MakeDate => Volatility::Immutable, BuiltinScalarFunction::Translate => Volatility::Immutable, BuiltinScalarFunction::Trim => Volatility::Immutable, BuiltinScalarFunction::Upper => Volatility::Immutable, @@ -490,8 +483,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::FindInSet => { utf8_to_int_type(&input_expr_types[0], "find_in_set") } - BuiltinScalarFunction::ToChar => Ok(Utf8), - BuiltinScalarFunction::MakeDate => Ok(Date32), BuiltinScalarFunction::Translate => { utf8_to_str_type(&input_expr_types[0], "translate") } @@ -567,7 +558,6 @@ impl BuiltinScalarFunction { /// Return the argument [`Signature`] supported by this function pub fn signature(&self) -> Signature { use DataType::*; - use TimeUnit::*; use TypeSignature::*; // note: the physical expression must accept the type returned by this function or the execution panics. @@ -651,41 +641,6 @@ impl BuiltinScalarFunction { vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], self.volatility(), ), - BuiltinScalarFunction::ToChar => Signature::one_of( - vec![ - Exact(vec![Date32, Utf8]), - Exact(vec![Date64, Utf8]), - Exact(vec![Time32(Millisecond), Utf8]), - Exact(vec![Time32(Second), Utf8]), - Exact(vec![Time64(Microsecond), Utf8]), - Exact(vec![Time64(Nanosecond), Utf8]), - Exact(vec![Timestamp(Second, None), Utf8]), - Exact(vec![ - Timestamp(Second, Some(TIMEZONE_WILDCARD.into())), - Utf8, - ]), - Exact(vec![Timestamp(Millisecond, None), Utf8]), - Exact(vec![ - Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())), - Utf8, - ]), - Exact(vec![Timestamp(Microsecond, None), Utf8]), - Exact(vec![ - Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())), - Utf8, - ]), - Exact(vec![Timestamp(Nanosecond, None), Utf8]), - Exact(vec![ - Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), - Utf8, - ]), - Exact(vec![Duration(Second), Utf8]), - Exact(vec![Duration(Millisecond), Utf8]), - Exact(vec![Duration(Microsecond), Utf8]), - Exact(vec![Duration(Nanosecond), Utf8]), - ], - self.volatility(), - ), BuiltinScalarFunction::SplitPart => Signature::one_of( vec![ Exact(vec![Utf8, Utf8, Int64]), @@ -821,11 +776,6 @@ impl BuiltinScalarFunction { // will be as good as the number of digits in the number Signature::uniform(1, vec![Float64, Float32], self.volatility()) } - BuiltinScalarFunction::MakeDate => Signature::uniform( - 3, - vec![Int32, Int64, UInt32, UInt64, Utf8], - self.volatility(), - ), BuiltinScalarFunction::Iszero => Signature::one_of( vec![Exact(vec![Float32]), Exact(vec![Float64])], self.volatility(), @@ -943,10 +893,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"], BuiltinScalarFunction::FindInSet => &["find_in_set"], - // time/date functions - BuiltinScalarFunction::MakeDate => &["make_date"], - BuiltinScalarFunction::ToChar => &["to_char", "date_format"], - // hashing functions BuiltinScalarFunction::ArrayElement => &[ "array_element", diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index f35b663edf24..9e09ddb0670d 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -768,14 +768,6 @@ nary_scalar_expr!( "replace the substring of string that starts at the start'th character and extends for count characters with new substring" ); -// date functions -scalar_expr!( - ToChar, - to_char, - datetime format, - "converts a date, time, timestamp or duration to a string based on the provided format" -); -scalar_expr!(MakeDate, make_date, year month day, "make a date from year, month and day component parts"); scalar_expr!(Nanvl, nanvl, x y, "returns x if x is not NaN otherwise returns y"); scalar_expr!( Iszero, diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 92c80208e35f..d40ad60d76b6 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -58,6 +58,7 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } arrow-array = { workspace = true } +arrow-schema = { workspace = true } base64 = { version = "0.22", optional = true } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } @@ -85,3 +86,11 @@ name = "to_timestamp" [[bench]] harness = false name = "regx" + +[[bench]] +harness = false +name = "make_date" + +[[bench]] +harness = false +name = "to_char" diff --git a/datafusion/physical-expr/benches/make_date.rs b/datafusion/functions/benches/make_date.rs similarity index 89% rename from datafusion/physical-expr/benches/make_date.rs rename to datafusion/functions/benches/make_date.rs index 819d9539f2ce..7c75277b913e 100644 --- a/datafusion/physical-expr/benches/make_date.rs +++ b/datafusion/functions/benches/make_date.rs @@ -26,7 +26,7 @@ use rand::Rng; use datafusion_common::ScalarValue; use datafusion_expr::ColumnarValue; -use datafusion_physical_expr::datetime_expressions::make_date; +use datafusion_functions::datetime::make_date; fn years(rng: &mut ThreadRng) -> Int32Array { let mut years = vec![]; @@ -63,7 +63,8 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - make_date(&[years.clone(), months.clone(), days.clone()]) + make_date() + .invoke(&[years.clone(), months.clone(), days.clone()]) .expect("make_date should work on valid values"), ) }) @@ -77,7 +78,8 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - make_date(&[year.clone(), months.clone(), days.clone()]) + make_date() + .invoke(&[year.clone(), months.clone(), days.clone()]) .expect("make_date should work on valid values"), ) }) @@ -91,7 +93,8 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - make_date(&[year.clone(), month.clone(), days.clone()]) + make_date() + .invoke(&[year.clone(), month.clone(), days.clone()]) .expect("make_date should work on valid values"), ) }) @@ -104,7 +107,8 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - make_date(&[year.clone(), month.clone(), day.clone()]) + make_date() + .invoke(&[year.clone(), month.clone(), day.clone()]) .expect("make_date should work on valid values"), ) }) diff --git a/datafusion/physical-expr/benches/to_char.rs b/datafusion/functions/benches/to_char.rs similarity index 93% rename from datafusion/physical-expr/benches/to_char.rs rename to datafusion/functions/benches/to_char.rs index 3bcea09acf03..cb0374c95bb3 100644 --- a/datafusion/physical-expr/benches/to_char.rs +++ b/datafusion/functions/benches/to_char.rs @@ -30,7 +30,7 @@ use rand::Rng; use datafusion_common::ScalarValue; use datafusion_common::ScalarValue::TimestampNanosecond; use datafusion_expr::ColumnarValue; -use datafusion_physical_expr::datetime_expressions::to_char; +use datafusion_functions::datetime::to_char; fn random_date_in_range( rng: &mut ThreadRng, @@ -87,7 +87,8 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - to_char(&[data.clone(), patterns.clone()]) + to_char() + .invoke(&[data.clone(), patterns.clone()]) .expect("to_char should work on valid values"), ) }) @@ -101,7 +102,8 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - to_char(&[data.clone(), patterns.clone()]) + to_char() + .invoke(&[data.clone(), patterns.clone()]) .expect("to_char should work on valid values"), ) }) @@ -123,7 +125,8 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - to_char(&[data.clone(), pattern.clone()]) + to_char() + .invoke(&[data.clone(), pattern.clone()]) .expect("to_char should work on valid values"), ) }) diff --git a/datafusion/functions/src/datetime/make_date.rs b/datafusion/functions/src/datetime/make_date.rs new file mode 100644 index 000000000000..b8010f64d270 --- /dev/null +++ b/datafusion/functions/src/datetime/make_date.rs @@ -0,0 +1,305 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::DataType; +use arrow::datatypes::DataType::{Date32, Int32, Int64, UInt32, UInt64, Utf8}; +use arrow_array::builder::PrimitiveBuilder; +use arrow_array::cast::AsArray; +use arrow_array::types::{Date32Type, Int32Type}; +use arrow_array::PrimitiveArray; +use chrono::prelude::*; + +use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug)] +pub(super) struct MakeDateFunc { + signature: Signature, +} + +impl MakeDateFunc { + pub fn new() -> Self { + Self { + signature: Signature::uniform( + 3, + vec![Int32, Int64, UInt32, UInt64, Utf8], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for MakeDateFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "make_date" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Date32) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.len() != 3 { + return exec_err!( + "make_date function requires 3 arguments, got {}", + args.len() + ); + } + + // first, identify if any of the arguments is an Array. If yes, store its `len`, + // as any scalar will need to be converted to an array of len `len`. + let len = args + .iter() + .fold(Option::::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + + let is_scalar = len.is_none(); + let array_size = if is_scalar { 1 } else { len.unwrap() }; + + let years = args[0].cast_to(&DataType::Int32, None)?; + let months = args[1].cast_to(&DataType::Int32, None)?; + let days = args[2].cast_to(&DataType::Int32, None)?; + + // since the epoch for the date32 datatype is the unix epoch + // we need to subtract the unix epoch from the current date + // note this can result in a negative value + let unix_days_from_ce = NaiveDate::from_ymd_opt(1970, 1, 1) + .unwrap() + .num_days_from_ce(); + + let mut builder: PrimitiveBuilder = + PrimitiveArray::builder(array_size); + + let construct_date_fn = |builder: &mut PrimitiveBuilder, + year: i32, + month: i32, + day: i32, + unix_days_from_ce: i32| + -> Result<()> { + let Ok(m) = u32::try_from(month) else { + return exec_err!("Month value '{month:?}' is out of range"); + }; + let Ok(d) = u32::try_from(day) else { + return exec_err!("Day value '{day:?}' is out of range"); + }; + + let date = NaiveDate::from_ymd_opt(year, m, d); + + match date { + Some(d) => builder.append_value(d.num_days_from_ce() - unix_days_from_ce), + None => { + return exec_err!("Unable to parse date from {year}, {month}, {day}") + } + }; + Ok(()) + }; + + let scalar_value_fn = |col: &ColumnarValue| -> Result { + let ColumnarValue::Scalar(s) = col else { + return exec_err!("Expected scalar value"); + }; + let ScalarValue::Int32(Some(i)) = s else { + return exec_err!("Unable to parse date from null/empty value"); + }; + Ok(*i) + }; + + // For scalar only columns the operation is faster without using the PrimitiveArray + if is_scalar { + construct_date_fn( + &mut builder, + scalar_value_fn(&years)?, + scalar_value_fn(&months)?, + scalar_value_fn(&days)?, + unix_days_from_ce, + )?; + } else { + let to_primitive_array = |col: &ColumnarValue, + scalar_count: usize| + -> Result> { + match col { + ColumnarValue::Array(a) => { + Ok(a.as_primitive::().to_owned()) + } + _ => { + let v = scalar_value_fn(col).unwrap(); + Ok(PrimitiveArray::::from_value(v, scalar_count)) + } + } + }; + + let years = to_primitive_array(&years, array_size).unwrap(); + let months = to_primitive_array(&months, array_size).unwrap(); + let days = to_primitive_array(&days, array_size).unwrap(); + for i in 0..array_size { + construct_date_fn( + &mut builder, + years.value(i), + months.value(i), + days.value(i), + unix_days_from_ce, + )?; + } + } + + let arr = builder.finish(); + + if is_scalar { + // If all inputs are scalar, keeps output as scalar + Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some( + arr.value(0), + )))) + } else { + Ok(ColumnarValue::Array(Arc::new(arr))) + } + } +} + +#[cfg(test)] +mod tests { + use crate::datetime::make_date::MakeDateFunc; + use arrow_array::{Array, Date32Array, Int32Array, Int64Array, UInt32Array}; + use datafusion_common::ScalarValue; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + use std::sync::Arc; + + #[test] + fn test_make_date() { + let res = MakeDateFunc::new() + .invoke(&[ + ColumnarValue::Scalar(ScalarValue::Int32(Some(2024))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))), + ]) + .expect("that make_date parsed values without error"); + + if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res { + assert_eq!(19736, date.unwrap()); + } else { + panic!("Expected a scalar value") + } + + let res = MakeDateFunc::new() + .invoke(&[ + ColumnarValue::Scalar(ScalarValue::Int64(Some(2024))), + ColumnarValue::Scalar(ScalarValue::UInt64(Some(1))), + ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))), + ]) + .expect("that make_date parsed values without error"); + + if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res { + assert_eq!(19736, date.unwrap()); + } else { + panic!("Expected a scalar value") + } + + let res = MakeDateFunc::new() + .invoke(&[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some("2024".to_string()))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("1".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("14".to_string()))), + ]) + .expect("that make_date parsed values without error"); + + if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res { + assert_eq!(19736, date.unwrap()); + } else { + panic!("Expected a scalar value") + } + + let years = Arc::new((2021..2025).map(Some).collect::()); + let months = Arc::new((1..5).map(Some).collect::()); + let days = Arc::new((11..15).map(Some).collect::()); + let res = MakeDateFunc::new() + .invoke(&[ + ColumnarValue::Array(years), + ColumnarValue::Array(months), + ColumnarValue::Array(days), + ]) + .expect("that make_date parsed values without error"); + + if let ColumnarValue::Array(array) = res { + assert_eq!(array.len(), 4); + let mut builder = Date32Array::builder(4); + builder.append_value(18_638); + builder.append_value(19_035); + builder.append_value(19_429); + builder.append_value(19_827); + assert_eq!(&builder.finish() as &dyn Array, array.as_ref()); + } else { + panic!("Expected a columnar array") + } + + // + // Fallible test cases + // + + // invalid number of arguments + let res = MakeDateFunc::new() + .invoke(&[ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "Execution error: make_date function requires 3 arguments, got 1" + ); + + // invalid type + let res = MakeDateFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::IntervalYearMonth(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "Arrow error: Cast error: Casting from Interval(YearMonth) to Int32 not supported" + ); + + // overflow of month + let res = MakeDateFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))), + ColumnarValue::Scalar(ScalarValue::UInt64(Some(u64::MAX))), + ColumnarValue::Scalar(ScalarValue::Int32(Some(22))), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "Arrow error: Cast error: Can't cast value 18446744073709551615 to type Int32" + ); + + // overflow of day + let res = MakeDateFunc::new().invoke(&[ + ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))), + ColumnarValue::Scalar(ScalarValue::Int32(Some(22))), + ColumnarValue::Scalar(ScalarValue::UInt32(Some(u32::MAX))), + ]); + assert_eq!( + res.err().unwrap().strip_backtrace(), + "Arrow error: Cast error: Can't cast value 4294967295 to type Int32" + ); + } +} diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index 4702820782c6..a2dfc93b05a3 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -28,7 +28,9 @@ mod date_bin; mod date_part; mod date_trunc; mod from_unixtime; +mod make_date; mod now; +mod to_char; mod to_date; mod to_timestamp; mod to_unixtime; @@ -39,12 +41,14 @@ make_udf_function!(current_time::CurrentTimeFunc, CURRENT_TIME, current_time); make_udf_function!(date_bin::DateBinFunc, DATE_BIN, date_bin); make_udf_function!(date_part::DatePartFunc, DATE_PART, date_part); make_udf_function!(date_trunc::DateTruncFunc, DATE_TRUNC, date_trunc); +make_udf_function!(make_date::MakeDateFunc, MAKE_DATE, make_date); make_udf_function!( from_unixtime::FromUnixtimeFunc, FROM_UNIXTIME, from_unixtime ); make_udf_function!(now::NowFunc, NOW, now); +make_udf_function!(to_char::ToCharFunc, TO_CHAR, to_char); make_udf_function!(to_date::ToDateFunc, TO_DATE, to_date); make_udf_function!(to_unixtime::ToUnixtimeFunc, TO_UNIXTIME, to_unixtime); make_udf_function!(to_timestamp::ToTimestampFunc, TO_TIMESTAMP, to_timestamp); @@ -105,11 +109,95 @@ pub mod expr_fn { super::from_unixtime().call(vec![unixtime]) } + #[doc = "make a date from year, month and day component parts"] + pub fn make_date(year: Expr, month: Expr, day: Expr) -> Expr { + super::make_date().call(vec![year, month, day]) + } + #[doc = "returns the current timestamp in nanoseconds, using the same value for all instances of now() in same statement"] pub fn now() -> Expr { super::now().call(vec![]) } + /// Returns a string representation of a date, time, timestamp or duration based + /// on a Chrono pattern. + /// + /// The syntax for the patterns can be found at + /// + /// + /// # Examples + /// + /// ```ignore + /// # use chrono::prelude::*; + /// # use datafusion::prelude::*; + /// # use datafusion::error::Result; + /// # use datafusion_common::ScalarValue::TimestampNanosecond; + /// # use std::sync::Arc; + /// # use arrow_array::{Date32Array, RecordBatch, StringArray}; + /// # use arrow_schema::{DataType, Field, Schema}; + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// let schema = Arc::new(Schema::new(vec![ + /// Field::new("values", DataType::Date32, false), + /// Field::new("patterns", DataType::Utf8, false), + /// ])); + /// + /// let batch = RecordBatch::try_new( + /// schema, + /// vec![ + /// Arc::new(Date32Array::from(vec![ + /// 18506, + /// 18507, + /// 18508, + /// 18509, + /// ])), + /// Arc::new(StringArray::from(vec![ + /// "%Y-%m-%d", + /// "%Y:%m:%d", + /// "%Y%m%d", + /// "%d-%m-%Y", + /// ])), + /// ], + /// )?; + /// + /// let ctx = SessionContext::new(); + /// ctx.register_batch("t", batch)?; + /// let df = ctx.table("t").await?; + /// + /// // use the to_char function to convert col 'values', + /// // to strings using patterns in col 'patterns' + /// let df = df.with_column( + /// "date_str", + /// to_char(col("values"), col("patterns")) + /// )?; + /// // Note that providing a scalar value for the pattern + /// // is more performant + /// let df = df.with_column( + /// "date_str2", + /// to_char(col("values"), lit("%d-%m-%Y")) + /// )?; + /// // literals can be used as well with dataframe calls + /// let timestamp = "2026-07-08T09:10:11" + /// .parse::() + /// .unwrap() + /// .with_nanosecond(56789) + /// .unwrap() + /// .timestamp_nanos_opt() + /// .unwrap(); + /// let df = df.with_column( + /// "timestamp_str", + /// to_char(lit(TimestampNanosecond(Some(timestamp), None)), lit("%d-%m-%Y %H:%M:%S")) + /// )?; + /// + /// df.show().await?; + /// + /// # Ok(()) + /// # } + /// ``` + pub fn to_char(datetime: Expr, format: Expr) -> Expr { + super::to_char().call(vec![datetime, format]) + } + /// ```ignore /// # use std::sync::Arc; /// @@ -200,7 +288,9 @@ pub fn functions() -> Vec> { date_part(), date_trunc(), from_unixtime(), + make_date(), now(), + to_char(), to_date(), to_unixtime(), to_timestamp(), diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/functions/src/datetime/to_char.rs similarity index 55% rename from datafusion/physical-expr/src/datetime_expressions.rs rename to datafusion/functions/src/datetime/to_char.rs index e0e86e7bd44b..90b3a1d35353 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -15,124 +15,120 @@ // specific language governing permissions and limitations // under the License. -//! DateTime expressions - +use std::any::Any; use std::sync::Arc; +use arrow::datatypes::DataType; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; -use arrow::{ - array::{Array, ArrayRef, PrimitiveArray}, - datatypes::DataType, -}; -use arrow_array::builder::PrimitiveBuilder; use arrow_array::cast::AsArray; -use arrow_array::types::{Date32Type, Int32Type}; -use arrow_array::StringArray; -use chrono::prelude::*; -use chrono::NaiveDate; +use arrow_array::{Array, ArrayRef, StringArray}; +use arrow_schema::DataType::{Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8}; +use arrow_schema::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::ColumnarValue; - -/// Returns a string representation of a date, time, timestamp or duration based -/// on a Chrono pattern. -/// -/// The syntax for the patterns can be found at -/// -/// -/// # Examples -/// -/// ```ignore -/// # use chrono::prelude::*; -/// # use datafusion::prelude::*; -/// # use datafusion::error::Result; -/// # use datafusion_common::ScalarValue::TimestampNanosecond; -/// # use std::sync::Arc; -/// # use arrow_array::{Date32Array, RecordBatch, StringArray}; -/// # use arrow_schema::{DataType, Field, Schema}; -/// # #[tokio::main] -/// # async fn main() -> Result<()> { -/// let schema = Arc::new(Schema::new(vec![ -/// Field::new("values", DataType::Date32, false), -/// Field::new("patterns", DataType::Utf8, false), -/// ])); -/// -/// let batch = RecordBatch::try_new( -/// schema, -/// vec![ -/// Arc::new(Date32Array::from(vec![ -/// 18506, -/// 18507, -/// 18508, -/// 18509, -/// ])), -/// Arc::new(StringArray::from(vec![ -/// "%Y-%m-%d", -/// "%Y:%m:%d", -/// "%Y%m%d", -/// "%d-%m-%Y", -/// ])), -/// ], -/// )?; -/// -/// let ctx = SessionContext::new(); -/// ctx.register_batch("t", batch)?; -/// let df = ctx.table("t").await?; -/// -/// // use the to_char function to convert col 'values', -/// // to strings using patterns in col 'patterns' -/// let df = df.with_column( -/// "date_str", -/// to_char(col("values"), col("patterns")) -/// )?; -/// // Note that providing a scalar value for the pattern -/// // is more performant -/// let df = df.with_column( -/// "date_str2", -/// to_char(col("values"), lit("%d-%m-%Y")) -/// )?; -/// // literals can be used as well with dataframe calls -/// let timestamp = "2026-07-08T09:10:11" -/// .parse::() -/// .unwrap() -/// .with_nanosecond(56789) -/// .unwrap() -/// .timestamp_nanos_opt() -/// .unwrap(); -/// let df = df.with_column( -/// "timestamp_str", -/// to_char(lit(TimestampNanosecond(Some(timestamp), None)), lit("%d-%m-%Y %H:%M:%S")) -/// )?; -/// -/// df.show().await?; -/// -/// # Ok(()) -/// # } -/// ``` -pub fn to_char(args: &[ColumnarValue]) -> Result { - if args.len() != 2 { - return exec_err!("to_char function requires 2 arguments, got {}", args.len()); - } +use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::{ + ColumnarValue, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD, +}; + +#[derive(Debug)] +pub(super) struct ToCharFunc { + signature: Signature, + aliases: Vec, +} - match &args[1] { - // null format, use default formats - ColumnarValue::Scalar(ScalarValue::Utf8(None)) - | ColumnarValue::Scalar(ScalarValue::Null) => { - _to_char_scalar(args[0].clone(), None) +impl ToCharFunc { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Date32, Utf8]), + Exact(vec![Date64, Utf8]), + Exact(vec![Time32(Millisecond), Utf8]), + Exact(vec![Time32(Second), Utf8]), + Exact(vec![Time64(Microsecond), Utf8]), + Exact(vec![Time64(Nanosecond), Utf8]), + Exact(vec![Timestamp(Second, None), Utf8]), + Exact(vec![ + Timestamp(Second, Some(TIMEZONE_WILDCARD.into())), + Utf8, + ]), + Exact(vec![Timestamp(Millisecond, None), Utf8]), + Exact(vec![ + Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())), + Utf8, + ]), + Exact(vec![Timestamp(Microsecond, None), Utf8]), + Exact(vec![ + Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())), + Utf8, + ]), + Exact(vec![Timestamp(Nanosecond, None), Utf8]), + Exact(vec![ + Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), + Utf8, + ]), + Exact(vec![Duration(Second), Utf8]), + Exact(vec![Duration(Millisecond), Utf8]), + Exact(vec![Duration(Microsecond), Utf8]), + Exact(vec![Duration(Nanosecond), Utf8]), + ], + Volatility::Immutable, + ), + aliases: vec![String::from("date_format")], } - // constant format - ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => { - // invoke to_char_scalar with the known string, without converting to array - _to_char_scalar(args[0].clone(), Some(format)) + } +} + +impl ScalarUDFImpl for ToCharFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "to_char" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(Utf8) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.len() != 2 { + return exec_err!( + "to_char function requires 2 arguments, got {}", + args.len() + ); } - ColumnarValue::Array(_) => _to_char_array(args), - _ => { - exec_err!( - "Format for `to_char` must be non-null Utf8, received {:?}", - args[1].data_type() - ) + + match &args[1] { + // null format, use default formats + ColumnarValue::Scalar(ScalarValue::Utf8(None)) + | ColumnarValue::Scalar(ScalarValue::Null) => { + _to_char_scalar(args[0].clone(), None) + } + // constant format + ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => { + // invoke to_char_scalar with the known string, without converting to array + _to_char_scalar(args[0].clone(), Some(format)) + } + ColumnarValue::Array(_) => _to_char_array(args), + _ => { + exec_err!( + "Format for `to_char` must be non-null Utf8, received {:?}", + args[1].data_type() + ) + } } } + + fn aliases(&self) -> &[String] { + &self.aliases + } } fn _build_format_options<'a>( @@ -143,14 +139,14 @@ fn _build_format_options<'a>( return Ok(FormatOptions::new()); }; let format_options = match data_type { - DataType::Date32 => FormatOptions::new().with_date_format(Some(format)), - DataType::Date64 => FormatOptions::new().with_datetime_format(Some(format)), - DataType::Time32(_) => FormatOptions::new().with_time_format(Some(format)), - DataType::Time64(_) => FormatOptions::new().with_time_format(Some(format)), - DataType::Timestamp(_, _) => FormatOptions::new() + Date32 => FormatOptions::new().with_date_format(Some(format)), + Date64 => FormatOptions::new().with_datetime_format(Some(format)), + Time32(_) => FormatOptions::new().with_time_format(Some(format)), + Time64(_) => FormatOptions::new().with_time_format(Some(format)), + Timestamp(_, _) => FormatOptions::new() .with_timestamp_format(Some(format)) .with_timestamp_tz_format(Some(format)), - DataType::Duration(_) => FormatOptions::new().with_duration_format( + Duration(_) => FormatOptions::new().with_duration_format( if "ISO8601".eq_ignore_ascii_case(format) { DurationFormat::ISO8601 } else { @@ -237,243 +233,19 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { } } -/// make_date(year, month, day) SQL function implementation -pub fn make_date(args: &[ColumnarValue]) -> Result { - if args.len() != 3 { - return exec_err!( - "make_date function requires 3 arguments, got {}", - args.len() - ); - } - - // first, identify if any of the arguments is an Array. If yes, store its `len`, - // as any scalar will need to be converted to an array of len `len`. - let len = args - .iter() - .fold(Option::::None, |acc, arg| match arg { - ColumnarValue::Scalar(_) => acc, - ColumnarValue::Array(a) => Some(a.len()), - }); - - let is_scalar = len.is_none(); - let array_size = if is_scalar { 1 } else { len.unwrap() }; - - let years = args[0].cast_to(&DataType::Int32, None)?; - let months = args[1].cast_to(&DataType::Int32, None)?; - let days = args[2].cast_to(&DataType::Int32, None)?; - - // since the epoch for the date32 datatype is the unix epoch - // we need to subtract the unix epoch from the current date - // note this can result in a negative value - let unix_days_from_ce = NaiveDate::from_ymd_opt(1970, 1, 1) - .unwrap() - .num_days_from_ce(); - - let mut builder: PrimitiveBuilder = PrimitiveArray::builder(array_size); - - let construct_date_fn = |builder: &mut PrimitiveBuilder, - year: i32, - month: i32, - day: i32, - unix_days_from_ce: i32| - -> Result<()> { - let Ok(m) = u32::try_from(month) else { - return exec_err!("Month value '{month:?}' is out of range"); - }; - let Ok(d) = u32::try_from(day) else { - return exec_err!("Day value '{day:?}' is out of range"); - }; - - let date = NaiveDate::from_ymd_opt(year, m, d); - - match date { - Some(d) => builder.append_value(d.num_days_from_ce() - unix_days_from_ce), - None => return exec_err!("Unable to parse date from {year}, {month}, {day}"), - }; - Ok(()) - }; - - let scalar_value_fn = |col: &ColumnarValue| -> Result { - let ColumnarValue::Scalar(s) = col else { - return exec_err!("Expected scalar value"); - }; - let ScalarValue::Int32(Some(i)) = s else { - return exec_err!("Unable to parse date from null/empty value"); - }; - Ok(*i) - }; - - // For scalar only columns the operation is faster without using the PrimitiveArray - if is_scalar { - construct_date_fn( - &mut builder, - scalar_value_fn(&years)?, - scalar_value_fn(&months)?, - scalar_value_fn(&days)?, - unix_days_from_ce, - )?; - } else { - let to_primitive_array = |col: &ColumnarValue, - scalar_count: usize| - -> Result> { - match col { - ColumnarValue::Array(a) => Ok(a.as_primitive::().to_owned()), - _ => { - let v = scalar_value_fn(col).unwrap(); - Ok(PrimitiveArray::::from_value(v, scalar_count)) - } - } - }; - - let years = to_primitive_array(&years, array_size).unwrap(); - let months = to_primitive_array(&months, array_size).unwrap(); - let days = to_primitive_array(&days, array_size).unwrap(); - for i in 0..array_size { - construct_date_fn( - &mut builder, - years.value(i), - months.value(i), - days.value(i), - unix_days_from_ce, - )?; - } - } - - let arr = builder.finish(); - - if is_scalar { - // If all inputs are scalar, keeps output as scalar - Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some( - arr.value(0), - )))) - } else { - Ok(ColumnarValue::Array(Arc::new(arr))) - } -} - #[cfg(test)] mod tests { - use std::sync::Arc; - - use arrow::array::{ArrayRef, Int64Array}; + use crate::datetime::to_char::ToCharFunc; use arrow_array::{ - Date32Array, Date64Array, Int32Array, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, - UInt32Array, + Array, ArrayRef, Date32Array, Date64Array, StringArray, Time32MillisecondArray, + Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, }; - + use chrono::{NaiveDateTime, Timelike}; use datafusion_common::ScalarValue; - - use super::*; - - #[test] - fn test_make_date() { - let res = make_date(&[ - ColumnarValue::Scalar(ScalarValue::Int32(Some(2024))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))), - ]) - .expect("that make_date parsed values without error"); - - if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res { - assert_eq!(19736, date.unwrap()); - } else { - panic!("Expected a scalar value") - } - - let res = make_date(&[ - ColumnarValue::Scalar(ScalarValue::Int64(Some(2024))), - ColumnarValue::Scalar(ScalarValue::UInt64(Some(1))), - ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))), - ]) - .expect("that make_date parsed values without error"); - - if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res { - assert_eq!(19736, date.unwrap()); - } else { - panic!("Expected a scalar value") - } - - let res = make_date(&[ - ColumnarValue::Scalar(ScalarValue::Utf8(Some("2024".to_string()))), - ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("1".to_string()))), - ColumnarValue::Scalar(ScalarValue::Utf8(Some("14".to_string()))), - ]) - .expect("that make_date parsed values without error"); - - if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res { - assert_eq!(19736, date.unwrap()); - } else { - panic!("Expected a scalar value") - } - - let years = Arc::new((2021..2025).map(Some).collect::()); - let months = Arc::new((1..5).map(Some).collect::()); - let days = Arc::new((11..15).map(Some).collect::()); - let res = make_date(&[ - ColumnarValue::Array(years), - ColumnarValue::Array(months), - ColumnarValue::Array(days), - ]) - .expect("that make_date parsed values without error"); - - if let ColumnarValue::Array(array) = res { - assert_eq!(array.len(), 4); - let mut builder = Date32Array::builder(4); - builder.append_value(18_638); - builder.append_value(19_035); - builder.append_value(19_429); - builder.append_value(19_827); - assert_eq!(&builder.finish() as &dyn Array, array.as_ref()); - } else { - panic!("Expected a columnar array") - } - - // - // Fallible test cases - // - - // invalid number of arguments - let res = make_date(&[ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Execution error: make_date function requires 3 arguments, got 1" - ); - - // invalid type - let res = make_date(&[ - ColumnarValue::Scalar(ScalarValue::IntervalYearMonth(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Arrow error: Cast error: Casting from Interval(YearMonth) to Int32 not supported" - ); - - // overflow of month - let res = make_date(&[ - ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))), - ColumnarValue::Scalar(ScalarValue::UInt64(Some(u64::MAX))), - ColumnarValue::Scalar(ScalarValue::Int32(Some(22))), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Arrow error: Cast error: Can't cast value 18446744073709551615 to type Int32" - ); - - // overflow of day - let res = make_date(&[ - ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))), - ColumnarValue::Scalar(ScalarValue::Int32(Some(22))), - ColumnarValue::Scalar(ScalarValue::UInt32(Some(u32::MAX))), - ]); - assert_eq!( - res.err().unwrap().strip_backtrace(), - "Arrow error: Cast error: Can't cast value 4294967295 to type Int32" - ); - } + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + use std::sync::Arc; #[test] fn test_to_char() { @@ -551,9 +323,9 @@ mod tests { ]; for (value, format, expected) in scalar_data { - let result = - to_char(&[ColumnarValue::Scalar(value), ColumnarValue::Scalar(format)]) - .expect("that to_char parsed values without error"); + let result = ToCharFunc::new() + .invoke(&[ColumnarValue::Scalar(value), ColumnarValue::Scalar(format)]) + .expect("that to_char parsed values without error"); if let ColumnarValue::Scalar(ScalarValue::Utf8(date)) = result { assert_eq!(expected, date.unwrap()); @@ -625,11 +397,12 @@ mod tests { ]; for (value, format, expected) in scalar_array_data { - let result = to_char(&[ - ColumnarValue::Scalar(value), - ColumnarValue::Array(Arc::new(format) as ArrayRef), - ]) - .expect("that to_char parsed values without error"); + let result = ToCharFunc::new() + .invoke(&[ + ColumnarValue::Scalar(value), + ColumnarValue::Array(Arc::new(format) as ArrayRef), + ]) + .expect("that to_char parsed values without error"); if let ColumnarValue::Scalar(ScalarValue::Utf8(date)) = result { assert_eq!(expected, date.unwrap()); @@ -749,11 +522,12 @@ mod tests { ]; for (value, format, expected) in array_scalar_data { - let result = to_char(&[ - ColumnarValue::Array(value as ArrayRef), - ColumnarValue::Scalar(format), - ]) - .expect("that to_char parsed values without error"); + let result = ToCharFunc::new() + .invoke(&[ + ColumnarValue::Array(value as ArrayRef), + ColumnarValue::Scalar(format), + ]) + .expect("that to_char parsed values without error"); if let ColumnarValue::Array(result) = result { assert_eq!(result.len(), 2); @@ -764,11 +538,12 @@ mod tests { } for (value, format, expected) in array_array_data { - let result = to_char(&[ - ColumnarValue::Array(value), - ColumnarValue::Array(Arc::new(format) as ArrayRef), - ]) - .expect("that to_char parsed values without error"); + let result = ToCharFunc::new() + .invoke(&[ + ColumnarValue::Array(value), + ColumnarValue::Array(Arc::new(format) as ArrayRef), + ]) + .expect("that to_char parsed values without error"); if let ColumnarValue::Array(result) = result { assert_eq!(result.len(), 2); @@ -783,14 +558,15 @@ mod tests { // // invalid number of arguments - let result = to_char(&[ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))]); + let result = ToCharFunc::new() + .invoke(&[ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))]); assert_eq!( result.err().unwrap().strip_backtrace(), "Execution error: to_char function requires 2 arguments, got 1" ); // invalid type - let result = to_char(&[ + let result = ToCharFunc::new().invoke(&[ ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), ]); diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 0dd6fd2a0710..d63ad9bb4a3a 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -85,11 +85,3 @@ tokio = { workspace = true, features = ["rt-multi-thread"] } [[bench]] harness = false name = "in_list" - -[[bench]] -harness = false -name = "make_date" - -[[bench]] -harness = false -name = "to_char" diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 6da3980fede7..9169d83022f1 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -32,8 +32,8 @@ use crate::sort_properties::SortProperties; use crate::{ - array_expressions, conditional_expressions, datetime_expressions, math_expressions, - string_expressions, PhysicalExpr, ScalarFunctionExpr, + array_expressions, conditional_expressions, math_expressions, string_expressions, + PhysicalExpr, ScalarFunctionExpr, }; use arrow::{ array::ArrayRef, @@ -366,8 +366,6 @@ pub fn create_physical_fun( BuiltinScalarFunction::ConcatWithSeparator => Arc::new(|args| { make_scalar_function_inner(string_expressions::concat_ws)(args) }), - BuiltinScalarFunction::MakeDate => Arc::new(datetime_expressions::make_date), - BuiltinScalarFunction::ToChar => Arc::new(datetime_expressions::to_char), BuiltinScalarFunction::InitCap => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::initcap::)(args) diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 6d5d7e85c638..e8b80ee4e1e6 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -20,7 +20,6 @@ pub mod analysis; pub mod array_expressions; pub mod binary_map; pub mod conditional_expressions; -pub mod datetime_expressions; pub mod equivalence; pub mod expressions; pub mod functions; diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index c9e4e0d64ed8..f63602ccb06f 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -672,10 +672,10 @@ enum ScalarFunction { /// 130 was ArrayResize EndsWith = 131; /// 132 was InStr - MakeDate = 133; + /// 133 was MakeDate ArrayReverse = 134; /// 135 is RegexpLike - ToChar = 136; + /// 136 was ToChar /// 137 was ToDate /// 138 was ToUnixtime } diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index e3e1ee966b4a..ef7f0279f16e 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22974,9 +22974,7 @@ impl serde::Serialize for ScalarFunction { Self::SubstrIndex => "SubstrIndex", Self::FindInSet => "FindInSet", Self::EndsWith => "EndsWith", - Self::MakeDate => "MakeDate", Self::ArrayReverse => "ArrayReverse", - Self::ToChar => "ToChar", }; serializer.serialize_str(variant) } @@ -23070,9 +23068,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "SubstrIndex", "FindInSet", "EndsWith", - "MakeDate", "ArrayReverse", - "ToChar", ]; struct GeneratedVisitor; @@ -23195,9 +23191,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "SubstrIndex" => Ok(ScalarFunction::SubstrIndex), "FindInSet" => Ok(ScalarFunction::FindInSet), "EndsWith" => Ok(ScalarFunction::EndsWith), - "MakeDate" => Ok(ScalarFunction::MakeDate), "ArrayReverse" => Ok(ScalarFunction::ArrayReverse), - "ToChar" => Ok(ScalarFunction::ToChar), _ => Err(serde::de::Error::unknown_variant(value, FIELDS)), } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index d7e122404397..4e59f4c69e66 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2945,13 +2945,13 @@ pub enum ScalarFunction { /// / 130 was ArrayResize EndsWith = 131, /// / 132 was InStr - MakeDate = 133, - ArrayReverse = 134, - /// / 135 is RegexpLike + /// / 133 was MakeDate /// + /// / 135 is RegexpLike + /// / 136 was ToChar /// / 137 was ToDate /// / 138 was ToUnixtime - ToChar = 136, + ArrayReverse = 134, } impl ScalarFunction { /// String value of the enum field names used in the ProtoBuf definition. @@ -3042,9 +3042,7 @@ impl ScalarFunction { ScalarFunction::SubstrIndex => "SubstrIndex", ScalarFunction::FindInSet => "FindInSet", ScalarFunction::EndsWith => "EndsWith", - ScalarFunction::MakeDate => "MakeDate", ScalarFunction::ArrayReverse => "ArrayReverse", - ScalarFunction::ToChar => "ToChar", } } /// Creates an enum from field names used in the ProtoBuf definition. @@ -3132,9 +3130,7 @@ impl ScalarFunction { "SubstrIndex" => Some(Self::SubstrIndex), "FindInSet" => Some(Self::FindInSet), "EndsWith" => Some(Self::EndsWith), - "MakeDate" => Some(Self::MakeDate), "ArrayReverse" => Some(Self::ArrayReverse), - "ToChar" => Some(Self::ToChar), _ => None, } } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 024f16fd098c..0b6d979e4603 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -510,8 +510,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Strpos => Self::Strpos, ScalarFunction::Substr => Self::Substr, ScalarFunction::ToHex => Self::ToHex, - ScalarFunction::ToChar => Self::ToChar, - ScalarFunction::MakeDate => Self::MakeDate, ScalarFunction::Uuid => Self::Uuid, ScalarFunction::Translate => Self::Translate, ScalarFunction::Coalesce => Self::Coalesce, @@ -1623,26 +1621,6 @@ pub fn parse_expr( ScalarFunction::ToHex => { Ok(to_hex(parse_expr(&args[0], registry, codec)?)) } - ScalarFunction::MakeDate => { - let args: Vec<_> = args - .iter() - .map(|expr| parse_expr(expr, registry, codec)) - .collect::>()?; - Ok(Expr::ScalarFunction(expr::ScalarFunction::new( - BuiltinScalarFunction::MakeDate, - args, - ))) - } - ScalarFunction::ToChar => { - let args: Vec<_> = args - .iter() - .map(|expr| parse_expr(expr, registry, codec)) - .collect::>()?; - Ok(Expr::ScalarFunction(expr::ScalarFunction::new( - BuiltinScalarFunction::ToChar, - args, - ))) - } ScalarFunction::Translate => Ok(translate( parse_expr(&args[0], registry, codec)?, parse_expr(&args[1], registry, codec)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index e492b96577ec..3835c67b9192 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1453,7 +1453,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Trim => Self::Trim, BuiltinScalarFunction::Ltrim => Self::Ltrim, BuiltinScalarFunction::Rtrim => Self::Rtrim, - BuiltinScalarFunction::ToChar => Self::ToChar, BuiltinScalarFunction::ArrayExcept => Self::ArrayExcept, BuiltinScalarFunction::ArrayElement => Self::ArrayElement, BuiltinScalarFunction::ArrayPopFront => Self::ArrayPopFront, @@ -1494,7 +1493,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Strpos => Self::Strpos, BuiltinScalarFunction::Substr => Self::Substr, BuiltinScalarFunction::ToHex => Self::ToHex, - BuiltinScalarFunction::MakeDate => Self::MakeDate, BuiltinScalarFunction::Translate => Self::Translate, BuiltinScalarFunction::Coalesce => Self::Coalesce, BuiltinScalarFunction::Pi => Self::Pi, diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md index 5545ef52be64..9d3a177be6bd 100644 --- a/docs/source/contributor-guide/index.md +++ b/docs/source/contributor-guide/index.md @@ -279,7 +279,7 @@ Below is a checklist of what you need to do to add a new aggregate function to D - Add the actual implementation of an `Accumulator` and `AggregateExpr`: - [here](../../../datafusion/physical-expr/src/string_expressions.rs) for string functions - [here](../../../datafusion/physical-expr/src/math_expressions.rs) for math functions - - [here](../../../datafusion/physical-expr/src/datetime_expressions.rs) for datetime functions + - [here](../../../datafusion/functions/src/datetime/mod.rs) for datetime functions - create a new module [here](../../../datafusion/physical-expr/src) for other functions - In [datafusion/expr/src](../../../datafusion/expr/src/aggregate_function.rs), add: - a new variant to `AggregateFunction` From 3c26e597aeacde0a5e6a51f30394d3d31c6acd96 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 14 Mar 2024 14:56:24 -0400 Subject: [PATCH 2/2] Fix to_timestamp benchmark (#9608) --- datafusion/functions/benches/to_timestamp.rs | 173 ++++++++++--------- 1 file changed, 92 insertions(+), 81 deletions(-) diff --git a/datafusion/functions/benches/to_timestamp.rs b/datafusion/functions/benches/to_timestamp.rs index c83824526442..31d609dee9bc 100644 --- a/datafusion/functions/benches/to_timestamp.rs +++ b/datafusion/functions/benches/to_timestamp.rs @@ -17,97 +17,108 @@ extern crate criterion; +use std::sync::Arc; + +use arrow_array::builder::StringBuilder; +use arrow_array::ArrayRef; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_expr::lit; -use datafusion_functions::expr_fn::to_timestamp; +use datafusion_expr::ColumnarValue; +use datafusion_functions::datetime::to_timestamp; fn criterion_benchmark(c: &mut Criterion) { c.bench_function("to_timestamp_no_formats", |b| { - let inputs = vec![ - lit("1997-01-31T09:26:56.123Z"), - lit("1997-01-31T09:26:56.123-05:00"), - lit("1997-01-31 09:26:56.123-05:00"), - lit("2023-01-01 04:05:06.789 -08"), - lit("1997-01-31T09:26:56.123"), - lit("1997-01-31 09:26:56.123"), - lit("1997-01-31 09:26:56"), - lit("1997-01-31 13:26:56"), - lit("1997-01-31 13:26:56+04:00"), - lit("1997-01-31"), - ]; + let mut inputs = StringBuilder::new(); + inputs.append_value("1997-01-31T09:26:56.123Z"); + inputs.append_value("1997-01-31T09:26:56.123-05:00"); + inputs.append_value("1997-01-31 09:26:56.123-05:00"); + inputs.append_value("2023-01-01 04:05:06.789 -08"); + inputs.append_value("1997-01-31T09:26:56.123"); + inputs.append_value("1997-01-31 09:26:56.123"); + inputs.append_value("1997-01-31 09:26:56"); + inputs.append_value("1997-01-31 13:26:56"); + inputs.append_value("1997-01-31 13:26:56+04:00"); + inputs.append_value("1997-01-31"); + + let string_array = ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef); + b.iter(|| { - for i in inputs.iter() { - black_box(to_timestamp(vec![i.clone()])); - } - }); + black_box( + to_timestamp() + .invoke(&[string_array.clone()]) + .expect("to_timestamp should work on valid values"), + ) + }) }); c.bench_function("to_timestamp_with_formats", |b| { - let mut inputs = vec![]; - let mut format1 = vec![]; - let mut format2 = vec![]; - let mut format3 = vec![]; - - inputs.push(lit("1997-01-31T09:26:56.123Z")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%dT%H:%M:%S%.f%Z")); - - inputs.push(lit("1997-01-31T09:26:56.123-05:00")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%dT%H:%M:%S%.f%z")); - - inputs.push(lit("1997-01-31 09:26:56.123-05:00")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H:%M:%S%.f%Z")); - - inputs.push(lit("2023-01-01 04:05:06.789 -08")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H:%M:%S%.f %#z")); - - inputs.push(lit("1997-01-31T09:26:56.123")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%dT%H:%M:%S%.f")); - - inputs.push(lit("1997-01-31 09:26:56.123")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H:%M:%S%.f")); - - inputs.push(lit("1997-01-31 09:26:56")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H:%M:%S")); - - inputs.push(lit("1997-01-31 092656")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H%M%S")); - - inputs.push(lit("1997-01-31 092656+04:00")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H%M%S%:z")); - - inputs.push(lit("Sun Jul 8 00:34:60 2001")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d 00:00:00")); - + let mut inputs = StringBuilder::new(); + let mut format1_builder = StringBuilder::with_capacity(2, 10); + let mut format2_builder = StringBuilder::with_capacity(2, 10); + let mut format3_builder = StringBuilder::with_capacity(2, 10); + + inputs.append_value("1997-01-31T09:26:56.123Z"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z"); + + inputs.append_value("1997-01-31T09:26:56.123-05:00"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z"); + + inputs.append_value("1997-01-31 09:26:56.123-05:00"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z"); + + inputs.append_value("2023-01-01 04:05:06.789 -08"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z"); + + inputs.append_value("1997-01-31T09:26:56.123"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f"); + + inputs.append_value("1997-01-31 09:26:56.123"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f"); + + inputs.append_value("1997-01-31 09:26:56"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S"); + + inputs.append_value("1997-01-31 092656"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H%M%S"); + + inputs.append_value("1997-01-31 092656+04:00"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H%M%S%:z"); + + inputs.append_value("Sun Jul 8 00:34:60 2001"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d 00:00:00"); + + let args = [ + ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), + ]; b.iter(|| { - inputs.iter().enumerate().for_each(|(idx, i)| { - black_box(to_timestamp(vec![ - i.clone(), - format1.get(idx).unwrap().clone(), - format2.get(idx).unwrap().clone(), - format3.get(idx).unwrap().clone(), - ])); - }) + black_box( + to_timestamp() + .invoke(&args.clone()) + .expect("to_timestamp should work on valid values"), + ) }) }); }