From e7c0482d05a4251db83eb7d4897318020df6f0b2 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 20 Jan 2024 05:28:24 -0500 Subject: [PATCH] support to_timestamp with optional chrono formats (#8886) * Support to_timestamp with chrono formatting #5398 * Updated user guide's to_timestamp to include chrono formatting information #5398 * Minor comment update. * Small documentation updates for to_timestamp functions. * Cargo fmt and clippy improvements. * Switched to assert and unwrap_err based on feedback * Fixed assert, code compiles and runs as expected now. * Fix fmt (again). * Add additional to_timestamp tests covering usage with tables with and without valid formats. * to_timestamp documentation fixes. * - Changed internal_err! -> exec_err! for unsupported data type errors. - Extracted out to_timestamp_impl method to reduce code duplication as per PR feedback. - Extracted out validate_to_timestamp_data_types to reduce code duplication as per PR feedback. - Added additional tests for argument validation and invalid arguments. - Removed unnecessary shim function 'string_to_timestamp_nanos_with_format_shim' * Resolved merge conflict, updated toStringXXX methods to reflect upstream change * prettier * Fix clippy --------- Co-authored-by: Andrew Lamb --- .../examples/dataframe_to_timestamp.rs | 109 +++ datafusion/expr/src/built_in_function.rs | 68 +- datafusion/expr/src/expr_fn.rs | 25 +- .../physical-expr/src/datetime_expressions.rs | 817 ++++++++++++++++-- .../proto/src/logical_plan/from_proto.rs | 69 +- .../sqllogictest/test_files/timestamps.slt | 143 ++- .../source/user-guide/sql/scalar_functions.md | 63 +- 7 files changed, 1115 insertions(+), 179 deletions(-) create mode 100644 datafusion-examples/examples/dataframe_to_timestamp.rs diff --git a/datafusion-examples/examples/dataframe_to_timestamp.rs b/datafusion-examples/examples/dataframe_to_timestamp.rs new file mode 100644 index 000000000000..8caa9245596b --- /dev/null +++ b/datafusion-examples/examples/dataframe_to_timestamp.rs @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::arrow::array::StringArray; +use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::error::Result; +use datafusion::prelude::*; +use datafusion_common::assert_contains; + +/// This example demonstrates how to use the to_timestamp function in the DataFrame API as well as via sql. +#[tokio::main] +async fn main() -> Result<()> { + // define a schema. + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ])); + + // define data. + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec![ + "2020-09-08T13:42:29Z", + "2020-09-08T13:42:29.190855-05:00", + "2020-08-09 12:13:29", + "2020-01-02", + ])), + Arc::new(StringArray::from(vec![ + "2020-09-08T13:42:29Z", + "2020-09-08T13:42:29.190855-05:00", + "08-09-2020 13/42/29", + "09-27-2020 13:42:29-05:30", + ])), + ], + )?; + + // declare a new context. In spark API, this corresponds to a new spark SQLsession + let ctx = SessionContext::new(); + + // declare a table in memory. In spark API, this corresponds to createDataFrame(...). + ctx.register_batch("t", batch)?; + let df = ctx.table("t").await?; + + // use to_timestamp function to convert col 'a' to timestamp type using the default parsing + let df = df.with_column("a", to_timestamp(vec![col("a")]))?; + // use to_timestamp_seconds function to convert col 'b' to timestamp(Seconds) type using a list of chrono formats to try + let df = df.with_column( + "b", + to_timestamp_seconds(vec![ + col("b"), + lit("%+"), + lit("%d-%m-%Y %H/%M/%S"), + lit("%m-%d-%Y %H:%M:%S%#z"), + ]), + )?; + + let df = df.select_columns(&["a", "b"])?; + + // print the results + df.show().await?; + + // use sql to convert col 'a' to timestamp using the default parsing + let df = ctx.sql("select to_timestamp(a) from t").await?; + + // print the results + df.show().await?; + + // use sql to convert col 'b' to timestamp using a list of chrono formats to try + let df = ctx.sql("select to_timestamp(b, '%+', '%d-%m-%Y %H/%M/%S', '%m-%d-%Y %H:%M:%S%#z') from t").await?; + + // print the results + df.show().await?; + + // use sql to convert a static string to a timestamp using a list of chrono formats to try + let df = ctx.sql("select to_timestamp('01-14-2023 01:01:30+05:30', '%+', '%d-%m-%Y %H/%M/%S', '%m-%d-%Y %H:%M:%S%#z')").await?; + + // print the results + df.show().await?; + + // use sql to convert a static string to a timestamp using a non-matching chrono format to try + let result = ctx + .sql("select to_timestamp('01-14-2023 01/01/30', '%d-%m-%Y %H:%M:%S')") + .await? + .collect() + .await; + + let expected = "Error parsing timestamp from '01-14-2023 01/01/30' using format '%d-%m-%Y %H:%M:%S': input contains invalid characters"; + assert_contains!(result.unwrap_err().to_string(), expected); + + Ok(()) +} diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 6f64642f60d9..b54cd68164c1 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -1053,67 +1053,13 @@ impl BuiltinScalarFunction { vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], self.volatility(), ), - BuiltinScalarFunction::ToTimestamp => Signature::uniform( - 1, - vec![ - Int64, - Float64, - Timestamp(Nanosecond, None), - Timestamp(Microsecond, None), - Timestamp(Millisecond, None), - Timestamp(Second, None), - Utf8, - ], - self.volatility(), - ), - BuiltinScalarFunction::ToTimestampMillis => Signature::uniform( - 1, - vec![ - Int64, - Timestamp(Nanosecond, None), - Timestamp(Microsecond, None), - Timestamp(Millisecond, None), - Timestamp(Second, None), - Utf8, - ], - self.volatility(), - ), - BuiltinScalarFunction::ToTimestampMicros => Signature::uniform( - 1, - vec![ - Int64, - Timestamp(Nanosecond, None), - Timestamp(Microsecond, None), - Timestamp(Millisecond, None), - Timestamp(Second, None), - Utf8, - ], - self.volatility(), - ), - BuiltinScalarFunction::ToTimestampNanos => Signature::uniform( - 1, - vec![ - Int64, - Timestamp(Nanosecond, None), - Timestamp(Microsecond, None), - Timestamp(Millisecond, None), - Timestamp(Second, None), - Utf8, - ], - self.volatility(), - ), - BuiltinScalarFunction::ToTimestampSeconds => Signature::uniform( - 1, - vec![ - Int64, - Timestamp(Nanosecond, None), - Timestamp(Microsecond, None), - Timestamp(Millisecond, None), - Timestamp(Second, None), - Utf8, - ], - self.volatility(), - ), + BuiltinScalarFunction::ToTimestamp + | BuiltinScalarFunction::ToTimestampSeconds + | BuiltinScalarFunction::ToTimestampMillis + | BuiltinScalarFunction::ToTimestampMicros + | BuiltinScalarFunction::ToTimestampNanos => { + Signature::variadic_any(self.volatility()) + } BuiltinScalarFunction::FromUnixtime => { Signature::uniform(1, vec![Int64], self.volatility()) } diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 834420e413b0..ae534f4bb44b 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -885,29 +885,30 @@ nary_scalar_expr!( scalar_expr!(DatePart, date_part, part date, "extracts a subfield from the date"); scalar_expr!(DateTrunc, date_trunc, part date, "truncates the date to a specified level of precision"); scalar_expr!(DateBin, date_bin, stride source origin, "coerces an arbitrary timestamp to the start of the nearest specified interval"); -scalar_expr!( +nary_scalar_expr!( + ToTimestamp, + to_timestamp, + "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`" +); +nary_scalar_expr!( ToTimestampMillis, to_timestamp_millis, - date, - "converts a string to a `Timestamp(Milliseconds, None)`" + "converts a string and optional formats to a `Timestamp(Milliseconds, None)`" ); -scalar_expr!( +nary_scalar_expr!( ToTimestampMicros, to_timestamp_micros, - date, - "converts a string to a `Timestamp(Microseconds, None)`" + "converts a string and optional formats to a `Timestamp(Microseconds, None)`" ); -scalar_expr!( +nary_scalar_expr!( ToTimestampNanos, to_timestamp_nanos, - date, - "converts a string to a `Timestamp(Nanoseconds, None)`" + "converts a string and optional formats to a `Timestamp(Nanoseconds, None)`" ); -scalar_expr!( +nary_scalar_expr!( ToTimestampSeconds, to_timestamp_seconds, - date, - "converts a string to a `Timestamp(Seconds, None)`" + "converts a string and optional formats to a `Timestamp(Seconds, None)`" ); scalar_expr!( FromUnixtime, diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index 589bbc8a952b..d21d89c19d2e 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -17,7 +17,6 @@ //! DateTime expressions -use crate::datetime_expressions; use crate::expressions::cast_column; use arrow::compute::cast; use arrow::{ @@ -37,7 +36,9 @@ use arrow::{ use arrow_array::temporal_conversions::NANOSECONDS; use arrow_array::timezone::Tz; use arrow_array::types::ArrowTimestampType; +use arrow_array::GenericStringArray; use chrono::prelude::*; +use chrono::LocalResult::Single; use chrono::{Duration, Months, NaiveDate}; use datafusion_common::cast::{ as_date32_array, as_date64_array, as_generic_string_array, as_primitive_array, @@ -49,9 +50,96 @@ use datafusion_common::{ ScalarValue, }; use datafusion_expr::ColumnarValue; +use itertools::Either; use std::str::FromStr; use std::sync::Arc; +/// Error message if nanosecond conversion request beyond supported interval +const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; + +/// Accepts a string with a `chrono` format and converts it to a +/// nanosecond precision timestamp. +/// +/// See [`chrono::format::strftime`] for the full set of supported formats. +/// +/// Implements the `to_timestamp` function to convert a string to a +/// timestamp, following the model of spark SQL’s to_`timestamp`. +/// +/// Internally, this function uses the `chrono` library for the +/// datetime parsing +/// +/// ## Timestamp Precision +/// +/// Function uses the maximum precision timestamps supported by +/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This +/// means the range of dates that timestamps can represent is ~1677 AD +/// to 2262 AM +/// +/// ## Timezone / Offset Handling +/// +/// Numerical values of timestamps are stored compared to offset UTC. +/// +/// Any timestamp in the formatting string is handled according to the rules +/// defined by `chrono`. +/// +/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html +/// +#[inline] +pub(crate) fn string_to_timestamp_nanos_formatted( + s: &str, + format: &str, +) -> Result { + string_to_datetime_formatted(&Utc, s, format)? + .naive_utc() + .timestamp_nanos_opt() + .ok_or_else(|| { + DataFusionError::Execution(ERR_NANOSECONDS_NOT_SUPPORTED.to_string()) + }) +} + +/// Accepts a string and parses it using the [`chrono::format::strftime`] specifiers +/// relative to the provided `timezone` +/// +/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled +/// +/// * `2023-01-01 040506 America/Los_Angeles` +/// +/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error +/// will be returned +/// +/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html +/// [IANA timezones]: https://www.iana.org/time-zones +pub(crate) fn string_to_datetime_formatted( + timezone: &T, + s: &str, + format: &str, +) -> Result, DataFusionError> { + let err = |err_ctx: &str| { + DataFusionError::Execution(format!( + "Error parsing timestamp from '{s}' using format '{format}': {err_ctx}" + )) + }; + + // attempt to parse the string assuming it has a timezone + let dt = DateTime::parse_from_str(s, format); + + if let Err(e) = &dt { + // no timezone or other failure, try without a timezone + let ndt = NaiveDateTime::parse_from_str(s, format); + if let Err(e) = &ndt { + return Err(err(&e.to_string())); + } + + if let Single(e) = &timezone.from_local_datetime(&ndt.unwrap()) { + Ok(e.to_owned()) + } else { + Err(err(&e.to_string())) + } + } else { + Ok(dt.unwrap().with_timezone(timezone)) + } +} + /// given a function `op` that maps a `&str` to a Result of an arrow native type, /// returns a `PrimitiveArray` after the application /// of the function to `args[0]`. @@ -84,7 +172,96 @@ where array.iter().map(|x| x.map(&op).transpose()).collect() } -// given an function that maps a `&str` to a arrow native type, +/// given a function `op` that maps `&str`, `&str` to the first successful Result +/// of an arrow native type, returns a `PrimitiveArray` after the application of the +/// function to `args` and the subsequence application of the `op2` function to any +/// successful result. This function calls the `op` function with the first and second +/// argument and if not successful continues with first and third, first and fourth, +/// etc until the result was successful or no more arguments are present. +/// # Errors +/// This function errors iff: +/// * the number of arguments is not > 1 or +/// * the array arguments are not castable to a `GenericStringArray` or +/// * the function `op` errors for all input +pub(crate) fn strings_to_primitive_function<'a, T, O, F, F2>( + args: &'a [ColumnarValue], + op: F, + op2: F2, + name: &str, +) -> Result> +where + O: ArrowPrimitiveType, + T: OffsetSizeTrait, + F: Fn(&'a str, &'a str) -> Result, + F2: Fn(O::Native) -> O::Native, +{ + if args.len() < 2 { + return internal_err!( + "{:?} args were supplied but {} takes 2 or more arguments", + args.len(), + name + ); + } + + // this will throw the error if any of the array args are not castable to GenericStringArray + let data = args + .iter() + .map(|a| match a { + ColumnarValue::Array(a) => { + Ok(Either::Left(as_generic_string_array::(a.as_ref())?)) + } + ColumnarValue::Scalar(s) => match s { + ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => Ok(Either::Right(a)), + other => internal_err!( + "Unexpected scalar type encountered '{other}' for function '{name}'" + ), + }, + }) + .collect::, &Option>>>>()?; + + let first_arg = &data.first().unwrap().left().unwrap(); + + first_arg + .iter() + .enumerate() + .map(|(pos, x)| { + let mut val = None; + + if let Some(x) = x { + let param_args = data.iter().skip(1); + + // go through the args and find the first successful result. Only the last + // failure will be returned if no successful result was received. + for param_arg in param_args { + // param_arg is an array, use the corresponding index into the array as the arg + // we're currently parsing + let p = *param_arg; + let r = if p.is_left() { + let p = p.left().unwrap(); + op(x, p.value(pos)) + } + // args is a scalar, use it directly + else if let Some(p) = p.right().unwrap() { + op(x, p.as_str()) + } else { + continue; + }; + + if r.is_ok() { + val = Some(Ok(op2(r.unwrap()))); + break; + } else { + val = Some(r); + } + } + }; + + val.transpose() + }) + .collect() +} + +// given an function that maps a `&str` to an arrow native type, // returns a `ColumnarValue` where the function is applied to either a `ArrayRef` or `ScalarValue` // depending on the `args`'s variant. fn handle<'a, O, F, S>( @@ -99,24 +276,112 @@ where { match &args[0] { ColumnarValue::Array(a) => match a.data_type() { - DataType::Utf8 => Ok(ColumnarValue::Array(Arc::new( + DataType::Utf8 | DataType::LargeUtf8 => Ok(ColumnarValue::Array(Arc::new( unary_string_to_primitive_function::(&[a.as_ref()], op, name)?, ))), - DataType::LargeUtf8 => Ok(ColumnarValue::Array(Arc::new( - unary_string_to_primitive_function::(&[a.as_ref()], op, name)?, - ))), - other => internal_err!("Unsupported data type {other:?} for function {name}"), + other => exec_err!("Unsupported data type {other:?} for function {name}"), }, ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(a) => { + ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => { let result = a.as_ref().map(|x| (op)(x)).transpose()?; Ok(ColumnarValue::Scalar(S::scalar(result))) } - ScalarValue::LargeUtf8(a) => { - let result = a.as_ref().map(|x| (op)(x)).transpose()?; - Ok(ColumnarValue::Scalar(S::scalar(result))) + other => exec_err!("Unsupported data type {other:?} for function {name}"), + }, + } +} + +// given an function that maps a `&str`, `&str` to an arrow native type, +// returns a `ColumnarValue` where the function is applied to either a `ArrayRef` or `ScalarValue` +// depending on the `args`'s variant. +fn handle_multiple<'a, O, F, S, M>( + args: &'a [ColumnarValue], + op: F, + op2: M, + name: &str, +) -> Result +where + O: ArrowPrimitiveType, + S: ScalarType, + F: Fn(&'a str, &'a str) -> Result, + M: Fn(O::Native) -> O::Native, +{ + match &args[0] { + ColumnarValue::Array(a) => match a.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + // validate the column types + for (pos, arg) in args.iter().enumerate() { + match arg { + ColumnarValue::Array(arg) => match arg.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + // all good + }, + other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"), + }, + ColumnarValue::Scalar(arg) => { match arg.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + // all good + }, + other => return exec_err!("Unsupported data type {other:?} for function {name}, arg # {pos}"), + }} + } + } + + Ok(ColumnarValue::Array(Arc::new( + strings_to_primitive_function::(args, op, op2, name)?, + ))) + } + other => { + exec_err!("Unsupported data type {other:?} for function {name}") + } + }, + // if the first argument is a scalar utf8 all arguments are expected to be scalar utf8 + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => { + let mut val: Option> = None; + let mut err: Option = None; + + match a { + Some(a) => { + // enumerate all the values finding the first one that returns an Ok result + for (pos, v) in args.iter().enumerate().skip(1) { + if let ColumnarValue::Scalar(s) = v { + if let ScalarValue::Utf8(x) | ScalarValue::LargeUtf8(x) = + s + { + if let Some(s) = x { + match op(a.as_str(), s.as_str()) { + Ok(r) => { + val = Some(Ok(ColumnarValue::Scalar( + S::scalar(Some(op2(r))), + ))); + break; + } + Err(e) => { + err = Some(e); + } + } + } + } else { + return exec_err!("Unsupported data type {s:?} for function {name}, arg # {pos}"); + } + } else { + return exec_err!("Unsupported data type {v:?} for function {name}, arg # {pos}"); + } + } + } + None => (), + } + + if let Some(v) = val { + v + } else { + Err(err.unwrap()) + } + } + other => { + exec_err!("Unsupported data type {other:?} for function {name}") } - other => internal_err!("Unsupported data type {other:?} for function {name}"), }, } } @@ -126,53 +391,61 @@ fn string_to_timestamp_nanos_shim(s: &str) -> Result { string_to_timestamp_nanos(s).map_err(|e| e.into()) } +fn to_timestamp_impl>( + args: &[ColumnarValue], + name: &str, +) -> Result { + let factor = match T::UNIT { + TimeUnit::Second => 1_000_000_000, + TimeUnit::Millisecond => 1_000_000, + TimeUnit::Microsecond => 1_000, + TimeUnit::Nanosecond => 1, + }; + + match args.len() { + 1 => handle::( + args, + |s| string_to_timestamp_nanos_shim(s).map(|n| n / factor), + name, + ), + n if n >= 2 => handle_multiple::( + args, + string_to_timestamp_nanos_formatted, + |n| n / factor, + name, + ), + _ => internal_err!("Unsupported 0 argument count for function {name}"), + } +} + /// to_timestamp SQL function /// -/// Note: `to_timestamp` returns `Timestamp(Nanosecond)` though its arguments are interpreted as **seconds**. The supported range for integer input is between `-9223372037` and `9223372036`. +/// Note: `to_timestamp` returns `Timestamp(Nanosecond)` though its arguments are interpreted as **seconds**. +/// The supported range for integer input is between `-9223372037` and `9223372036`. /// Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. /// Please use `to_timestamp_seconds` for the input outside of supported bounds. pub fn to_timestamp(args: &[ColumnarValue]) -> Result { - handle::( - args, - string_to_timestamp_nanos_shim, - "to_timestamp", - ) + to_timestamp_impl::(args, "to_timestamp") } /// to_timestamp_millis SQL function pub fn to_timestamp_millis(args: &[ColumnarValue]) -> Result { - handle::( - args, - |s| string_to_timestamp_nanos_shim(s).map(|n| n / 1_000_000), - "to_timestamp_millis", - ) + to_timestamp_impl::(args, "to_timestamp_millis") } /// to_timestamp_micros SQL function pub fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { - handle::( - args, - |s| string_to_timestamp_nanos_shim(s).map(|n| n / 1_000), - "to_timestamp_micros", - ) + to_timestamp_impl::(args, "to_timestamp_micros") } /// to_timestamp_nanos SQL function pub fn to_timestamp_nanos(args: &[ColumnarValue]) -> Result { - handle::( - args, - string_to_timestamp_nanos_shim, - "to_timestamp_nanos", - ) + to_timestamp_impl::(args, "to_timestamp_nanos") } /// to_timestamp_seconds SQL function pub fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result { - handle::( - args, - |s| string_to_timestamp_nanos_shim(s).map(|n| n / 1_000_000_000), - "to_timestamp_seconds", - ) + to_timestamp_impl::(args, "to_timestamp_seconds") } /// Create an implementation of `now()` that always returns the @@ -915,22 +1188,51 @@ where Ok(b) } -/// to_timestammp() SQL function implementation +fn validate_to_timestamp_data_types( + args: &[ColumnarValue], + name: &str, +) -> Option> { + for (idx, a) in args.iter().skip(1).enumerate() { + match a.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + // all good + } + _ => { + return Some(internal_err!( + "{name} function unsupported data type at index {}: {}", + idx + 1, + a.data_type() + )); + } + } + } + + None +} + +/// to_timestamp() SQL function implementation pub fn to_timestamp_invoke(args: &[ColumnarValue]) -> Result { - if args.len() != 1 { + if args.is_empty() { return internal_err!( - "to_timestamp function requires 1 arguments, got {}", + "to_timestamp function requires 1 or more arguments, got {}", args.len() ); } + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_to_timestamp_data_types(args, "to_timestamp") { + return value; + } + } + match args[0].data_type() { - DataType::Int64 => cast_column( + DataType::Int32 | DataType::Int64 => cast_column( &cast_column(&args[0], &DataType::Timestamp(TimeUnit::Second, None), None)?, &DataType::Timestamp(TimeUnit::Nanosecond, None), None, ), - DataType::Float64 => cast_column( + DataType::Null | DataType::Float64 => cast_column( &args[0], &DataType::Timestamp(TimeUnit::Nanosecond, None), None, @@ -940,7 +1242,7 @@ pub fn to_timestamp_invoke(args: &[ColumnarValue]) -> Result { &DataType::Timestamp(TimeUnit::Nanosecond, None), None, ), - DataType::Utf8 => datetime_expressions::to_timestamp(args), + DataType::Utf8 => to_timestamp(args), other => { internal_err!( "Unsupported data type {:?} for function to_timestamp", @@ -952,20 +1254,31 @@ pub fn to_timestamp_invoke(args: &[ColumnarValue]) -> Result { /// to_timestamp_millis() SQL function implementation pub fn to_timestamp_millis_invoke(args: &[ColumnarValue]) -> Result { - if args.len() != 1 { + if args.is_empty() { return internal_err!( - "to_timestamp_millis function requires 1 argument, got {}", + "to_timestamp_millis function requires 1 or more arguments, got {}", args.len() ); } + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_to_timestamp_data_types(args, "to_timestamp_millis") + { + return value; + } + } + match args[0].data_type() { - DataType::Int64 | DataType::Timestamp(_, None) => cast_column( + DataType::Null + | DataType::Int32 + | DataType::Int64 + | DataType::Timestamp(_, None) => cast_column( &args[0], &DataType::Timestamp(TimeUnit::Millisecond, None), None, ), - DataType::Utf8 => datetime_expressions::to_timestamp_millis(args), + DataType::Utf8 => to_timestamp_millis(args), other => { internal_err!( "Unsupported data type {:?} for function to_timestamp_millis", @@ -977,20 +1290,31 @@ pub fn to_timestamp_millis_invoke(args: &[ColumnarValue]) -> Result Result { - if args.len() != 1 { + if args.is_empty() { return internal_err!( - "to_timestamp_micros function requires 1 argument, got {}", + "to_timestamp_micros function requires 1 or more arguments, got {}", args.len() ); } + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_to_timestamp_data_types(args, "to_timestamp_micros") + { + return value; + } + } + match args[0].data_type() { - DataType::Int64 | DataType::Timestamp(_, None) => cast_column( + DataType::Null + | DataType::Int32 + | DataType::Int64 + | DataType::Timestamp(_, None) => cast_column( &args[0], &DataType::Timestamp(TimeUnit::Microsecond, None), None, ), - DataType::Utf8 => datetime_expressions::to_timestamp_micros(args), + DataType::Utf8 => to_timestamp_micros(args), other => { internal_err!( "Unsupported data type {:?} for function to_timestamp_micros", @@ -1002,20 +1326,31 @@ pub fn to_timestamp_micros_invoke(args: &[ColumnarValue]) -> Result Result { - if args.len() != 1 { + if args.is_empty() { return internal_err!( - "to_timestamp_nanos function requires 1 argument, got {}", + "to_timestamp_nanos function requires 1 or more arguments, got {}", args.len() ); } + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = validate_to_timestamp_data_types(args, "to_timestamp_nanos") + { + return value; + } + } + match args[0].data_type() { - DataType::Int64 | DataType::Timestamp(_, None) => cast_column( + DataType::Null + | DataType::Int32 + | DataType::Int64 + | DataType::Timestamp(_, None) => cast_column( &args[0], &DataType::Timestamp(TimeUnit::Nanosecond, None), None, ), - DataType::Utf8 => datetime_expressions::to_timestamp_nanos(args), + DataType::Utf8 => to_timestamp_nanos(args), other => { internal_err!( "Unsupported data type {:?} for function to_timestamp_nanos", @@ -1027,18 +1362,30 @@ pub fn to_timestamp_nanos_invoke(args: &[ColumnarValue]) -> Result Result { - if args.len() != 1 { + if args.is_empty() { return internal_err!( - "to_timestamp_seconds function requires 1 argument, got {}", + "to_timestamp_seconds function requires 1 or more arguments, got {}", args.len() ); } + // validate that any args after the first one are Utf8 + if args.len() > 1 { + if let Some(value) = + validate_to_timestamp_data_types(args, "to_timestamp_seconds") + { + return value; + } + } + match args[0].data_type() { - DataType::Int64 | DataType::Timestamp(_, None) => { + DataType::Null + | DataType::Int32 + | DataType::Int64 + | DataType::Timestamp(_, None) => { cast_column(&args[0], &DataType::Timestamp(TimeUnit::Second, None), None) } - DataType::Utf8 => datetime_expressions::to_timestamp_seconds(args), + DataType::Utf8 => to_timestamp_seconds(args), other => { internal_err!( "Unsupported data type {:?} for function to_timestamp_seconds", @@ -1077,7 +1424,13 @@ mod tests { use arrow::array::{ as_primitive_array, ArrayRef, Int64Array, IntervalDayTimeArray, StringBuilder, }; - use arrow_array::TimestampNanosecondArray; + use arrow_array::types::Int64Type; + use arrow_array::{ + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, + }; + use datafusion_common::assert_contains; + use datafusion_expr::ScalarFunctionImplementation; use super::*; @@ -1108,6 +1461,47 @@ mod tests { Ok(()) } + #[test] + fn to_timestamp_with_formats_arrays_and_nulls() -> Result<()> { + // ensure that arrow array implementation is wired up and handles nulls correctly + + let mut date_string_builder = StringBuilder::with_capacity(2, 1024); + let mut format1_builder = StringBuilder::with_capacity(2, 1024); + let mut format2_builder = StringBuilder::with_capacity(2, 1024); + let mut format3_builder = StringBuilder::with_capacity(2, 1024); + let mut ts_builder = TimestampNanosecondArray::builder(2); + + date_string_builder.append_null(); + format1_builder.append_null(); + format2_builder.append_null(); + format3_builder.append_null(); + ts_builder.append_null(); + + date_string_builder.append_value("2020-09-08T13:42:29.19085Z"); + format1_builder.append_value("%s"); + format2_builder.append_value("%c"); + format3_builder.append_value("%+"); + ts_builder.append_value(1599572549190850000); + + let expected_timestamps = &ts_builder.finish() as &dyn Array; + + let string_array = [ + ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), + ]; + let parsed_timestamps = to_timestamp(&string_array) + .expect("that to_timestamp with format args parsed values without error"); + if let ColumnarValue::Array(parsed_array) = parsed_timestamps { + assert_eq!(parsed_array.len(), 2); + assert_eq!(expected_timestamps, parsed_array.as_ref()); + } else { + panic!("Expected a columnar array") + } + Ok(()) + } + #[test] fn date_trunc_test() { let cases = vec![ @@ -1663,7 +2057,7 @@ mod tests { let int64array = ColumnarValue::Array(Arc::new(builder.finish())); let expected_err = - "Internal error: Unsupported data type Int64 for function to_timestamp"; + "Execution error: Unsupported data type Int64 for function to_timestamp"; match to_timestamp(&[int64array]) { Ok(_) => panic!("Expected error but got success"), Err(e) => { @@ -1675,4 +2069,303 @@ mod tests { } Ok(()) } + + #[test] + fn to_timestamp_with_formats_invalid_input_type() -> Result<()> { + // pass the wrong type of input array to to_timestamp and test + // that we get an error. + + let mut builder = Int64Array::builder(1); + builder.append_value(1); + let int64array = [ + ColumnarValue::Array(Arc::new(builder.finish())), + ColumnarValue::Array(Arc::new(builder.finish())), + ]; + + let expected_err = + "Execution error: Unsupported data type Int64 for function to_timestamp"; + match to_timestamp(&int64array) { + Ok(_) => panic!("Expected error but got success"), + Err(e) => { + assert!( + e.to_string().contains(expected_err), + "Can not find expected error '{expected_err}'. Actual error '{e}'" + ); + } + } + Ok(()) + } + + #[test] + fn to_timestamp_with_unparseable_data() -> Result<()> { + let mut date_string_builder = StringBuilder::with_capacity(2, 1024); + + date_string_builder.append_null(); + + date_string_builder.append_value("2020-09-08 - 13:42:29.19085Z"); + + let string_array = + ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef); + + let expected_err = + "Arrow error: Parser error: Error parsing timestamp from '2020-09-08 - 13:42:29.19085Z': error parsing time"; + match to_timestamp(&[string_array]) { + Ok(_) => panic!("Expected error but got success"), + Err(e) => { + assert!( + e.to_string().contains(expected_err), + "Can not find expected error '{expected_err}'. Actual error '{e}'" + ); + } + } + Ok(()) + } + + #[test] + fn to_timestamp_with_no_matching_formats() -> Result<()> { + let mut date_string_builder = StringBuilder::with_capacity(2, 1024); + let mut format1_builder = StringBuilder::with_capacity(2, 1024); + let mut format2_builder = StringBuilder::with_capacity(2, 1024); + let mut format3_builder = StringBuilder::with_capacity(2, 1024); + + date_string_builder.append_null(); + format1_builder.append_null(); + format2_builder.append_null(); + format3_builder.append_null(); + + date_string_builder.append_value("2020-09-08T13:42:29.19085Z"); + format1_builder.append_value("%s"); + format2_builder.append_value("%c"); + format3_builder.append_value("%H:%M:%S"); + + let string_array = [ + ColumnarValue::Array(Arc::new(date_string_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), + ]; + + let expected_err = + "Execution error: Error parsing timestamp from '2020-09-08T13:42:29.19085Z' using format '%H:%M:%S': input contains invalid characters"; + match to_timestamp(&string_array) { + Ok(_) => panic!("Expected error but got success"), + Err(e) => { + assert!( + e.to_string().contains(expected_err), + "Can not find expected error '{expected_err}'. Actual error '{e}'" + ); + } + } + Ok(()) + } + + #[test] + fn string_to_timestamp_formatted() { + // Explicit timezone + assert_eq!( + 1599572549190855000, + parse_timestamp_formatted("2020-09-08T13:42:29.190855+00:00", "%+").unwrap() + ); + assert_eq!( + 1599572549190855000, + parse_timestamp_formatted("2020-09-08T13:42:29.190855Z", "%+").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp_formatted("2020-09-08T13:42:29Z", "%+").unwrap() + ); // no fractional part + assert_eq!( + 1599590549190855000, + parse_timestamp_formatted("2020-09-08T13:42:29.190855-05:00", "%+").unwrap() + ); + assert_eq!( + 1599590549000000000, + parse_timestamp_formatted("1599590549", "%s").unwrap() + ); + assert_eq!( + 1599572549000000000, + parse_timestamp_formatted("09-08-2020 13/42/29", "%m-%d-%Y %H/%M/%S") + .unwrap() + ); + } + + fn parse_timestamp_formatted(s: &str, format: &str) -> Result { + let result = string_to_timestamp_nanos_formatted(s, format); + if let Err(e) = &result { + eprintln!("Error parsing timestamp '{s}' using format '{format}': {e:?}"); + } + result + } + + #[test] + fn string_to_timestamp_formatted_invalid() { + // Test parsing invalid formats + let cases = [ + ("", "%Y%m%d %H%M%S", "premature end of input"), + ("SS", "%c", "premature end of input"), + ("Wed, 18 Feb 2015 23:16:09 GMT", "", "trailing input"), + ( + "Wed, 18 Feb 2015 23:16:09 GMT", + "%XX", + "input contains invalid characters", + ), + ( + "Wed, 18 Feb 2015 23:16:09 GMT", + "%Y%m%d %H%M%S", + "input contains invalid characters", + ), + ]; + + for (s, f, ctx) in cases { + let expected = format!("Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}"); + let actual = string_to_datetime_formatted(&Utc, s, f) + .unwrap_err() + .to_string(); + assert_eq!(actual, expected) + } + } + + #[test] + fn string_to_timestamp_invalid_arguments() { + // Test parsing invalid formats + let cases = [ + ("", "%Y%m%d %H%M%S", "premature end of input"), + ("SS", "%c", "premature end of input"), + ("Wed, 18 Feb 2015 23:16:09 GMT", "", "trailing input"), + ( + "Wed, 18 Feb 2015 23:16:09 GMT", + "%XX", + "input contains invalid characters", + ), + ( + "Wed, 18 Feb 2015 23:16:09 GMT", + "%Y%m%d %H%M%S", + "input contains invalid characters", + ), + ]; + + for (s, f, ctx) in cases { + let expected = format!("Execution error: Error parsing timestamp from '{s}' using format '{f}': {ctx}"); + let actual = string_to_datetime_formatted(&Utc, s, f) + .unwrap_err() + .to_string(); + assert_eq!(actual, expected) + } + } + + #[test] + fn test_to_timestamp_arg_validation() { + let mut date_string_builder = StringBuilder::with_capacity(2, 1024); + date_string_builder.append_value("2020-09-08T13:42:29.19085Z"); + + let data = date_string_builder.finish(); + + let funcs: Vec<(ScalarFunctionImplementation, TimeUnit)> = vec![ + (Arc::new(to_timestamp), TimeUnit::Nanosecond), + (Arc::new(to_timestamp_micros), TimeUnit::Microsecond), + (Arc::new(to_timestamp_millis), TimeUnit::Millisecond), + (Arc::new(to_timestamp_nanos), TimeUnit::Nanosecond), + (Arc::new(to_timestamp_seconds), TimeUnit::Second), + ]; + + let mut nanos_builder = TimestampNanosecondArray::builder(2); + let mut millis_builder = TimestampMillisecondArray::builder(2); + let mut micros_builder = TimestampMicrosecondArray::builder(2); + let mut sec_builder = TimestampSecondArray::builder(2); + + nanos_builder.append_value(1599572549190850000); + millis_builder.append_value(1599572549190); + micros_builder.append_value(1599572549190850); + sec_builder.append_value(1599572549); + + let nanos_expected_timestamps = &nanos_builder.finish() as &dyn Array; + let millis_expected_timestamps = &millis_builder.finish() as &dyn Array; + let micros_expected_timestamps = µs_builder.finish() as &dyn Array; + let sec_expected_timestamps = &sec_builder.finish() as &dyn Array; + + for (func, time_unit) in funcs { + // test UTF8 + let string_array = [ + ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("%s".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("%c".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("%+".to_string()))), + ]; + let parsed_timestamps = func(&string_array) + .expect("that to_timestamp with format args parsed values without error"); + if let ColumnarValue::Array(parsed_array) = parsed_timestamps { + assert_eq!(parsed_array.len(), 1); + match time_unit { + TimeUnit::Nanosecond => { + assert_eq!(nanos_expected_timestamps, parsed_array.as_ref()) + } + TimeUnit::Millisecond => { + assert_eq!(millis_expected_timestamps, parsed_array.as_ref()) + } + TimeUnit::Microsecond => { + assert_eq!(micros_expected_timestamps, parsed_array.as_ref()) + } + TimeUnit::Second => { + assert_eq!(sec_expected_timestamps, parsed_array.as_ref()) + } + }; + } else { + panic!("Expected a columnar array") + } + + // test LargeUTF8 + let string_array = [ + ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("%s".to_string()))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("%c".to_string()))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("%+".to_string()))), + ]; + let parsed_timestamps = func(&string_array) + .expect("that to_timestamp with format args parsed values without error"); + if let ColumnarValue::Array(parsed_array) = parsed_timestamps { + assert_eq!(parsed_array.len(), 1); + match time_unit { + TimeUnit::Nanosecond => { + assert_eq!(nanos_expected_timestamps, parsed_array.as_ref()) + } + TimeUnit::Millisecond => { + assert_eq!(millis_expected_timestamps, parsed_array.as_ref()) + } + TimeUnit::Microsecond => { + assert_eq!(micros_expected_timestamps, parsed_array.as_ref()) + } + TimeUnit::Second => { + assert_eq!(sec_expected_timestamps, parsed_array.as_ref()) + } + }; + } else { + panic!("Expected a columnar array") + } + + // test other types + let string_array = [ + ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), + ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), + ColumnarValue::Scalar(ScalarValue::Int32(Some(2))), + ColumnarValue::Scalar(ScalarValue::Int32(Some(3))), + ]; + + let expected = "Unsupported data type Int32 for function".to_string(); + let actual = func(&string_array).unwrap_err().to_string(); + assert_contains!(actual, expected); + + // test other types + let string_array = [ + ColumnarValue::Array(Arc::new(data.clone()) as ArrayRef), + ColumnarValue::Array(Arc::new(PrimitiveArray::::new( + vec![1i64].into(), + None, + )) as ArrayRef), + ]; + + let expected = "Unsupported data type".to_string(); + let actual = func(&string_array).unwrap_err().to_string(); + assert_contains!(actual, expected); + } + } } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 973e366d0bbd..aae19a15b89a 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -65,10 +65,9 @@ use datafusion_expr::{ radians, random, regexp_match, regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt, starts_with, string_to_array, strpos, struct_fun, substr, substr_index, - substring, tan, tanh, to_hex, to_timestamp_micros, to_timestamp_millis, - to_timestamp_nanos, to_timestamp_seconds, translate, trim, trunc, upper, uuid, - AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, - Case, Cast, Expr, GetFieldAccess, GetIndexedField, GroupingSet, + substring, tan, tanh, to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, + Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, + GetFieldAccess, GetIndexedField, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -476,7 +475,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Trim => Self::Trim, ScalarFunction::Ltrim => Self::Ltrim, ScalarFunction::Rtrim => Self::Rtrim, - ScalarFunction::ToTimestamp => Self::ToTimestamp, ScalarFunction::ArrayAppend => Self::ArrayAppend, ScalarFunction::ArraySort => Self::ArraySort, ScalarFunction::ArrayConcat => Self::ArrayConcat, @@ -523,7 +521,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Digest => Self::Digest, ScalarFunction::Encode => Self::Encode, ScalarFunction::Decode => Self::Decode, - ScalarFunction::ToTimestampMillis => Self::ToTimestampMillis, ScalarFunction::Log2 => Self::Log2, ScalarFunction::Signum => Self::Signum, ScalarFunction::Ascii => Self::Ascii, @@ -548,6 +545,8 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Strpos => Self::Strpos, ScalarFunction::Substr => Self::Substr, ScalarFunction::ToHex => Self::ToHex, + ScalarFunction::ToTimestamp => Self::ToTimestamp, + ScalarFunction::ToTimestampMillis => Self::ToTimestampMillis, ScalarFunction::ToTimestampMicros => Self::ToTimestampMicros, ScalarFunction::ToTimestampNanos => Self::ToTimestampNanos, ScalarFunction::ToTimestampSeconds => Self::ToTimestampSeconds, @@ -1689,17 +1688,55 @@ pub fn parse_expr( parse_expr(&args[1], registry)?, )), ScalarFunction::ToHex => Ok(to_hex(parse_expr(&args[0], registry)?)), + ScalarFunction::ToTimestamp => { + let args: Vec<_> = args + .iter() + .map(|expr| parse_expr(expr, registry)) + .collect::>()?; + Ok(Expr::ScalarFunction(expr::ScalarFunction::new( + BuiltinScalarFunction::ToTimestamp, + args, + ))) + } ScalarFunction::ToTimestampMillis => { - Ok(to_timestamp_millis(parse_expr(&args[0], registry)?)) + let args: Vec<_> = args + .iter() + .map(|expr| parse_expr(expr, registry)) + .collect::>()?; + Ok(Expr::ScalarFunction(expr::ScalarFunction::new( + BuiltinScalarFunction::ToTimestampMillis, + args, + ))) } ScalarFunction::ToTimestampMicros => { - Ok(to_timestamp_micros(parse_expr(&args[0], registry)?)) + let args: Vec<_> = args + .iter() + .map(|expr| parse_expr(expr, registry)) + .collect::>()?; + Ok(Expr::ScalarFunction(expr::ScalarFunction::new( + BuiltinScalarFunction::ToTimestampMicros, + args, + ))) } ScalarFunction::ToTimestampNanos => { - Ok(to_timestamp_nanos(parse_expr(&args[0], registry)?)) + let args: Vec<_> = args + .iter() + .map(|expr| parse_expr(expr, registry)) + .collect::>()?; + Ok(Expr::ScalarFunction(expr::ScalarFunction::new( + BuiltinScalarFunction::ToTimestampNanos, + args, + ))) } ScalarFunction::ToTimestampSeconds => { - Ok(to_timestamp_seconds(parse_expr(&args[0], registry)?)) + let args: Vec<_> = args + .iter() + .map(|expr| parse_expr(expr, registry)) + .collect::>()?; + Ok(Expr::ScalarFunction(expr::ScalarFunction::new( + BuiltinScalarFunction::ToTimestampSeconds, + args, + ))) } ScalarFunction::Now => Ok(now()), ScalarFunction::Translate => Ok(translate( @@ -1741,18 +1778,6 @@ pub fn parse_expr( ScalarFunction::ArrowTypeof => { Ok(arrow_typeof(parse_expr(&args[0], registry)?)) } - ScalarFunction::ToTimestamp => { - let args: Vec<_> = args - .iter() - .map(|expr| parse_expr(expr, registry)) - .collect::>()?; - Ok(Expr::ScalarFunction( - datafusion_expr::expr::ScalarFunction::new( - BuiltinScalarFunction::ToTimestamp, - args, - ), - )) - } ScalarFunction::Flatten => Ok(flatten(parse_expr(&args[0], registry)?)), ScalarFunction::StringToArray => Ok(string_to_array( parse_expr(&args[0], registry)?, diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 2ab3dbdac61b..5c7687aa27b2 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -18,7 +18,7 @@ ########## ## Common timestamp data # -# ts_data: Int64 nanosecods +# ts_data: Int64 nanoseconds # ts_data_nanos: Timestamp(Nanosecond, None) # ts_data_micros: Timestamp(Microsecond, None) # ts_data_millis: Timestamp(Millisecond, None) @@ -331,6 +331,35 @@ SELECT COUNT(*) FROM ts_data_secs where ts > to_timestamp_seconds('2020-09-08T12 ---- 2 +# to_timestamp with formatting +query I +SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp('2020-09-08T12:00:00+00:00', '2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%s%#z') +---- +2 + +# to_timestamp_nanos with formatting +query I +SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#z') +---- +2 + +# to_timestamp_millis with formatting +query I +SELECT COUNT(*) FROM ts_data_millis where ts > to_timestamp_millis('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#z') +---- +2 + +# to_timestamp_micros with formatting +query I +SELECT COUNT(*) FROM ts_data_micros where ts > to_timestamp_micros('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#z') +---- +2 + +# to_timestamp_seconds with formatting +query I +SELECT COUNT(*) FROM ts_data_secs where ts > to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#z') +---- +2 # to_timestamp float inputs @@ -1880,7 +1909,7 @@ SELECT to_timestamp(null), to_timestamp(0), to_timestamp(1926632005), to_timesta ---- NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:59:59 1969-12-31T23:59:59 -# verify timestamp syntax stlyes are consistent +# verify timestamp syntax styles are consistent query BBBBBBBBBBBBB SELECT to_timestamp(null) is null as c1, null::timestamp is null as c2, @@ -1922,6 +1951,116 @@ true true true true true true #---- #0001-04-25T00:00:00 +63022-07-16T12:59:37 0001-04-25T00:00:00 +63022-07-16T12:59:37 0001-04-25T00:00:00 +63022-07-16T12:59:37 +# verify timestamp data with formatting options +query PPPPPP +SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'), to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1, '%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s') +---- +NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:59:59 1969-12-31T23:59:59 + +# verify timestamp data with formatting options +query PPPPPP +SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'), to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1, '%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s') +---- +NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:59:59 1969-12-31T23:59:59 + +# verify timestamp output types with formatting options +query TTT +SELECT arrow_typeof(to_timestamp(1, '%c', '%s')), arrow_typeof(to_timestamp(null, '%+', '%s')), arrow_typeof(to_timestamp('2023-01-10 12:34:56.000', '%Y-%m-%d %H:%M:%S%.f')) +---- +Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) + +# to_timestamp with invalid formatting +query error input contains invalid characters +SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_nanos with invalid formatting +query error input contains invalid characters +SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_millis with invalid formatting +query error input contains invalid characters +SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_micros with invalid formatting +query error input contains invalid characters +SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp_seconds with invalid formatting +query error input contains invalid characters +SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+') + +# to_timestamp with broken formatting +query error bad or unsupported format string +SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_nanos with broken formatting +query error bad or unsupported format string +SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_millis with broken formatting +query error bad or unsupported format string +SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_micros with broken formatting +query error bad or unsupported format string +SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%q') + +# to_timestamp_seconds with broken formatting +query error bad or unsupported format string +SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%q') + +# Create string timestamp table with different formats +# including a few very non-standard formats + +statement ok +create table ts_utf8_data(ts varchar(100), format varchar(100)) as values + ('2020-09-08 12/00/00+00:00', '%Y-%m-%d %H/%M/%S%#z'), + ('2031-01-19T23:33:25+05:00', '%+'), + ('08-09-2020 12:00:00+00:00', '%d-%m-%Y %H:%M:%S%#z'), + ('1926632005', '%s'), + ('2000-01-01T01:01:01+07:00', '%+'); + +# verify timestamp data using tables with formatting options +query P +SELECT to_timestamp(t.ts, t.format) from ts_utf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +# verify timestamp data using tables with formatting options +query P +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%s', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +# verify timestamp data using tables with formatting options where at least one column cannot be parsed +query error Error parsing timestamp from '1926632005' using format '%d-%m-%Y %H:%M:%S%#z': input contains invalid characters +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y %H:%M:%S%#z') from ts_utf8_data as t + +# verify timestamp data using tables with formatting options where one of the formats is invalid +query P +SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+') from ts_utf8_data as t +---- +2020-09-08T12:00:00 +2031-01-19T18:33:25 +2020-09-08T12:00:00 +2031-01-19T23:33:25 +1999-12-31T18:01:01 + +# timestamp data using tables with formatting options in an array is not supported at this time +query error function unsupported data type at index 1: +SELECT to_timestamp(t.ts, make_array('%Y-%m-%d %H/%M/%S%#z', '%s', '%q', '%d-%m-%Y %H:%M:%S%#z', '%+')) from ts_utf8_data as t + +statement ok +drop table ts_utf8_data + ########## ## Test binary temporal coercion for Date and Timestamp ########## diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 9dd008f8fc44..c72ef94f42ea 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1471,84 +1471,107 @@ extract(field FROM source) Converts a value to a timestamp (`YYYY-MM-DDT00:00:00Z`). Supports strings, integer, unsigned integer, and double types as input. -Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') -Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`) -return the corresponding timestamp. +Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats] are provided. +Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding timestamp. Note: `to_timestamp` returns `Timestamp(Nanosecond)`. The supported range for integer input is between `-9223372037` and `9223372036`. -Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds. +Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` +for the input outside of supported bounds. ``` -to_timestamp(expression) +to_timestamp(expression[, ..., format_n]) ``` #### Arguments - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **format_n**: Optional [Chrono format] strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. If none of the formats successfully parse the expression + an error will be returned. + +[chrono format]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html ### `to_timestamp_millis` Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000Z`). Supports strings, integer, and unsigned integer types as input. -Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') -Integers and unsigned integers are interpreted as milliseconds since the unix epoch (`1970-01-01T00:00:00Z`) -return the corresponding timestamp. +Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format]s are provided. +Integers and unsigned integers are interpreted as milliseconds since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding timestamp. ``` -to_timestamp_millis(expression) +to_timestamp_millis(expression[, ..., format_n]) ``` #### Arguments - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **format_n**: Optional [Chrono format] strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. If none of the formats successfully parse the expression + an error will be returned. ### `to_timestamp_micros` Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000Z`). Supports strings, integer, and unsigned integer types as input. -Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') +Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format]s are provided. Integers and unsigned integers are interpreted as microseconds since the unix epoch (`1970-01-01T00:00:00Z`) -return the corresponding timestamp. +Returns the corresponding timestamp. ``` -to_timestamp_nanos(expression) +to_timestamp_micros(expression[, ..., format_n]) ``` +#### Arguments + +- **expression**: Expression to operate on. + Can be a constant, column, or function, and any combination of arithmetic operators. +- **format_n**: Optional [Chrono format] strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. If none of the formats successfully parse the expression + an error will be returned. + ### `to_timestamp_nanos` Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000Z`). Supports strings, integer, and unsigned integer types as input. -Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') -Integers and unsigned integers are interpreted as nanoseconds since the unix epoch (`1970-01-01T00:00:00Z`) -return the corresponding timestamp. +Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format]s are provided. +Integers and unsigned integers are interpreted as nanoseconds since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding timestamp. ``` -to_timestamp_nanos(expression) +to_timestamp_nanos(expression[, ..., format_n]) ``` #### Arguments - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **format_n**: Optional [Chrono format] strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. If none of the formats successfully parse the expression + an error will be returned. ### `to_timestamp_seconds` Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000Z`). Supports strings, integer, and unsigned integer types as input. -Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') -Integers and unsigned integers are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`) -return the corresponding timestamp. +Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono format]s are provided. +Integers and unsigned integers are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding timestamp. ``` -to_timestamp_seconds(expression) +to_timestamp_seconds(expression[, ..., format_n]) ``` #### Arguments - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **format_n**: Optional [Chrono format] strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. If none of the formats successfully parse the expression + an error will be returned. ### `from_unixtime`