From 9c9d519cc3e2440904830ddebf8c04b0d05310c1 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 28 Nov 2022 22:06:33 -0800 Subject: [PATCH 1/6] Enable necessary test --- arrow-cast/Cargo.toml | 4 ++++ arrow-cast/src/cast.rs | 30 ++++++++++++------------------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index a5911a0a49e8..b0e1d6b80db0 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -48,5 +48,9 @@ num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } [dev-dependencies] +half = { version = "2.1", default-features = false, features = ["num-traits"] } +arrow-array = { version = "28.0.0", path = "../arrow-array", features = ["chrono-tz"]} + +[features] [build-dependencies] diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 07c7d6a3ac55..90667b69d51a 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -160,7 +160,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (Utf8, _) => DataType::is_numeric(to_type), + (Utf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, (LargeUtf8, LargeBinary | Date32 @@ -171,11 +171,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (LargeUtf8, _) => DataType::is_numeric(to_type), + (LargeUtf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, (Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true, (Date32, Utf8) | (Date32, LargeUtf8) => true, (Date64, Utf8) | (Date64, LargeUtf8) => true, - (_, Utf8 | LargeUtf8) => DataType::is_numeric(from_type) || from_type == &Binary, + (_, Utf8 | LargeUtf8) => (DataType::is_numeric(from_type) && from_type != &Float16) || from_type == &Binary, // start numeric casts ( @@ -972,6 +972,7 @@ pub fn cast_with_options( Int16 => cast_numeric_to_bool::(array), Int32 => cast_numeric_to_bool::(array), Int64 => cast_numeric_to_bool::(array), + Float16 => cast_numeric_to_bool::(array), Float32 => cast_numeric_to_bool::(array), Float64 => cast_numeric_to_bool::(array), Utf8 => cast_utf8_to_boolean(array, cast_options), @@ -989,6 +990,7 @@ pub fn cast_with_options( Int16 => cast_bool_to_numeric::(array, cast_options), Int32 => cast_bool_to_numeric::(array, cast_options), Int64 => cast_bool_to_numeric::(array, cast_options), + Float16 => cast_bool_to_numeric::(array, cast_options), Float32 => cast_bool_to_numeric::(array, cast_options), Float64 => cast_bool_to_numeric::(array, cast_options), Utf8 => { @@ -3565,6 +3567,8 @@ where #[cfg(test)] mod tests { use super::*; + #[allow(unused_imports)] + use half::f16; macro_rules! generate_cast_test_case { ($INPUT_ARRAY: expr, $OUTPUT_TYPE_ARRAY: ident, $OUTPUT_TYPE: expr, $OUTPUT_VALUES: expr) => { @@ -3614,7 +3618,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_decimal_to_decimal_round() { let array = vec![ Some(1123454), @@ -3733,7 +3736,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_decimal128_to_decimal128() { let input_type = DataType::Decimal128(20, 3); let output_type = DataType::Decimal128(20, 4); @@ -4096,7 +4098,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_numeric_to_decimal128() { let decimal_type = DataType::Decimal128(38, 6); // u8, u16, u32, u64 @@ -4268,7 +4269,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_numeric_to_decimal256() { // test negative cast type let decimal_type = DataType::Decimal256(58, 6); @@ -5247,7 +5247,6 @@ mod tests { } #[test] - #[cfg(feature = "chrono-tz")] fn test_cast_timestamp_to_string() { let a = TimestampMillisecondArray::from(vec![ Some(864000000005), @@ -6774,7 +6773,6 @@ mod tests { #[test] #[cfg_attr(miri, ignore)] // running forever - #[cfg(feature = "chrono-tz")] fn test_can_cast_types() { // this function attempts to ensure that can_cast_types stays // in sync with cast. It simply tries all combinations of @@ -6842,7 +6840,6 @@ mod tests { } /// Create instances of arrays with varying types for cast tests - #[cfg(feature = "chrono-tz")] fn get_arrays_of_all_types() -> Vec { let tz_name = String::from("America/New_York"); let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; @@ -6893,6 +6890,11 @@ mod tests { Arc::new(UInt16Array::from(vec![1, 2])), Arc::new(UInt32Array::from(vec![1, 2])), Arc::new(UInt64Array::from(vec![1, 2])), + Arc::new( + [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))] + .into_iter() + .collect::(), + ), Arc::new(Float32Array::from(vec![1.0, 2.0])), Arc::new(Float64Array::from(vec![1.0, 2.0])), Arc::new(TimestampSecondArray::from(vec![1000, 2000])), @@ -6982,7 +6984,6 @@ mod tests { LargeListArray::from(list_data) } - #[cfg(feature = "chrono-tz")] fn make_fixed_size_list_array() -> FixedSizeListArray { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) @@ -7004,7 +7005,6 @@ mod tests { FixedSizeListArray::from(list_data) } - #[cfg(feature = "chrono-tz")] fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { let values: [u8; 15] = *b"hellotherearrow"; @@ -7016,7 +7016,6 @@ mod tests { FixedSizeBinaryArray::from(array_data) } - #[cfg(feature = "chrono-tz")] fn make_union_array() -> UnionArray { let mut builder = UnionBuilder::with_capacity_dense(7); builder.append::("a", 1).unwrap(); @@ -7025,7 +7024,6 @@ mod tests { } /// Creates a dictionary with primitive dictionary values, and keys of type K - #[cfg(feature = "chrono-tz")] fn make_dictionary_primitive() -> ArrayRef { // Pick Int32 arbitrarily for dictionary values let mut b: PrimitiveDictionaryBuilder = @@ -7036,7 +7034,6 @@ mod tests { } /// Creates a dictionary with utf8 values, and keys of type K - #[cfg(feature = "chrono-tz")] fn make_dictionary_utf8() -> ArrayRef { // Pick Int32 arbitrarily for dictionary values let mut b: StringDictionaryBuilder = StringDictionaryBuilder::new(); @@ -7046,7 +7043,6 @@ mod tests { } // Get a selection of datatypes to try and cast to - #[cfg(feature = "chrono-tz")] fn get_all_types() -> Vec { use DataType::*; let tz_name = String::from("America/New_York"); @@ -7143,7 +7139,6 @@ mod tests { } #[test] - #[cfg(feature = "chrono-tz")] fn test_timestamp_cast_utf8() { let array: PrimitiveArray = vec![Some(37800000000), None, Some(86339000000)].into(); @@ -7241,7 +7236,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_f64_to_decimal128() { // to reproduce https://github.com/apache/arrow-rs/issues/2997 From 045c6b2f9706e9719f00200e3eaa3682c5268532 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 29 Nov 2022 12:08:15 -0800 Subject: [PATCH 2/6] Move tests --- arrow-cast/Cargo.toml | 4 - arrow-cast/src/cast.rs | 313 ---------------------------- arrow/tests/array_cast.rs | 417 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 417 insertions(+), 317 deletions(-) create mode 100644 arrow/tests/array_cast.rs diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index b0e1d6b80db0..a5911a0a49e8 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -48,9 +48,5 @@ num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } [dev-dependencies] -half = { version = "2.1", default-features = false, features = ["num-traits"] } -arrow-array = { version = "28.0.0", path = "../arrow-array", features = ["chrono-tz"]} - -[features] [build-dependencies] diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 90667b69d51a..aec8f27441c8 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -3567,8 +3567,6 @@ where #[cfg(test)] mod tests { use super::*; - #[allow(unused_imports)] - use half::f16; macro_rules! generate_cast_test_case { ($INPUT_ARRAY: expr, $OUTPUT_TYPE_ARRAY: ident, $OUTPUT_TYPE: expr, $OUTPUT_VALUES: expr) => { @@ -5246,24 +5244,6 @@ mod tests { assert!(c.is_null(2)); } - #[test] - fn test_cast_timestamp_to_string() { - let a = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]) - .with_timezone("UTC".to_string()); - let array = Arc::new(a) as ArrayRef; - dbg!(&array); - let b = cast(&array, &DataType::Utf8).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(&DataType::Utf8, c.data_type()); - assert_eq!("1997-05-19 00:00:00.005 +00:00", c.value(0)); - assert_eq!("2018-12-25 00:00:00.001 +00:00", c.value(1)); - assert!(c.is_null(2)); - } - #[test] fn test_cast_date32_to_string() { let a = Date32Array::from(vec![10000, 17890]); @@ -6771,40 +6751,6 @@ mod tests { assert!(!c.is_valid(5)); // "2000-01-01" } - #[test] - #[cfg_attr(miri, ignore)] // running forever - fn test_can_cast_types() { - // this function attempts to ensure that can_cast_types stays - // in sync with cast. It simply tries all combinations of - // types and makes sure that if `can_cast_types` returns - // true, so does `cast` - - let all_types = get_all_types(); - - for array in get_arrays_of_all_types() { - for to_type in &all_types { - println!("Test casting {:?} --> {:?}", array.data_type(), to_type); - let cast_result = cast(&array, to_type); - let reported_cast_ability = can_cast_types(array.data_type(), to_type); - - // check for mismatch - match (cast_result, reported_cast_ability) { - (Ok(_), false) => { - panic!("Was able to cast array {:?} from {:?} to {:?} but can_cast_types reported false", - array, array.data_type(), to_type) - } - (Err(e), true) => { - panic!("Was not able to cast array {:?} from {:?} to {:?} but can_cast_types reported true. \ - Error was {:?}", - array, array.data_type(), to_type, e) - } - // otherwise it was a match - _ => {} - }; - } - } - } - #[test] fn test_cast_list_containers() { // large-list to list @@ -6839,103 +6785,6 @@ mod tests { assert_eq!(&expected.value(2), &actual.value(2)); } - /// Create instances of arrays with varying types for cast tests - fn get_arrays_of_all_types() -> Vec { - let tz_name = String::from("America/New_York"); - let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; - vec![ - Arc::new(BinaryArray::from(binary_data.clone())), - Arc::new(LargeBinaryArray::from(binary_data.clone())), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - Arc::new(make_list_array()), - Arc::new(make_large_list_array()), - Arc::new(make_fixed_size_list_array()), - Arc::new(make_fixed_size_binary_array()), - Arc::new(StructArray::from(vec![ - ( - Field::new("a", DataType::Boolean, false), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, - ), - ( - Field::new("b", DataType::Int32, false), - Arc::new(Int32Array::from(vec![42, 28, 19, 31])), - ), - ])), - Arc::new(make_union_array()), - Arc::new(NullArray::new(10)), - Arc::new(StringArray::from(vec!["foo", "bar"])), - Arc::new(LargeStringArray::from(vec!["foo", "bar"])), - Arc::new(BooleanArray::from(vec![true, false])), - Arc::new(Int8Array::from(vec![1, 2])), - Arc::new(Int16Array::from(vec![1, 2])), - Arc::new(Int32Array::from(vec![1, 2])), - Arc::new(Int64Array::from(vec![1, 2])), - Arc::new(UInt8Array::from(vec![1, 2])), - Arc::new(UInt16Array::from(vec![1, 2])), - Arc::new(UInt32Array::from(vec![1, 2])), - Arc::new(UInt64Array::from(vec![1, 2])), - Arc::new( - [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))] - .into_iter() - .collect::(), - ), - Arc::new(Float32Array::from(vec![1.0, 2.0])), - Arc::new(Float64Array::from(vec![1.0, 2.0])), - Arc::new(TimestampSecondArray::from(vec![1000, 2000])), - Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])), - Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000])), - Arc::new(TimestampNanosecondArray::from(vec![1000, 2000])), - Arc::new( - TimestampSecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampMillisecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampMicrosecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampNanosecondArray::from(vec![1000, 2000]).with_timezone(tz_name), - ), - Arc::new(Date32Array::from(vec![1000, 2000])), - Arc::new(Date64Array::from(vec![1000, 2000])), - Arc::new(Time32SecondArray::from(vec![1000, 2000])), - Arc::new(Time32MillisecondArray::from(vec![1000, 2000])), - Arc::new(Time64MicrosecondArray::from(vec![1000, 2000])), - Arc::new(Time64NanosecondArray::from(vec![1000, 2000])), - Arc::new(IntervalYearMonthArray::from(vec![1000, 2000])), - Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])), - Arc::new(IntervalMonthDayNanoArray::from(vec![1000, 2000])), - Arc::new(DurationSecondArray::from(vec![1000, 2000])), - Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), - Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), - Arc::new(DurationNanosecondArray::from(vec![1000, 2000])), - Arc::new( - create_decimal_array(vec![Some(1), Some(2), Some(3), None], 38, 0) - .unwrap(), - ), - ] - } - fn make_list_array() -> ListArray { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) @@ -6984,134 +6833,6 @@ mod tests { LargeListArray::from(list_data) } - fn make_fixed_size_list_array() -> FixedSizeListArray { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build() - .unwrap(); - - // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, true)), - 2, - ); - let list_data = ArrayData::builder(list_data_type) - .len(5) - .add_child_data(value_data) - .build() - .unwrap(); - FixedSizeListArray::from(list_data) - } - - fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { - let values: [u8; 15] = *b"hellotherearrow"; - - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(3) - .add_buffer(Buffer::from(&values[..])) - .build() - .unwrap(); - FixedSizeBinaryArray::from(array_data) - } - - fn make_union_array() -> UnionArray { - let mut builder = UnionBuilder::with_capacity_dense(7); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.build().unwrap() - } - - /// Creates a dictionary with primitive dictionary values, and keys of type K - fn make_dictionary_primitive() -> ArrayRef { - // Pick Int32 arbitrarily for dictionary values - let mut b: PrimitiveDictionaryBuilder = - PrimitiveDictionaryBuilder::new(); - b.append(1).unwrap(); - b.append(2).unwrap(); - Arc::new(b.finish()) - } - - /// Creates a dictionary with utf8 values, and keys of type K - fn make_dictionary_utf8() -> ArrayRef { - // Pick Int32 arbitrarily for dictionary values - let mut b: StringDictionaryBuilder = StringDictionaryBuilder::new(); - b.append("foo").unwrap(); - b.append("bar").unwrap(); - Arc::new(b.finish()) - } - - // Get a selection of datatypes to try and cast to - fn get_all_types() -> Vec { - use DataType::*; - let tz_name = String::from("America/New_York"); - - vec![ - Null, - Boolean, - Int8, - Int16, - Int32, - UInt64, - UInt8, - UInt16, - UInt32, - UInt64, - Float16, - Float32, - Float64, - Timestamp(TimeUnit::Second, None), - Timestamp(TimeUnit::Millisecond, None), - Timestamp(TimeUnit::Microsecond, None), - Timestamp(TimeUnit::Nanosecond, None), - Timestamp(TimeUnit::Second, Some(tz_name.clone())), - Timestamp(TimeUnit::Millisecond, Some(tz_name.clone())), - Timestamp(TimeUnit::Microsecond, Some(tz_name.clone())), - Timestamp(TimeUnit::Nanosecond, Some(tz_name)), - Date32, - Date64, - Time32(TimeUnit::Second), - Time32(TimeUnit::Millisecond), - Time64(TimeUnit::Microsecond), - Time64(TimeUnit::Nanosecond), - Duration(TimeUnit::Second), - Duration(TimeUnit::Millisecond), - Duration(TimeUnit::Microsecond), - Duration(TimeUnit::Nanosecond), - Interval(IntervalUnit::YearMonth), - Interval(IntervalUnit::DayTime), - Interval(IntervalUnit::MonthDayNano), - Binary, - FixedSizeBinary(10), - LargeBinary, - Utf8, - LargeUtf8, - List(Box::new(Field::new("item", DataType::Int8, true))), - List(Box::new(Field::new("item", DataType::Utf8, true))), - FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), - FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), - LargeList(Box::new(Field::new("item", DataType::Int8, true))), - LargeList(Box::new(Field::new("item", DataType::Utf8, false))), - Struct(vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Utf8, true), - ]), - Union( - vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Utf8, true), - ], - vec![0, 1], - UnionMode::Dense, - ), - Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), - Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), - Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), - Decimal128(38, 0), - ] - } - #[test] fn test_utf8_cast_offsets() { // test if offset of the array is taken into account during cast @@ -7138,40 +6859,6 @@ mod tests { assert_eq!(&out1, &out2.slice(1, 2)) } - #[test] - fn test_timestamp_cast_utf8() { - let array: PrimitiveArray = - vec![Some(37800000000), None, Some(86339000000)].into(); - let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); - - let expected = StringArray::from(vec![ - Some("1970-01-01 10:30:00"), - None, - Some("1970-01-01 23:58:59"), - ]); - - assert_eq!( - out.as_any().downcast_ref::().unwrap(), - &expected - ); - - let array: PrimitiveArray = - vec![Some(37800000000), None, Some(86339000000)].into(); - let array = array.with_timezone("Australia/Sydney".to_string()); - let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); - - let expected = StringArray::from(vec![ - Some("1970-01-01 20:30:00 +10:00"), - None, - Some("1970-01-02 09:58:59 +10:00"), - ]); - - assert_eq!( - out.as_any().downcast_ref::().unwrap(), - &expected - ); - } - #[test] fn test_list_to_string() { let str_array = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "g", "h"]); diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs new file mode 100644 index 000000000000..f573cce6843b --- /dev/null +++ b/arrow/tests/array_cast.rs @@ -0,0 +1,417 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::builder::{ + PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder, +}; +use arrow_array::types::{ + ArrowDictionaryKeyType, Int16Type, Int32Type, Int64Type, Int8Type, + TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, +}; +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, + Decimal128Array, DurationMicrosecondArray, DurationMillisecondArray, + DurationNanosecondArray, DurationSecondArray, FixedSizeBinaryArray, + FixedSizeListArray, Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, + Int64Array, Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, + IntervalYearMonthArray, LargeBinaryArray, LargeListArray, LargeStringArray, + ListArray, NullArray, PrimitiveArray, StringArray, StructArray, + Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, + Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, + UInt64Array, UInt8Array, UnionArray, +}; +use arrow_buffer::Buffer; +use arrow_cast::{can_cast_types, cast}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, Field, IntervalUnit, TimeUnit, UnionMode}; +use half::f16; +use std::sync::Arc; + +#[test] +#[cfg(feature = "chrono-tz")] +fn test_cast_timestamp_to_string() { + let a = TimestampMillisecondArray::from(vec![ + Some(864000000005), + Some(1545696000001), + None, + ]) + .with_timezone("UTC".to_string()); + let array = Arc::new(a) as ArrayRef; + dbg!(&array); + let b = cast(&array, &DataType::Utf8).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(&DataType::Utf8, c.data_type()); + assert_eq!("1997-05-19 00:00:00.005 +00:00", c.value(0)); + assert_eq!("2018-12-25 00:00:00.001 +00:00", c.value(1)); + assert!(c.is_null(2)); +} + +#[test] +#[cfg_attr(miri, ignore)] // running forever +#[cfg(feature = "chrono-tz")] +fn test_can_cast_types() { + // this function attempts to ensure that can_cast_types stays + // in sync with cast. It simply tries all combinations of + // types and makes sure that if `can_cast_types` returns + // true, so does `cast` + + let all_types = get_all_types(); + + for array in get_arrays_of_all_types() { + for to_type in &all_types { + println!("Test casting {:?} --> {:?}", array.data_type(), to_type); + let cast_result = cast(&array, to_type); + let reported_cast_ability = can_cast_types(array.data_type(), to_type); + + // check for mismatch + match (cast_result, reported_cast_ability) { + (Ok(_), false) => { + panic!("Was able to cast array {:?} from {:?} to {:?} but can_cast_types reported false", + array, array.data_type(), to_type) + } + (Err(e), true) => { + panic!("Was not able to cast array {:?} from {:?} to {:?} but can_cast_types reported true. \ + Error was {:?}", + array, array.data_type(), to_type, e) + } + // otherwise it was a match + _ => {} + }; + } + } +} + +/// Create instances of arrays with varying types for cast tests +#[cfg(feature = "chrono-tz")] +fn get_arrays_of_all_types() -> Vec { + let tz_name = String::from("America/New_York"); + let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; + vec![ + Arc::new(BinaryArray::from(binary_data.clone())), + Arc::new(LargeBinaryArray::from(binary_data.clone())), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + Arc::new(make_list_array()), + Arc::new(make_large_list_array()), + Arc::new(make_fixed_size_list_array()), + Arc::new(make_fixed_size_binary_array()), + Arc::new(StructArray::from(vec![ + ( + Field::new("a", DataType::Boolean, false), + Arc::new(BooleanArray::from(vec![false, false, true, true])) + as Arc, + ), + ( + Field::new("b", DataType::Int32, false), + Arc::new(Int32Array::from(vec![42, 28, 19, 31])), + ), + ])), + Arc::new(make_union_array()), + Arc::new(NullArray::new(10)), + Arc::new(StringArray::from(vec!["foo", "bar"])), + Arc::new(LargeStringArray::from(vec!["foo", "bar"])), + Arc::new(BooleanArray::from(vec![true, false])), + Arc::new(Int8Array::from(vec![1, 2])), + Arc::new(Int16Array::from(vec![1, 2])), + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(Int64Array::from(vec![1, 2])), + Arc::new(UInt8Array::from(vec![1, 2])), + Arc::new(UInt16Array::from(vec![1, 2])), + Arc::new(UInt32Array::from(vec![1, 2])), + Arc::new(UInt64Array::from(vec![1, 2])), + Arc::new( + [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))] + .into_iter() + .collect::(), + ), + Arc::new(Float32Array::from(vec![1.0, 2.0])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + Arc::new(TimestampSecondArray::from(vec![1000, 2000])), + Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])), + Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000])), + Arc::new(TimestampNanosecondArray::from(vec![1000, 2000])), + Arc::new( + TimestampSecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone()), + ), + Arc::new( + TimestampMillisecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), + Arc::new( + TimestampMicrosecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), + Arc::new(TimestampNanosecondArray::from(vec![1000, 2000]).with_timezone(tz_name)), + Arc::new(Date32Array::from(vec![1000, 2000])), + Arc::new(Date64Array::from(vec![1000, 2000])), + Arc::new(Time32SecondArray::from(vec![1000, 2000])), + Arc::new(Time32MillisecondArray::from(vec![1000, 2000])), + Arc::new(Time64MicrosecondArray::from(vec![1000, 2000])), + Arc::new(Time64NanosecondArray::from(vec![1000, 2000])), + Arc::new(IntervalYearMonthArray::from(vec![1000, 2000])), + Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])), + Arc::new(IntervalMonthDayNanoArray::from(vec![1000, 2000])), + Arc::new(DurationSecondArray::from(vec![1000, 2000])), + Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), + Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), + Arc::new(DurationNanosecondArray::from(vec![1000, 2000])), + Arc::new( + create_decimal_array(vec![Some(1), Some(2), Some(3), None], 38, 0).unwrap(), + ), + ] +} + +#[cfg(feature = "chrono-tz")] +fn make_fixed_size_list_array() -> FixedSizeListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a fixed size list array from the above two + let list_data_type = + DataType::FixedSizeList(Box::new(Field::new("item", DataType::Int32, true)), 2); + let list_data = ArrayData::builder(list_data_type) + .len(5) + .add_child_data(value_data) + .build() + .unwrap(); + FixedSizeListArray::from(list_data) +} + +#[cfg(feature = "chrono-tz")] +fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { + let values: [u8; 15] = *b"hellotherearrow"; + + let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) + .len(3) + .add_buffer(Buffer::from(&values[..])) + .build() + .unwrap(); + FixedSizeBinaryArray::from(array_data) +} + +fn make_list_array() -> ListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); + + // Construct a list array from the above two + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_child_data(value_data) + .build() + .unwrap(); + ListArray::from(list_data) +} + +fn make_large_list_array() -> LargeListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); + + // Construct a list array from the above two + let list_data_type = + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_child_data(value_data) + .build() + .unwrap(); + LargeListArray::from(list_data) +} + +#[cfg(feature = "chrono-tz")] +fn make_union_array() -> UnionArray { + let mut builder = UnionBuilder::with_capacity_dense(7); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.build().unwrap() +} + +/// Creates a dictionary with primitive dictionary values, and keys of type K +#[cfg(feature = "chrono-tz")] +fn make_dictionary_primitive() -> ArrayRef { + // Pick Int32 arbitrarily for dictionary values + let mut b: PrimitiveDictionaryBuilder = + PrimitiveDictionaryBuilder::new(); + b.append(1).unwrap(); + b.append(2).unwrap(); + Arc::new(b.finish()) +} + +/// Creates a dictionary with utf8 values, and keys of type K +#[cfg(feature = "chrono-tz")] +fn make_dictionary_utf8() -> ArrayRef { + // Pick Int32 arbitrarily for dictionary values + let mut b: StringDictionaryBuilder = StringDictionaryBuilder::new(); + b.append("foo").unwrap(); + b.append("bar").unwrap(); + Arc::new(b.finish()) +} + +fn create_decimal_array( + array: Vec>, + precision: u8, + scale: i8, +) -> Result { + array + .into_iter() + .collect::() + .with_precision_and_scale(precision, scale) +} + +// Get a selection of datatypes to try and cast to +#[cfg(feature = "chrono-tz")] +fn get_all_types() -> Vec { + use DataType::*; + let tz_name = String::from("America/New_York"); + + vec![ + Null, + Boolean, + Int8, + Int16, + Int32, + UInt64, + UInt8, + UInt16, + UInt32, + UInt64, + Float16, + Float32, + Float64, + Timestamp(TimeUnit::Second, None), + Timestamp(TimeUnit::Millisecond, None), + Timestamp(TimeUnit::Microsecond, None), + Timestamp(TimeUnit::Nanosecond, None), + Timestamp(TimeUnit::Second, Some(tz_name.clone())), + Timestamp(TimeUnit::Millisecond, Some(tz_name.clone())), + Timestamp(TimeUnit::Microsecond, Some(tz_name.clone())), + Timestamp(TimeUnit::Nanosecond, Some(tz_name)), + Date32, + Date64, + Time32(TimeUnit::Second), + Time32(TimeUnit::Millisecond), + Time64(TimeUnit::Microsecond), + Time64(TimeUnit::Nanosecond), + Duration(TimeUnit::Second), + Duration(TimeUnit::Millisecond), + Duration(TimeUnit::Microsecond), + Duration(TimeUnit::Nanosecond), + Interval(IntervalUnit::YearMonth), + Interval(IntervalUnit::DayTime), + Interval(IntervalUnit::MonthDayNano), + Binary, + FixedSizeBinary(10), + LargeBinary, + Utf8, + LargeUtf8, + List(Box::new(Field::new("item", DataType::Int8, true))), + List(Box::new(Field::new("item", DataType::Utf8, true))), + FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), + FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), + LargeList(Box::new(Field::new("item", DataType::Int8, true))), + LargeList(Box::new(Field::new("item", DataType::Utf8, false))), + Struct(vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Utf8, true), + ]), + Union( + vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Utf8, true), + ], + vec![0, 1], + UnionMode::Dense, + ), + Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), + Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), + Decimal128(38, 0), + ] +} + +#[test] +#[cfg(feature = "chrono-tz")] +fn test_timestamp_cast_utf8() { + let array: PrimitiveArray = + vec![Some(37800000000), None, Some(86339000000)].into(); + let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); + + let expected = StringArray::from(vec![ + Some("1970-01-01 10:30:00"), + None, + Some("1970-01-01 23:58:59"), + ]); + + assert_eq!( + out.as_any().downcast_ref::().unwrap(), + &expected + ); + + let array: PrimitiveArray = + vec![Some(37800000000), None, Some(86339000000)].into(); + let array = array.with_timezone("Australia/Sydney".to_string()); + let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); + + let expected = StringArray::from(vec![ + Some("1970-01-01 20:30:00 +10:00"), + None, + Some("1970-01-02 09:58:59 +10:00"), + ]); + + assert_eq!( + out.as_any().downcast_ref::().unwrap(), + &expected + ); +} From 61b6be4715e35837cab1642ac7377d84a3bf9bab Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 29 Nov 2022 12:18:42 -0800 Subject: [PATCH 3/6] Trigger Build From 4eac6df22c09a7fdd1a7449bafe24727eaa87617 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 29 Nov 2022 13:29:34 -0800 Subject: [PATCH 4/6] Fix clippy --- arrow/tests/array_cast.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index f573cce6843b..57eeb2253007 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -195,7 +195,7 @@ fn make_fixed_size_list_array() -> FixedSizeListArray { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) .build() .unwrap(); From 877f2f0c3fc8d25e2bd4f8763ede0a985ec631dc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 29 Nov 2022 14:44:40 -0800 Subject: [PATCH 5/6] Fix test --- arrow/tests/array_cast.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index 57eeb2253007..dc1db38dad13 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -363,7 +363,7 @@ fn get_all_types() -> Vec { LargeList(Box::new(Field::new("item", DataType::Int8, true))), LargeList(Box::new(Field::new("item", DataType::Utf8, false))), Struct(vec![ - Field::new("f1", DataType::Int32, false), + Field::new("f1", DataType::Int32, true), Field::new("f2", DataType::Utf8, true), ]), Union( From 7c82706b67bcd967b969b675702107d2e0a63292 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 30 Nov 2022 10:28:21 -0800 Subject: [PATCH 6/6] Move feature requirement to Cargo.toml --- arrow/Cargo.toml | 4 ++++ arrow/tests/array_cast.rs | 10 ---------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index a97ec1ac123f..876d0d65084e 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -273,3 +273,7 @@ required-features = ["csv", "chrono-tz"] [[test]] name = "pyarrow" required-features = ["pyarrow"] + +[[test]] +name = "array_cast" +required-features = ["chrono-tz"] diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index dc1db38dad13..95fb973289a5 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -43,7 +43,6 @@ use half::f16; use std::sync::Arc; #[test] -#[cfg(feature = "chrono-tz")] fn test_cast_timestamp_to_string() { let a = TimestampMillisecondArray::from(vec![ Some(864000000005), @@ -63,7 +62,6 @@ fn test_cast_timestamp_to_string() { #[test] #[cfg_attr(miri, ignore)] // running forever -#[cfg(feature = "chrono-tz")] fn test_can_cast_types() { // this function attempts to ensure that can_cast_types stays // in sync with cast. It simply tries all combinations of @@ -97,7 +95,6 @@ fn test_can_cast_types() { } /// Create instances of arrays with varying types for cast tests -#[cfg(feature = "chrono-tz")] fn get_arrays_of_all_types() -> Vec { let tz_name = String::from("America/New_York"); let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; @@ -190,7 +187,6 @@ fn get_arrays_of_all_types() -> Vec { ] } -#[cfg(feature = "chrono-tz")] fn make_fixed_size_list_array() -> FixedSizeListArray { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) @@ -210,7 +206,6 @@ fn make_fixed_size_list_array() -> FixedSizeListArray { FixedSizeListArray::from(list_data) } -#[cfg(feature = "chrono-tz")] fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { let values: [u8; 15] = *b"hellotherearrow"; @@ -270,7 +265,6 @@ fn make_large_list_array() -> LargeListArray { LargeListArray::from(list_data) } -#[cfg(feature = "chrono-tz")] fn make_union_array() -> UnionArray { let mut builder = UnionBuilder::with_capacity_dense(7); builder.append::("a", 1).unwrap(); @@ -279,7 +273,6 @@ fn make_union_array() -> UnionArray { } /// Creates a dictionary with primitive dictionary values, and keys of type K -#[cfg(feature = "chrono-tz")] fn make_dictionary_primitive() -> ArrayRef { // Pick Int32 arbitrarily for dictionary values let mut b: PrimitiveDictionaryBuilder = @@ -290,7 +283,6 @@ fn make_dictionary_primitive() -> ArrayRef { } /// Creates a dictionary with utf8 values, and keys of type K -#[cfg(feature = "chrono-tz")] fn make_dictionary_utf8() -> ArrayRef { // Pick Int32 arbitrarily for dictionary values let mut b: StringDictionaryBuilder = StringDictionaryBuilder::new(); @@ -311,7 +303,6 @@ fn create_decimal_array( } // Get a selection of datatypes to try and cast to -#[cfg(feature = "chrono-tz")] fn get_all_types() -> Vec { use DataType::*; let tz_name = String::from("America/New_York"); @@ -382,7 +373,6 @@ fn get_all_types() -> Vec { } #[test] -#[cfg(feature = "chrono-tz")] fn test_timestamp_cast_utf8() { let array: PrimitiveArray = vec![Some(37800000000), None, Some(86339000000)].into();